#http://xgenes.com/article/article-content/187/from-epidome-to-treeheatmap/

#two-component system activity regulator YycH [Staphylococcus epidermidis] 
#https://www.ncbi.nlm.nih.gov/protein/MDU6446748.1?report=genbank&log$=protalign&blast_rank=1&RID=XBK1RDSX01N

# ---------------------------------------------------------------------------
# ----------------------------------- FOR EPIDOME ---------------------------

# --------------------------------------------
# ---- STEP1: raw data preparing ----  

ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A2-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A3-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A4-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A5-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A10-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A17-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A21-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A22-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A24-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A25-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A27-3_R2.fastq.gz .

ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-1_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-1_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-2_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-2_R2.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-3_R1.fastq.gz .
ln -s ../Data_Luise_Epidome_batch2/raw_data/A28-3_R2.fastq.gz .

for file in *.fastq.gz; do 
  echo "mv $file $(echo $file | cut -d'_' -f1)_$(echo $file | cut -d'_' -f4).fastq.gz;"
done

#In total (15 samples of A and 21 samples of H): 36 x 3 time_points= 108x2 paired_end=216 files!


# --------------------------------------------
# ---- STEP2: quality controls (optional) ----  
#under testrun2 should have
#BiocManager::install("dada2")
library(dada2); packageVersion("dada2")
path <- "~/DATA/Data_Luise_Epidome_longitudinal_nose/raw_data" # CHANGE ME to the directory containing the fastq files after unzipping.
list.files(path)

# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)

png("fnFs_1_2.png", width=1080, height=1080)
plotQualityProfile(fnFs[1:2])
dev.off()
png("fnRs_1_2.png", width=1080, height=1080)
plotQualityProfile(fnRs[1:2])
dev.off()


#-------------------------------------------------------------------------------------------------------------------------------
#---- STEP3: cutadapt instead of Trimmomatic (namely demultiplexing, see epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh) ----
#epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh

#5′-CGATGCKAAAGTGCCGAATA-3′/5′-CTTCATTTAAGAAGCCACCWTGACT-3′  for yycH
#5′-TGGGTATGRCAATCACTTTACA-3′/5′-GCATCAAAAGCACTCTCATTACC-3′  for g216
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -l 300        for 16S
mkdir cutadapted_yycH cutadapted_g216 cutadapted_16S
cd raw_data
#The default is --action=trim. With --action=retain, the read is trimmed, but the adapter sequence itself is not removed.

for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g TGGGTATGRCAATCACTTTACA -G GCATCAAAAGCACTCTCATTACC --pair-filter=any -o ../cutadapted_g216/${file} --paired-output ../cutadapted_g216/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done

for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CGATGCKAAAGTGCCGAATA -G CTTCATTTAAGAAGCCACCWTGACT --pair-filter=any -o ../cutadapted_yycH/${file} --paired-output ../cutadapted_yycH/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done

for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CCTACGGGNGGCWGCAG -G GACTACHVGGGTATCTAATCC --pair-filter=any -o ../cutadapted_16S/${file} --paired-output ../cutadapted_16S/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done

#END

# TO BE DELETED!
##### Load dada2 output and metadata into R, make sure rownames in metadata match names of isolates
#epi01_table = read.table("epi01_dada_output.csv",sep = ";",header=TRUE,row.names=1)
#epi02_table = read.table("epi02_dada_output.csv",sep = ";",header=TRUE,row.names=1)
#metadata_table = read.table("metadata_table.txt")
#
##### Setup object for easy handling of data
#epidome_object = setup_epidome_object(epi01_table,epi02_table,metadata_table)


#------------------------------------------------------------------------------------------------------
#---- STEP4: regenerate filtered_R1 and filtered_R2 (under env qiime1, the scripts are IGNORED, since we generated filtered_R1 and R2 with dada2) ----
#mkdir pandaseq_16S pandaseq_yycH pandaseq_g216
mkdir pear_16S pear_yycH pear_g216
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC 
#for file in cutadapted_16S/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -l 300  -w pandaseq_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_16S; done
#https://learnmetabarcoding.github.io/LearnMetabarcoding/processing/pair_merging.html#
#conda install -c conda-forge -c bioconda -c defaults seqkit
#pear -f cutadapted_g216/A10-1_R1.fastq.gz -r cutadapted_g216/A10-1_R2.fastq.gz -o pear_g216/A10-1 -q 26 -v 10;

for file in cutadapted_g216/*_R1.fastq.gz; do 
  pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_g216; 
done
for file in cutadapted_yycH/*_R1.fastq.gz; do   
  pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 4 -q 26 -v 10 -o pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_yycH;
done
for file in cutadapted_16S/*_R1.fastq.gz; do 
  pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_16S; 
done

#Since the correct of my_EPIDOME_yycH_runwise_pipeline.R, the following processing are not necessary!!
#for file in cutadapted_g216/*_R1.fastq.gz; do
#  grep "@M0370" pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  sed -i -e 's/@//g' cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  cut -d' ' -f1 cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
#  seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_g216/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
#  seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_g216/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
#done
#
#for file in cutadapted_yycH/*_R1.fastq.gz; do
#  grep "@M0370" pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  sed -i -e 's/@//g' cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  cut -d' ' -f1 cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
#  seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_yycH/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
#  seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_yycH/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
#done
#
#for file in cutadapted_16S/*_R1.fastq.gz; do
#  grep "@M0370" pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  sed -i -e 's/@//g' cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
#  cut -d' ' -f1 cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
#  seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_16S/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
#  seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_16S/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
#done


# using ./my_EPIDOME_g216_runwise_pipeline.R and ./my_EPIDOME_yycH_runwise_pipeline.R instead!
#./my_EPIDOME_yycH_on_peared.R
#TODOs: create my_EPIDOME_g216_on_peared.R and my_EPIDOME_16S_on_peared.R

#for 16S data
validate_mapping_file.py -m map.txt
#cp Data_16S_Arck_vaginal_stool/map.txt  Data_Holger_Epidome_myData2


#-------------------------------------------------------------------------------------------------------------------------------------------------
#---- STEP5 filtering+trimming+merging+chimera-removing (scripts are modified from epidome/scripts/dada2_for_EPIDOME_yycH_runwise_pipeline.R under host-R) ----
#Input: cutadapted_yycH, cutadapted_g216
#Outputs: 16S_seqtab_from_dada2.rds
#         16S_seqtab_from_dada2.csv
#         16S_seqtab_nochim.rds
#         16S_seqtab_nochim.csv
#         16S_seqtab_image.RData
#         track_16S.csv
#RUN: (r4-base) 
#NOTE: Due to the error of compress, run the code manually, after dada2::filterAndTrim rename the generated filename (to fastq) and compress them manully using shell commands!
#out <- filterAndTrim(fwd=file.path(pathF, fastqFs), filt=file.path(filtpathF, fastqFs),
#                     rev=file.path(pathR, fastqRs), filt.rev=file.path(filtpathR, fastqRs),
#                      maxEE=10, truncQ=2, maxN=0, rm.phix=TRUE, minLen=200,
#                     compress=FALSE, verbose=TRUE, multithread=FALSE, matchIDs = TRUE)

#install dada2 on jhuang@saga:/home/jhuang/R/x86_64-redhat-linux-gnu-library/4.3
#install dada2 on conda env r on jhuang@hamm:~/miniconda3/envs/r/lib/R/library

conda create -n r r-base=4.1
conda activate r
conda install -c conda-forge gxx_linux-64 gcc_linux-64   
R
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("dada2")
#GREAT_DEBUG: the script cannot run on @hamburg since the compress=TRUE of dada2::filterAndTrim was not working, the the following two scripts on saga or hamm! 
./my_EPIDOME_g216_runwise_pipeline.R    #minLen=185
./my_EPIDOME_yycH_runwise_pipeline.R    #minLen=200
#./my_EPIDOME_g216_runwise_pipeline_.R > g216_runwise_pipeline_.LOG
#./my_EPIDOME_yycH_runwise_pipeline_.R > yycH_runwise_pipeline_.LOG
#./my_EPIDOME_16S_runwise_pipeline.R     #minLen=170

# ---- g216 ----
Creating output directory: cutadapted_g216/filtered_R1
Creating output directory:cutadapted_g216/filtered_R2
Read in 76267 paired-sequences, output 75513 (99%) filtered paired-sequences.
Read in 43990 paired-sequences, output 43517 (98.9%) filtered paired-sequences.
Read in 42765 paired-sequences, output 42359 (99.1%) filtered paired-sequences.
Read in 36009 paired-sequences, output 35116 (97.5%) filtered paired-sequences.
Read in 175221 paired-sequences, output 174074 (99.3%) filtered paired-sequences.
Read in 110170 paired-sequences, output 109346 (99.3%) filtered paired-sequences.
Read in 70430 paired-sequences, output 69574 (98.8%) filtered paired-sequences.
Read in 97333 paired-sequences, output 96214 (98.9%) filtered paired-sequences.
Read in 53635 paired-sequences, output 52984 (98.8%) filtered paired-sequences.
Read in 94302 paired-sequences, output 93241 (98.9%) filtered paired-sequences.
Read in 140765 paired-sequences, output 139306 (99%) filtered paired-sequences.
Read in 134047 paired-sequences, output 132740 (99%) filtered paired-sequences.
Read in 82173 paired-sequences, output 81428 (99.1%) filtered paired-sequences.
Read in 104187 paired-sequences, output 103228 (99.1%) filtered paired-sequences.
Read in 83487 paired-sequences, output 82815 (99.2%) filtered paired-sequences.
Read in 75503 paired-sequences, output 74665 (98.9%) filtered paired-sequences.
Read in 69272 paired-sequences, output 68537 (98.9%) filtered paired-sequences.
Read in 102545 paired-sequences, output 101271 (98.8%) filtered paired-sequences.
Read in 149913 paired-sequences, output 148744 (99.2%) filtered paired-sequences.
Read in 123950 paired-sequences, output 122980 (99.2%) filtered paired-sequences.
Read in 89754 paired-sequences, output 88974 (99.1%) filtered paired-sequences.
Read in 155640 paired-sequences, output 153815 (98.8%) filtered paired-sequences.
Read in 107422 paired-sequences, output 106021 (98.7%) filtered paired-sequences.
Read in 55177 paired-sequences, output 54717 (99.2%) filtered paired-sequences.
Read in 68829 paired-sequences, output 68371 (99.3%) filtered paired-sequences.
Read in 64450 paired-sequences, output 63971 (99.3%) filtered paired-sequences.
Read in 104042 paired-sequences, output 103177 (99.2%) filtered paired-sequences.
Read in 74444 paired-sequences, output 73766 (99.1%) filtered paired-sequences.
Read in 84502 paired-sequences, output 83700 (99.1%) filtered paired-sequences.
Read in 67334 paired-sequences, output 66706 (99.1%) filtered paired-sequences.
Read in 75241 paired-sequences, output 74437 (98.9%) filtered paired-sequences.
Read in 93572 paired-sequences, output 92401 (98.7%) filtered paired-sequences.
Read in 115399 paired-sequences, output 113884 (98.7%) filtered paired-sequences.
Read in 73765 paired-sequences, output 73121 (99.1%) filtered paired-sequences.
Read in 56312 paired-sequences, output 55739 (99%) filtered paired-sequences.
Read in 50016 paired-sequences, output 49635 (99.2%) filtered paired-sequences.
Read in 107080 paired-sequences, output 106118 (99.1%) filtered paired-sequences.
Read in 66611 paired-sequences, output 65832 (98.8%) filtered paired-sequences.
Read in 97963 paired-sequences, output 97137 (99.2%) filtered paired-sequences.
Read in 106957 paired-sequences, output 106155 (99.3%) filtered paired-sequences.
Read in 107990 paired-sequences, output 107143 (99.2%) filtered paired-sequences.
Read in 70050 paired-sequences, output 69568 (99.3%) filtered paired-sequences.
Read in 97387 paired-sequences, output 95937 (98.5%) filtered paired-sequences.
Read in 129578 paired-sequences, output 127634 (98.5%) filtered paired-sequences.
Read in 79968 paired-sequences, output 78876 (98.6%) filtered paired-sequences.
Read in 64608 paired-sequences, output 63943 (99%) filtered paired-sequences.
Read in 54883 paired-sequences, output 54328 (99%) filtered paired-sequences.
Read in 74474 paired-sequences, output 73697 (99%) filtered paired-sequences.
Read in 78965 paired-sequences, output 78146 (99%) filtered paired-sequences.
Read in 52575 paired-sequences, output 52018 (98.9%) filtered paired-sequences.
Read in 54132 paired-sequences, output 53691 (99.2%) filtered paired-sequences.
Read in 49672 paired-sequences, output 49112 (98.9%) filtered paired-sequences.
Read in 79981 paired-sequences, output 79145 (99%) filtered paired-sequences.
Read in 32810 paired-sequences, output 32393 (98.7%) filtered paired-sequences.
Read in 778606 paired-sequences, output 769472 (98.8%) filtered paired-sequences.
Read in 53169 paired-sequences, output 52564 (98.9%) filtered paired-sequences.
Read in 103600 paired-sequences, output 102242 (98.7%) filtered paired-sequences.
Read in 64389 paired-sequences, output 63420 (98.5%) filtered paired-sequences.
Read in 105433 paired-sequences, output 103804 (98.5%) filtered paired-sequences.
Read in 117367 paired-sequences, output 114857 (97.9%) filtered paired-sequences.
Read in 49686 paired-sequences, output 48902 (98.4%) filtered paired-sequences.
Read in 66683 paired-sequences, output 65526 (98.3%) filtered paired-sequences.
Read in 43438 paired-sequences, output 42542 (97.9%) filtered paired-sequences.
Read in 63031 paired-sequences, output 62109 (98.5%) filtered paired-sequences.
Read in 95246 paired-sequences, output 93955 (98.6%) filtered paired-sequences.
Read in 52345 paired-sequences, output 51708 (98.8%) filtered paired-sequences.
Read in 64620 paired-sequences, output 63845 (98.8%) filtered paired-sequences.
Read in 114941 paired-sequences, output 113515 (98.8%) filtered paired-sequences.
Read in 89493 paired-sequences, output 88212 (98.6%) filtered paired-sequences.
Read in 90064 paired-sequences, output 88988 (98.8%) filtered paired-sequences.
Read in 49959 paired-sequences, output 49457 (99%) filtered paired-sequences.
Read in 94439 paired-sequences, output 93434 (98.9%) filtered paired-sequences.
Read in 63678 paired-sequences, output 62959 (98.9%) filtered paired-sequences.
Read in 63189 paired-sequences, output 62307 (98.6%) filtered paired-sequences.
Read in 139718 paired-sequences, output 137254 (98.2%) filtered paired-sequences.
Read in 127774 paired-sequences, output 126210 (98.8%) filtered paired-sequences.
Read in 61562 paired-sequences, output 60316 (98%) filtered paired-sequences.
Read in 90008 paired-sequences, output 88689 (98.5%) filtered paired-sequences.
Read in 86113 paired-sequences, output 84967 (98.7%) filtered paired-sequences.
Read in 77636 paired-sequences, output 76506 (98.5%) filtered paired-sequences.
Read in 119641 paired-sequences, output 117844 (98.5%) filtered paired-sequences.
Read in 116399 paired-sequences, output 115340 (99.1%) filtered paired-sequences.
Read in 75877 paired-sequences, output 75235 (99.2%) filtered paired-sequences.
Read in 70278 paired-sequences, output 69644 (99.1%) filtered paired-sequences.
Read in 118330 paired-sequences, output 117190 (99%) filtered paired-sequences.
Read in 90343 paired-sequences, output 89214 (98.8%) filtered paired-sequences.
Read in 93313 paired-sequences, output 92186 (98.8%) filtered paired-sequences.
Read in 97544 paired-sequences, output 96486 (98.9%) filtered paired-sequences.
Read in 116922 paired-sequences, output 115599 (98.9%) filtered paired-sequences.
Read in 97333 paired-sequences, output 96032 (98.7%) filtered paired-sequences.
Read in 113298 paired-sequences, output 112089 (98.9%) filtered paired-sequences.
Read in 85730 paired-sequences, output 84804 (98.9%) filtered paired-sequences.
Read in 93040 paired-sequences, output 91871 (98.7%) filtered paired-sequences.
Read in 53260 paired-sequences, output 52516 (98.6%) filtered paired-sequences.
Read in 49654 paired-sequences, output 49049 (98.8%) filtered paired-sequences.
Read in 71490 paired-sequences, output 70390 (98.5%) filtered paired-sequences.
Read in 87554 paired-sequences, output 86584 (98.9%) filtered paired-sequences.
Read in 48509 paired-sequences, output 47907 (98.8%) filtered paired-sequences.
Read in 45813 paired-sequences, output 45328 (98.9%) filtered paired-sequences.
Read in 52642 paired-sequences, output 52015 (98.8%) filtered paired-sequences.
Read in 59033 paired-sequences, output 58345 (98.8%) filtered paired-sequences.
Read in 40511 paired-sequences, output 39848 (98.4%) filtered paired-sequences.
Read in 45126 paired-sequences, output 44630 (98.9%) filtered paired-sequences.
Read in 73866 paired-sequences, output 73033 (98.9%) filtered paired-sequences.
Read in 92762 paired-sequences, output 91538 (98.7%) filtered paired-sequences.
Read in 56650 paired-sequences, output 55958 (98.8%) filtered paired-sequences.
Read in 51243 paired-sequences, output 50670 (98.9%) filtered paired-sequences.
Read in 86090 paired-sequences, output 85124 (98.9%) filtered paired-sequences.

# ---- yycH ----
Creating output directory: cutadapted_yycH/filtered_R1
Creating output directory:cutadapted_yycH/filtered_R2
Read in 63324 paired-sequences, output 62051 (98%) filtered paired-sequences.
Read in 40363 paired-sequences, output 39552 (98%) filtered paired-sequences.
Read in 38439 paired-sequences, output 37669 (98%) filtered paired-sequences.
Read in 37158 paired-sequences, output 35498 (95.5%) filtered paired-sequences.
Read in 82145 paired-sequences, output 80918 (98.5%) filtered paired-sequences.
Read in 53438 paired-sequences, output 52705 (98.6%) filtered paired-sequences.
Read in 58083 paired-sequences, output 56827 (97.8%) filtered paired-sequences.
Read in 86811 paired-sequences, output 84989 (97.9%) filtered paired-sequences.
Read in 44325 paired-sequences, output 43226 (97.5%) filtered paired-sequences.
Read in 90737 paired-sequences, output 88992 (98.1%) filtered paired-sequences.
Read in 59238 paired-sequences, output 58158 (98.2%) filtered paired-sequences.
Read in 93831 paired-sequences, output 92128 (98.2%) filtered paired-sequences.
Read in 52959 paired-sequences, output 51805 (97.8%) filtered paired-sequences.
Read in 74834 paired-sequences, output 73249 (97.9%) filtered paired-sequences.
Read in 37658 paired-sequences, output 36927 (98.1%) filtered paired-sequences.
Read in 80126 paired-sequences, output 78390 (97.8%) filtered paired-sequences.
Read in 57841 paired-sequences, output 56508 (97.7%) filtered paired-sequences.
Read in 85242 paired-sequences, output 83164 (97.6%) filtered paired-sequences.
Read in 78644 paired-sequences, output 77376 (98.4%) filtered paired-sequences.
Read in 76579 paired-sequences, output 75195 (98.2%) filtered paired-sequences.
Read in 51168 paired-sequences, output 50240 (98.2%) filtered paired-sequences.
Read in 87143 paired-sequences, output 85348 (97.9%) filtered paired-sequences.
Read in 55411 paired-sequences, output 53800 (97.1%) filtered paired-sequences.
Read in 45036 paired-sequences, output 44389 (98.6%) filtered paired-sequences.
Read in 59258 paired-sequences, output 58435 (98.6%) filtered paired-sequences.
Read in 45191 paired-sequences, output 44465 (98.4%) filtered paired-sequences.
Read in 53300 paired-sequences, output 52447 (98.4%) filtered paired-sequences.
Read in 49048 paired-sequences, output 48176 (98.2%) filtered paired-sequences.
Read in 52756 paired-sequences, output 51769 (98.1%) filtered paired-sequences.
Read in 46326 paired-sequences, output 45563 (98.4%) filtered paired-sequences.
Read in 66472 paired-sequences, output 65141 (98%) filtered paired-sequences.
Read in 97354 paired-sequences, output 95371 (98%) filtered paired-sequences.
Read in 94996 paired-sequences, output 93042 (97.9%) filtered paired-sequences.
Read in 93068 paired-sequences, output 91097 (97.9%) filtered paired-sequences.
Read in 62910 paired-sequences, output 60924 (96.8%) filtered paired-sequences.
Read in 80765 paired-sequences, output 79449 (98.4%) filtered paired-sequences.
Read in 64129 paired-sequences, output 62586 (97.6%) filtered paired-sequences.
Read in 35010 paired-sequences, output 33711 (96.3%) filtered paired-sequences.
Read in 55881 paired-sequences, output 54801 (98.1%) filtered paired-sequences.
Read in 53079 paired-sequences, output 52078 (98.1%) filtered paired-sequences.
Read in 96270 paired-sequences, output 94447 (98.1%) filtered paired-sequences.
Read in 91114 paired-sequences, output 89434 (98.2%) filtered paired-sequences.
Read in 37844 paired-sequences, output 37037 (97.9%) filtered paired-sequences.
Read in 102657 paired-sequences, output 100302 (97.7%) filtered paired-sequences.
Read in 46760 paired-sequences, output 45646 (97.6%) filtered paired-sequences.
Read in 66909 paired-sequences, output 65583 (98%) filtered paired-sequences.
Read in 49918 paired-sequences, output 48846 (97.9%) filtered paired-sequences.
Read in 70474 paired-sequences, output 69086 (98%) filtered paired-sequences.
Read in 66055 paired-sequences, output 64778 (98.1%) filtered paired-sequences.
Read in 42272 paired-sequences, output 41438 (98%) filtered paired-sequences.
Read in 50782 paired-sequences, output 49816 (98.1%) filtered paired-sequences.
Read in 37856 paired-sequences, output 37178 (98.2%) filtered paired-sequences.
Read in 75504 paired-sequences, output 74003 (98%) filtered paired-sequences.
Read in 25950 paired-sequences, output 25353 (97.7%) filtered paired-sequences.
Read in 687754 paired-sequences, output 673345 (97.9%) filtered paired-sequences.
Read in 47830 paired-sequences, output 46719 (97.7%) filtered paired-sequences.
Read in 101418 paired-sequences, output 98742 (97.4%) filtered paired-sequences.
Read in 50211 paired-sequences, output 48933 (97.5%) filtered paired-sequences.
Read in 92561 paired-sequences, output 90440 (97.7%) filtered paired-sequences.
Read in 79009 paired-sequences, output 76801 (97.2%) filtered paired-sequences.
Read in 46475 paired-sequences, output 45429 (97.7%) filtered paired-sequences.
Read in 64475 paired-sequences, output 63072 (97.8%) filtered paired-sequences.
Read in 38830 paired-sequences, output 37861 (97.5%) filtered paired-sequences.
Read in 50498 paired-sequences, output 49435 (97.9%) filtered paired-sequences.
Read in 78337 paired-sequences, output 76672 (97.9%) filtered paired-sequences.
Read in 46169 paired-sequences, output 44994 (97.5%) filtered paired-sequences.
Read in 54602 paired-sequences, output 53392 (97.8%) filtered paired-sequences.
Read in 99293 paired-sequences, output 97149 (97.8%) filtered paired-sequences.
Read in 78400 paired-sequences, output 76533 (97.6%) filtered paired-sequences.
Read in 68330 paired-sequences, output 66759 (97.7%) filtered paired-sequences.
Read in 54567 paired-sequences, output 53552 (98.1%) filtered paired-sequences.
Read in 91681 paired-sequences, output 89670 (97.8%) filtered paired-sequences.
Read in 48717 paired-sequences, output 47703 (97.9%) filtered paired-sequences.
Read in 44665 paired-sequences, output 43702 (97.8%) filtered paired-sequences.
Read in 113114 paired-sequences, output 110404 (97.6%) filtered paired-sequences.
Read in 96 paired-sequences, output 71 (74%) filtered paired-sequences.
Read in 55164 paired-sequences, output 53920 (97.7%) filtered paired-sequences.
Read in 92419 paired-sequences, output 90039 (97.4%) filtered paired-sequences.
Read in 78548 paired-sequences, output 76931 (97.9%) filtered paired-sequences.
Read in 69966 paired-sequences, output 68382 (97.7%) filtered paired-sequences.
Read in 228 paired-sequences, output 192 (84.2%) filtered paired-sequences.
Read in 110552 paired-sequences, output 108523 (98.2%) filtered paired-sequences.
Read in 61279 paired-sequences, output 60118 (98.1%) filtered paired-sequences.
Read in 61732 paired-sequences, output 60611 (98.2%) filtered paired-sequences.
Read in 96531 paired-sequences, output 94696 (98.1%) filtered paired-sequences.
Read in 92917 paired-sequences, output 90922 (97.9%) filtered paired-sequences.
Read in 97231 paired-sequences, output 95265 (98%) filtered paired-sequences.
Read in 72380 paired-sequences, output 71015 (98.1%) filtered paired-sequences.
Read in 92020 paired-sequences, output 90456 (98.3%) filtered paired-sequences.
Read in 84005 paired-sequences, output 82219 (97.9%) filtered paired-sequences.
Read in 104732 paired-sequences, output 102697 (98.1%) filtered paired-sequences.
Read in 89812 paired-sequences, output 88062 (98.1%) filtered paired-sequences.
Read in 110048 paired-sequences, output 107666 (97.8%) filtered paired-sequences.
Read in 37102 paired-sequences, output 36303 (97.8%) filtered paired-sequences.
Read in 57836 paired-sequences, output 56714 (98.1%) filtered paired-sequences.
Read in 71329 paired-sequences, output 69812 (97.9%) filtered paired-sequences.
Read in 73067 paired-sequences, output 71738 (98.2%) filtered paired-sequences.
Read in 45990 paired-sequences, output 45111 (98.1%) filtered paired-sequences.
Read in 43900 paired-sequences, output 42869 (97.7%) filtered paired-sequences.
Read in 44497 paired-sequences, output 43591 (98%) filtered paired-sequences.
Read in 55302 paired-sequences, output 54178 (98%) filtered paired-sequences.
Read in 38277 paired-sequences, output 37329 (97.5%) filtered paired-sequences.
Read in 39915 paired-sequences, output 39079 (97.9%) filtered paired-sequences.
Read in 60205 paired-sequences, output 58987 (98%) filtered paired-sequences.
Read in 75449 paired-sequences, output 73564 (97.5%) filtered paired-sequences.
Read in 45301 paired-sequences, output 44278 (97.7%) filtered paired-sequences.
Read in 45517 paired-sequences, output 44472 (97.7%) filtered paired-sequences.
Read in 86490 paired-sequences, output 84506 (97.7%) filtered paired-sequences.

# ---- 16S ----
[1] ‘1.30.0’
Creating output directory: cutadapted_16S/filtered_R1
Creating output directory:cutadapted_16S/filtered_R2
Read in 81868 paired-sequences, output 70998 (86.7%) filtered paired-sequences.
Read in 43465 paired-sequences, output 37518 (86.3%) filtered paired-sequences.
Read in 57576 paired-sequences, output 49745 (86.4%) filtered paired-sequences.
Read in 46149 paired-sequences, output 22206 (48.1%) filtered paired-sequences.
Read in 197875 paired-sequences, output 168938 (85.4%) filtered paired-sequences.
Read in 230646 paired-sequences, output 201375 (87.3%) filtered paired-sequences.
Read in 83476 paired-sequences, output 73070 (87.5%) filtered paired-sequences.
Read in 106248 paired-sequences, output 94357 (88.8%) filtered paired-sequences.
Read in 64478 paired-sequences, output 56687 (87.9%) filtered paired-sequences.
Read in 147546 paired-sequences, output 128864 (87.3%) filtered paired-sequences.
Read in 156421 paired-sequences, output 135014 (86.3%) filtered paired-sequences.
Read in 155426 paired-sequences, output 133291 (85.8%) filtered paired-sequences.
Read in 170440 paired-sequences, output 146158 (85.8%) filtered paired-sequences.
Read in 87769 paired-sequences, output 76695 (87.4%) filtered paired-sequences.
Read in 116783 paired-sequences, output 100378 (86%) filtered paired-sequences.
Read in 104601 paired-sequences, output 90289 (86.3%) filtered paired-sequences.
Read in 89691 paired-sequences, output 75068 (83.7%) filtered paired-sequences.
Read in 131656 paired-sequences, output 111782 (84.9%) filtered paired-sequences.
Read in 210640 paired-sequences, output 177041 (84%) filtered paired-sequences.
Read in 218950 paired-sequences, output 181668 (83%) filtered paired-sequences.
Read in 104352 paired-sequences, output 91933 (88.1%) filtered paired-sequences.
Read in 70199 paired-sequences, output 58848 (83.8%) filtered paired-sequences.
Read in 109548 paired-sequences, output 86202 (78.7%) filtered paired-sequences.
Read in 106843 paired-sequences, output 91557 (85.7%) filtered paired-sequences.
Read in 99343 paired-sequences, output 84103 (84.7%) filtered paired-sequences.
Read in 68186 paired-sequences, output 57540 (84.4%) filtered paired-sequences.
Read in 137489 paired-sequences, output 113894 (82.8%) filtered paired-sequences.
Read in 87680 paired-sequences, output 76565 (87.3%) filtered paired-sequences.
Read in 104797 paired-sequences, output 90842 (86.7%) filtered paired-sequences.
Read in 106061 paired-sequences, output 90058 (84.9%) filtered paired-sequences.
Read in 112314 paired-sequences, output 95034 (84.6%) filtered paired-sequences.
Read in 110075 paired-sequences, output 92684 (84.2%) filtered paired-sequences.
Read in 142284 paired-sequences, output 122560 (86.1%) filtered paired-sequences.
Read in 117030 paired-sequences, output 99983 (85.4%) filtered paired-sequences.
Read in 83004 paired-sequences, output 66003 (79.5%) filtered paired-sequences.
Read in 100102 paired-sequences, output 84709 (84.6%) filtered paired-sequences.
Read in 115385 paired-sequences, output 98665 (85.5%) filtered paired-sequences.
Read in 108998 paired-sequences, output 84706 (77.7%) filtered paired-sequences.
Read in 97697 paired-sequences, output 85634 (87.7%) filtered paired-sequences.
Read in 146495 paired-sequences, output 123186 (84.1%) filtered paired-sequences.
Read in 125068 paired-sequences, output 104228 (83.3%) filtered paired-sequences.
Read in 95073 paired-sequences, output 79830 (84%) filtered paired-sequences.
Read in 107716 paired-sequences, output 85963 (79.8%) filtered paired-sequences.
Read in 104179 paired-sequences, output 85829 (82.4%) filtered paired-sequences.
Read in 104378 paired-sequences, output 90404 (86.6%) filtered paired-sequences.
Read in 100915 paired-sequences, output 87231 (86.4%) filtered paired-sequences.
Read in 70839 paired-sequences, output 61626 (87%) filtered paired-sequences.
Read in 103779 paired-sequences, output 89499 (86.2%) filtered paired-sequences.
Read in 80459 paired-sequences, output 69998 (87%) filtered paired-sequences.
Read in 53195 paired-sequences, output 46441 (87.3%) filtered paired-sequences.
Read in 73890 paired-sequences, output 64558 (87.4%) filtered paired-sequences.
Read in 45422 paired-sequences, output 38340 (84.4%) filtered paired-sequences.
Read in 86131 paired-sequences, output 70791 (82.2%) filtered paired-sequences.
Read in 36659 paired-sequences, output 30330 (82.7%) filtered paired-sequences.
Read in 1020636 paired-sequences, output 890205 (87.2%) filtered paired-sequences.
Read in 61639 paired-sequences, output 53489 (86.8%) filtered paired-sequences.
Read in 149775 paired-sequences, output 128690 (85.9%) filtered paired-sequences.
Read in 69245 paired-sequences, output 59031 (85.2%) filtered paired-sequences.
Read in 123481 paired-sequences, output 106335 (86.1%) filtered paired-sequences.
Read in 144943 paired-sequences, output 123300 (85.1%) filtered paired-sequences.
Read in 57247 paired-sequences, output 49100 (85.8%) filtered paired-sequences.
Read in 94696 paired-sequences, output 81376 (85.9%) filtered paired-sequences.
Read in 61054 paired-sequences, output 52806 (86.5%) filtered paired-sequences.
Read in 70786 paired-sequences, output 59461 (84%) filtered paired-sequences.
Read in 111418 paired-sequences, output 95182 (85.4%) filtered paired-sequences.
Read in 66641 paired-sequences, output 57878 (86.9%) filtered paired-sequences.
Read in 89275 paired-sequences, output 77365 (86.7%) filtered paired-sequences.
Read in 14026 paired-sequences, output 12062 (86%) filtered paired-sequences.
Read in 124091 paired-sequences, output 107478 (86.6%) filtered paired-sequences.
Read in 135865 paired-sequences, output 117058 (86.2%) filtered paired-sequences.
Read in 57133 paired-sequences, output 50619 (88.6%) filtered paired-sequences.
Read in 144291 paired-sequences, output 128562 (89.1%) filtered paired-sequences.
Read in 72266 paired-sequences, output 62684 (86.7%) filtered paired-sequences.
Read in 64325 paired-sequences, output 56052 (87.1%) filtered paired-sequences.
Read in 143308 paired-sequences, output 122579 (85.5%) filtered paired-sequences.
Read in 122995 paired-sequences, output 100196 (81.5%) filtered paired-sequences.
Read in 72233 paired-sequences, output 60073 (83.2%) filtered paired-sequences.
Read in 121640 paired-sequences, output 102855 (84.6%) filtered paired-sequences.
Read in 125526 paired-sequences, output 108366 (86.3%) filtered paired-sequences.
Read in 100392 paired-sequences, output 85640 (85.3%) filtered paired-sequences.
Read in 121633 paired-sequences, output 102923 (84.6%) filtered paired-sequences.
Read in 158454 paired-sequences, output 137517 (86.8%) filtered paired-sequences.
Read in 95291 paired-sequences, output 81605 (85.6%) filtered paired-sequences.
Read in 82398 paired-sequences, output 70457 (85.5%) filtered paired-sequences.
Read in 119979 paired-sequences, output 98876 (82.4%) filtered paired-sequences.
Read in 104955 paired-sequences, output 88689 (84.5%) filtered paired-sequences.
Read in 131962 paired-sequences, output 113282 (85.8%) filtered paired-sequences.
Read in 113428 paired-sequences, output 99979 (88.1%) filtered paired-sequences.
Read in 118000 paired-sequences, output 103270 (87.5%) filtered paired-sequences.
Read in 97967 paired-sequences, output 84645 (86.4%) filtered paired-sequences.
Read in 148288 paired-sequences, output 127233 (85.8%) filtered paired-sequences.
Read in 119672 paired-sequences, output 101204 (84.6%) filtered paired-sequences.
Read in 129246 paired-sequences, output 110209 (85.3%) filtered paired-sequences.
Read in 69946 paired-sequences, output 60480 (86.5%) filtered paired-sequences.
Read in 70772 paired-sequences, output 61906 (87.5%) filtered paired-sequences.
Read in 90353 paired-sequences, output 80184 (88.7%) filtered paired-sequences.
Read in 77852 paired-sequences, output 67520 (86.7%) filtered paired-sequences.
Read in 60525 paired-sequences, output 52079 (86%) filtered paired-sequences.
Read in 69867 paired-sequences, output 60784 (87%) filtered paired-sequences.
Read in 75829 paired-sequences, output 64567 (85.1%) filtered paired-sequences.
Read in 77920 paired-sequences, output 66082 (84.8%) filtered paired-sequences.
Read in 59361 paired-sequences, output 48932 (82.4%) filtered paired-sequences.
Read in 59396 paired-sequences, output 50137 (84.4%) filtered paired-sequences.
Read in 64692 paired-sequences, output 55122 (85.2%) filtered paired-sequences.
Read in 115575 paired-sequences, output 99515 (86.1%) filtered paired-sequences.
Read in 65049 paired-sequences, output 56286 (86.5%) filtered paired-sequences.
Read in 54340 paired-sequences, output 46639 (85.8%) filtered paired-sequences.
Read in 107117 paired-sequences, output 94036 (87.8%) filtered paired-sequences.
156313773 total bases in 550780 reads from 6 samples will be used for learning the error rates.
153744630 total bases in 550780 reads from 6 samples will be used for learning the error rates.
Processing: A1-1 


#--------------------


#wc -l cutadapted_yycH/filtered_R1$ vim Extraction-control-2_R1.fastq.gz #-->2696
#Read in 1138 paired-sequences, output 674 (59.2%) filtered paired-sequences.
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 -->600 sequences
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0  #-->after chimera-removing, only 107 sequences
#Processing: Extraction-control-2
#Sample 1 - 674 reads in 209 unique sequences (What does the unique sequences mean???).  #merged sequences are 209
#Sample 1 - 674 reads in 280 unique sequences.
#"";"input_read_count"*; "filtered_and_trimmed_read_count";"merged_after_dada2_read_count"; "non-chimeric_read_count"*
#"Extraction-control-2"; 1138*; 674;652; 652*


biom convert -i table.txt -o table.from_txt_json.biom --table-type="OTU table" --to-json
summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
summarize_taxa.py -i otu_table.biom -o ./tax


#-------------------------------------------------------------------------------------------------
#---STEP6(IGNORED!) stitching and removing chimeras (see ~/DATA/Data_Holger_Epidome/epidome/scripts/Combine_and_Remove_Chimeras_yycH.R) ----
#my_Combine_and_Remove_Chimeras_g216.R is a part of my_EPIDOME_yycH_runwise_pipeline.R (see lines 53-55)

#zcat A10-1_R1.fastq.gz | echo $((`wc -l`/4))
#121007 > 37158 + 36009 + 46149 = 119316
#458392 > 82145 + 175221 + 197875 = 455241

#(damian2) jhuang@hamburg:~/DATA/Data_Holger_Epidome_myData2/raw_data$ zcat A2-1_R1.fastq.gz | echo $((`wc -l`/4))
#308337


#g216_track.csv
"A10-1";36009;493;367;367
"A10-2";175221;82727;30264;27890
"A10-3";110170;36812;13715;13065
"A10-4";142323;64398;24306;21628

#yycH_track.csv
"A10-1";37158;549;0;0     pandaseq:36791
"A10-2";82145;23953;6180;5956
"A10-3";53438;12480;2944;2944
"A10-4";64516;18361;12350;11797

#16S_track.csv
"A10-1";46149(in cutadapted_16S);13(in filtered_R1);8;8
"A10-2";197875;2218;1540;1391
"A10-3";230646;2429;1819;1752
"A10-4";175759;2001;1439;1366


# --------------------------------------------------------------------------
# ---- STEP7 Classification: epidome/scripts/ASV_blast_classification.py ----
#Input: g216_seqtab_nochim.csv using DATABASE epidome/DB/g216_ref_aln.fasta
#Output: g216_seqtab_ASV_seqs.fasta, g216_seqtab_ASV_blast.txt and g216_seqtab.csv.classified.csv

ln -s /home/jhuang/Tools/epidome .

python3 epidome/scripts/ASV_blast_classification.py   yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta  epidome/DB/yycH_ref_aln.fasta  yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
python3 epidome/scripts/ASV_blast_classification.py   g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta  epidome/DB/g216_ref_aln.fasta  g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 99.5


#old: python3 epidome/scripts/ASV_blast_classification.py   yycH_seqtab.csv yycH_seqtab.csv.ASV_seqs.fasta  epidome/DB/yycH_ref_aln.fasta yycH_seqtab.csv.ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
#old: python3 epidome/scripts/ASV_blast_classification_combined.py -p1 190920_run1_yycH_seqtab_from_dada2.csv -p2 190920_run1_G216_seqtab_from_dada2.csv -p1_ref epidome/DB/yycH_ref_aln.fasta -p2_ref epidome/DB/g216_ref_aln.fasta

##rename "seqseq2" --> seq2
#sed -i -e s/seq//g 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#sed -i -e s/seqseq/seq/g 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#diff 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
## WHY: 667 seqs in old calculation, but in our calculation only 108 seqs
## They took *_seqtab_from_dada2.csv, but we took *_seqtab_nochim.csv. (653 vs 108 records!)
##AAAT";"seq37,36";0;
sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt
sed -i -e s/seq//g g216_seqtab_ASV_blast.txt
#;-->""
sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
#;,seq --> ,seq
#;"; --> ";
sed -i -e s/";,seq"/",seq"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";,seq"/",seq"/g g216_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv

#"ASV";"Seq_number";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
#"ASV";"Seq_number";"Extraction_control_1";"Extraction_control_2";"P01_nose_1";"P01_nose_2";"P01_skin_1";"P01_skin_2";"P02_nose_1";"P02_nose_2";"P02_skin_1";"P02_skin_2";"P03_nose_1";"P03_nose_2";"P03_skin_1";"P03_skin_2";"P04_nose_1";"P04_nose_2";"P04_skin_1";"P04_skin_2";"P05_nose_1";"P05_nose_2";"P05_skin_1";"P05_skin_2";"P06_nose_1";"P06_nose_2";"P06_skin_1";"P06_skin_2";"P07_nose_1";"P07_nose_2";"P07_skin_1";"P07_skin_2";"P08_nose_1";"P08_nose_2";"P08_skin_1";"P08_skin_2";"P09_nose_1";"P09_nose_2";"P09_skin_1";"P09_skin_2";"P10_nose_1";"P10_nose_2";"P10_skin_1";"P10_skin_2";"P11_nose_1";"P11_nose_2";"P11_skin_1";"P11_skin_2";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"

#[OPTIONAL] grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv
#[OPTIONAL] grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv

#https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
#change seq24,seq21 --> seq24,21; seq26,seq24,seq23,seq22 --> seq26,24,23,22
sed -i -e s/",seq"/","/g yycH_seqtab.csv.classified.csv
sed -i -e s/",seq"/","/g g216_seqtab.csv.classified.csv


#ALTERLATE: line 698 + line 711 = 'sed -i -e s/";,seq"/","/g yycH_seqtab.csv.classified.csv'
#ALTERLATE: line 699 + line 712 = 'sed -i -e s/";,seq"/","/g g216_seqtab.csv.classified.csv'

#[OPTIONAL] TO reduce the unclassified, rename seq31,30 --> seq31 in g216,  seq37,36 --> seq37 in yycH.
#Why record 2 are seq37,36 due to the following!
#2       37      100.000 476     0       0       1       476     22      497     0.0     880
#2       36      100.000 476     0       0       1       476     22      497     0.0     880


#[IMPORTANT_COMPARISON]: compare the format to epidome/example_data/190920_run1_and_2_yycH_seqtab_nochim.csv.classified.csv and 190920_run1_and_2_yycH_seqtab_nochim.csv.ASV_blast.txt, 190920_run1_and_2_G216_seqtab_nochim.csv.ASV_seqs.fasta
#in classified.csv: "seq37,36" or NA; 
#in fasta: >1; >2; 
#in blast.txt: 1       37      100.000 476     0       0       1       476     22      497     0.0     880

#Results: g216_seqtab.csv.classified_noNA_seq31_or_30.csv and yycH_seqtab.csv.classified_noNA_seq37_or_36.csv


# ---------------------------------------------------------------------------------------------------
# ------------------- STEP8: draw plot from two amplicons ---------------------------------------
Taxonomic database setup and classification
- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB. 
- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
- ST classification of samples was performed using the g216 target sequence as the primary identifier. 
- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”. --> Since in our data, yycH reads in H60.1 and H64.3 are very low, however, the reads of g216 of the two samples are pretty, if g216 are unique, they still can be assigned to the ST-classes!

#The resulting taxonomic count tables are provided at https://github.

#~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$';' -o counts.xls


#under r4-base
source("epidome/scripts/epidome_functions.R")

ST_amplicon_table = read.table("epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")

# ---- read original classified.csv (Option) ----
epi01_table = read.table("g216_seqtab.csv.classified.csv",sep = ";",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified.csv",sep = ";",header=TRUE,row.names=1)
#> sum(epi01_table$A2.1)
#[1] 75117
#> sum(epi02_table$A2.1)
#[1] 44813
# ---- To reduce the unclassified reads, try seq31 and seq37 (Advanced Option) ----
#seq31 combine seq37
#--> 30 Class!
#seq30 combine seq37
#seq31 combine seq36
#seq30 combine seq36
#sed -e s/seq31,30/seq31/g g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_seq31.csv
#sed -e s/seq37,36/seq37/g yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_seq37.csv
epi01_table = read.table("g216_seqtab.csv.classified_seq31.csv",sep = ";",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified_seq37.csv",sep = ";",header=TRUE,row.names=1)


metadata_table = read.table("metadata.txt",header=TRUE,row.names=1)
metadata_table$patient.ID <- factor(metadata_table$patient.ID,levels=c("A1","A2","A3","A4","A5","A10","A12","A17","A20","A21","A22","A24","A25","A27","A28","H11","H13","H14","H16","H19","H24","H53","H56","H57","H59","H60","H64","H65","H67","H72","H73","H76","H78","H91","H94","H99"))
epidome_object = setup_epidome_object(epi01_table,epi02_table,metadata_table = metadata_table)

eo_ASV_combined = combine_ASVs_epidome(epidome_object)
eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)
#TODO1 in REPORT: [1] "2 low count samples removed from data: H60.1 H64.3"

count_table = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#count_df_ordered = count_table[order(rowSums(count_table),decreasing = T),]
write.csv(file="count_table_seq31_seq37.txt", count_table)

#-->33 ST and "Unclassified",32974,24659,23884,52064,63733,77343,2711,2737,4341,77872,79470,68072,162,31495,27483,1961,8303,2473,7097,5033,6298,612,6376,3784,12680,6960,5997,2050,2089,290,62292,49749,32765,1350,6675,1033,1678,70,3637,3103,6728,5204,2385,3403,4839,8927,11550,5943,14397,8168,2106,1333,19404,773,17451,908,2701,1722,2739,2241,10309,22460,2237,9246,12036,40361,1765,1945,1683,3318,883,886,2301,2144,8839,3598,2364,3491,10667,7534,5074,3713,4606,2051,7974,2635,3068,3165,2128,1493,8029,13728,11844,1054,1947,295,7579,3461,4408,2360,3035,1949,780,1843,156,4936,3169,8881
#vs.
#-->34 ST and "Unclassified",2125,1252,1121,1539,4771,383,2245,2269,3126,1011,705,0,122,8960,3345,1029,2276,754,2497,3652,2304,251,2732,3275,3190,1527,2559,1856,1874,205,6139,4363,773,457,3212,429,1552,70,3328,2465,1717,4253,2369,3403,4812,1407,1630,1197,2322,637,420,558,1715,626,10247,777,2382,1042,1863,1628,3864,7037,1267,3666,4018,594,730,1003,1303,1722,759,833,1503,1574,7176,923,3,1436,3530,688,1208,2667,3294,781,3524,1784,2724,2767,2046,1417,1417,1154,1652,816,1657,295,3257,662,316,0,696,0,603,1442,156,2072,720,4022

# ---- Image1 ----
primer_compare = compare_primer_output(epidome_object,color_variable = "patient.type")
png("image1.png", width=1000, height=1000)
primer_compare$plot
dev.off()


TODO:
What are 307 reads and 50518 sequences? How are those sequences defined? Read the paper to understand the two concepts ("-" and "Unclassified")!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
If 31,30 --> Unclassified, 
          ST Group epi01_ASV epi02_ASV freq
865555   278    37         5         3    4
865641    32    38         5         3    3
1276441   32    38         5         4    2
5391686  184   130         5        14    1
12373592 549   108         5        31    1
13601043 130    46         5        34   13
13605992 549   108         5        34    1
14429286 184   130         5        36    4
17299835 200    66         5        43    1
"2939275""-"    27         7         8    1
If 7 combined to 8, then it is --> Novel. 


#install.packages("pls")
#library(pls)
#install.packages("reshape")
#library(reshape)
#install.packages("vegan")
library('vegan') 
library(scales)
library(RColorBrewer)


#Image2
#TODO: find out what are the combination 21006 in Aachen?


source("epidome/scripts/epidome_functions.R")
#count_table = count_table[-29,]
#row.names(count_table) <- c("-", "ST297", "ST170", "ST73", "ST225", "ST673", "ST215", "ST19", "Unclassified")
#row.names(count_table) <- c("NA", "-", "X297", "X170", "X73", "X225", "X673", "X215", "X19", "Unclassified")
row.names(count_table) <- c("ST73","ST2","ST88","ST293","ST59","ST130","ST200","ST278","ST184","ST8","ST5","ST83","ST297","ST215","-","ST60","ST384","ST14","ST210","ST20","ST218","ST19","ST100","ST170","ST673","ST329","ST331","ST87","ST23","ST626","ST225","ST691","ST355","Unclassified") #c("ST130","ST200","ST278","ST184","ST8","ST73","ST2","ST88","ST293","ST59","ST83","ST5","ST297","-","ST331","ST215","ST100","ST290","ST218","ST384","ST14","ST210","ST60","ST170","ST19","ST329","ST673","ST691","ST355","Unclassified")
colnames(count_table) <- c("A1.1","A1.2","A1.3","A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A5.1","A5.2","A5.3","A10.1","A10.2","A10.3","A12.1","A12.2","A12.3","A17.1","A17.2","A17.3","A20.1","A20.2","A20.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3","H11.1","H11.2","H11.3","H13.1","H13.2","H13.3","H14.1","H14.2","H14.3","H16.1","H16.2","H16.3","H19.1","H19.2","H19.3","H24.1","H24.2","H24.3","H53.1","H53.2","H53.3","H56.1","H56.2","H56.3","H57.1","H57.2","H57.3","H59.1","H59.2","H59.3","H60.1","H60.2","H60.3","H64.1","H64.2","H64.3","H65.1","H65.2","H65.3","H67.1","H67.2","H67.3","H72.1","H72.2","H72.3","H73.1","H73.2","H73.3","H76.1","H76.2","H76.3","H78.1","H78.2","H78.3","H91.1","H91.2","H91.3","H94.1","H94.2","H94.3","H99.1","H99.2","H99.3")
col_order <- c("A1.1","A1.2","A1.3","A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A5.1","A5.2","A5.3","A10.1","A10.2","A10.3","A12.1","A12.2","A12.3","A17.1","A17.2","A17.3","A20.1","A20.2","A20.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3","H11.1","H11.2","H11.3","H13.1","H13.2","H13.3","H14.1","H14.2","H14.3","H16.1","H16.2","H16.3","H19.1","H19.2","H19.3","H24.1","H24.2","H24.3","H53.1","H53.2","H53.3","H56.1","H56.2","H56.3","H57.1","H57.2","H57.3","H59.1","H59.2","H59.3","H60.1","H60.2","H60.3","H64.1","H64.2","H64.3","H65.1","H65.2","H65.3","H67.1","H67.2","H67.3","H72.1","H72.2","H72.3","H73.1","H73.2","H73.3","H76.1","H76.2","H76.3","H78.1","H78.2","H78.3","H91.1","H91.2","H91.3","H94.1","H94.2","H94.3","H99.1","H99.2","H99.3")
count_table_reordered <- count_table[,col_order]

write.csv(file="count_table_reordered.txt", count_table_reordered)
#NOTE to change rowname from '-' to 'Novel'

p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
#p = make_barplot_epidome(count_table_reordered,reorder=TRUE,normalize=TRUE)
png("Barplot_All.png", width=1600, height=900)
p
dev.off()


#Image3
eo_aneurysm = prune_by_variable_epidome(epidome_object,"patient.type",c("aneurysm"))
eo_hypophysis = prune_by_variable_epidome(epidome_object,"patient.type",c("hypophysis"))

epidome_object_aneurysm_norm = normalize_epidome_object(eo_aneurysm) ### Normalize counts to percent
epidome_object_hypophysis_norm = normalize_epidome_object(eo_hypophysis)

png("PCA_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient ID")
dev.off()
png("PCA_by_patientType.png", width=1200, height=800)
#PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "patient.type",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored = plot_PCA_epidome(eo_filtered, color_variable = "patient.type", colors=c("red", "blue"), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient type")
dev.off()
png("PCA_by_sampleTime.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "sample.time",colors=c("red", "blue", "green"), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient ID")
dev.off()


png("PCA_aneurysm_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(epidome_object_aneurysm_norm, color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of aneurysm samples colored by patient ID")
dev.off()

# Assuming epidome_object_hypophysis_norm$p1_table and p2_table are the data matrices
clean_p1_table <- na.omit(epidome_object_hypophysis_norm$p1_table)
clean_p2_table <- na.omit(epidome_object_hypophysis_norm$p2_table)
# Update your epidome object with the cleaned tables
epidome_object_hypophysis_norm$p1_table <- clean_p1_table
epidome_object_hypophysis_norm$p2_table <- clean_p2_table

png("PCA_hypophysis_by_patientID.png", width=1200, height=800)
PCA_sample_site_colored = plot_PCA_epidome(epidome_object_hypophysis_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_sample_site_colored + ggtitle("PCA plot of hypophysis samples colored by patient ID")
dev.off()

png("PCA_aneurysm_by_sampleTime.png", width=1200, height=800)
#unique_sample_times <- length(unique(epidome_object_aneurysm_norm$metadata[,"sample.time"]))
#PCA_sample_time_colored = plot_PCA_epidome(epidome_object_aneurysm_norm, color_variable = "sample.time", colors = c("red", "blue", "green", "purple"), plot_ellipse = FALSE)
PCA_sample_time_colored = plot_PCA_epidome(epidome_object_aneurysm_norm, color_variable = "sample.time", colors = c("red", "blue", "green"), plot_ellipse = FALSE)
PCA_sample_time_colored + ggtitle("PCA plot of aneurysm samples colored by sample time")
dev.off()
png("PCA_hypophysis_by_sampleTime.png", width=1200, height=800)
PCA_sample_time_colored = plot_PCA_epidome(epidome_object_hypophysis_norm, color_variable = "sample.time", colors = c("red", "blue", "green"), plot_ellipse = FALSE)
PCA_sample_time_colored + ggtitle("PCA plot of hypophysis samples colored by sample time")
dev.off()

#Image4
eo_filter_lowcount = filter_lowcount_samples_epidome(epidome_object,p1_threshold = 500,p2_threshold = 500)
eo_filter_ASVs = epidome_filtered_ASVs = filter_lowcount_ASVs_epidome(epidome_object,percent_threshold = 1)
epidome_object_normalized = normalize_epidome_object(epidome_object)
epidome_object_ASV_combined = combine_ASVs_epidome(epidome_object)
epidome_object_aneurysm = prune_by_variable_epidome(epidome_object,variable_name = "patient.type",variable_values = c("aneurysm"))
epidome_object_hypophysis= prune_by_variable_epidome(epidome_object,variable_name = "patient.type",variable_values = c("hypophysis"))

eo_ASV_combined = combine_ASVs_epidome(epidome_object_aneurysm)
count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
png("Barplot_aneurysm.png", width=1200, height=800)
p
dev.off()

eo_ASV_combined = combine_ASVs_epidome(epidome_object_hypophysis)
count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#colnames(count_table_reordered) <- c("LM.Nose","LM.Foot","LZ.Nose","LZ.Foot",...)
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
png("Barplot_hypophysis.png", width=1200, height=800)
p
dev.off()


# -------------------------------------------------------------------------------------
# -------------------------------- The methods for 16S --------------------------------
#16S_plots/taxa_summary/taxa_summary_plots/bar_charts.html

## 4, stitch
```sh
mkdir pandaseq.out
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300   -w pandaseq.out/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq"; done
```
pandaseq -f cutadapted_16S/filtered_R1/A10-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A10-1_R2.fastq.gz -l 300   -w pandaseq.out/A10-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A10-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A10-2_R2.fastq.gz -l 300   -w pandaseq.out/A10-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A10-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A10-3_R2.fastq.gz -l 300   -w pandaseq.out/A10-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A1-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A1-1_R2.fastq.gz -l 300   -w pandaseq.out/A1-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A12-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A12-1_R2.fastq.gz -l 300   -w pandaseq.out/A12-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A12-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A12-2_R2.fastq.gz -l 300   -w pandaseq.out/A12-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A12-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A12-3_R2.fastq.gz -l 300   -w pandaseq.out/A12-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A1-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A1-2_R2.fastq.gz -l 300   -w pandaseq.out/A1-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A1-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A1-3_R2.fastq.gz -l 300   -w pandaseq.out/A1-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A17-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A17-1_R2.fastq.gz -l 300   -w pandaseq.out/A17-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A17-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A17-2_R2.fastq.gz -l 300   -w pandaseq.out/A17-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A17-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A17-3_R2.fastq.gz -l 300   -w pandaseq.out/A17-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A20-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A20-1_R2.fastq.gz -l 300   -w pandaseq.out/A20-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A20-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A20-2_R2.fastq.gz -l 300   -w pandaseq.out/A20-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A20-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A20-3_R2.fastq.gz -l 300   -w pandaseq.out/A20-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A21-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A21-1_R2.fastq.gz -l 300   -w pandaseq.out/A21-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A21-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A21-2_R2.fastq.gz -l 300   -w pandaseq.out/A21-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A21-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A21-3_R2.fastq.gz -l 300   -w pandaseq.out/A21-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A2-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A2-1_R2.fastq.gz -l 300   -w pandaseq.out/A2-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A22-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A22-1_R2.fastq.gz -l 300   -w pandaseq.out/A22-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A22-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A22-2_R2.fastq.gz -l 300   -w pandaseq.out/A22-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A22-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A22-3_R2.fastq.gz -l 300   -w pandaseq.out/A22-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A2-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A2-2_R2.fastq.gz -l 300   -w pandaseq.out/A2-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A2-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A2-3_R2.fastq.gz -l 300   -w pandaseq.out/A2-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A24-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A24-1_R2.fastq.gz -l 300   -w pandaseq.out/A24-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A24-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A24-2_R2.fastq.gz -l 300   -w pandaseq.out/A24-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A24-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A24-3_R2.fastq.gz -l 300   -w pandaseq.out/A24-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A25-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A25-1_R2.fastq.gz -l 300   -w pandaseq.out/A25-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A25-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A25-2_R2.fastq.gz -l 300   -w pandaseq.out/A25-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A25-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A25-3_R2.fastq.gz -l 300   -w pandaseq.out/A25-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A27-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A27-1_R2.fastq.gz -l 300   -w pandaseq.out/A27-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A27-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A27-2_R2.fastq.gz -l 300   -w pandaseq.out/A27-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A27-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A27-3_R2.fastq.gz -l 300   -w pandaseq.out/A27-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A28-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A28-1_R2.fastq.gz -l 300   -w pandaseq.out/A28-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A28-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A28-2_R2.fastq.gz -l 300   -w pandaseq.out/A28-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A28-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A28-3_R2.fastq.gz -l 300   -w pandaseq.out/A28-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A3-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A3-1_R2.fastq.gz -l 300   -w pandaseq.out/A3-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A3-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A3-2_R2.fastq.gz -l 300   -w pandaseq.out/A3-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A3-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A3-3_R2.fastq.gz -l 300   -w pandaseq.out/A3-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A4-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A4-1_R2.fastq.gz -l 300   -w pandaseq.out/A4-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A4-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A4-2_R2.fastq.gz -l 300   -w pandaseq.out/A4-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A4-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A4-3_R2.fastq.gz -l 300   -w pandaseq.out/A4-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A5-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/A5-1_R2.fastq.gz -l 300   -w pandaseq.out/A5-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A5-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/A5-2_R2.fastq.gz -l 300   -w pandaseq.out/A5-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/A5-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/A5-3_R2.fastq.gz -l 300   -w pandaseq.out/A5-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H11-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H11-1_R2.fastq.gz -l 300   -w pandaseq.out/H11-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H11-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H11-2_R2.fastq.gz -l 300   -w pandaseq.out/H11-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H11-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H11-3_R2.fastq.gz -l 300   -w pandaseq.out/H11-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H13-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H13-1_R2.fastq.gz -l 300   -w pandaseq.out/H13-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H13-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H13-2_R2.fastq.gz -l 300   -w pandaseq.out/H13-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H13-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H13-3_R2.fastq.gz -l 300   -w pandaseq.out/H13-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H14-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H14-1_R2.fastq.gz -l 300   -w pandaseq.out/H14-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H14-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H14-2_R2.fastq.gz -l 300   -w pandaseq.out/H14-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H14-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H14-3_R2.fastq.gz -l 300   -w pandaseq.out/H14-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H16-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H16-1_R2.fastq.gz -l 300   -w pandaseq.out/H16-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H16-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H16-2_R2.fastq.gz -l 300   -w pandaseq.out/H16-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H16-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H16-3_R2.fastq.gz -l 300   -w pandaseq.out/H16-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H19-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H19-1_R2.fastq.gz -l 300   -w pandaseq.out/H19-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H19-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H19-2_R2.fastq.gz -l 300   -w pandaseq.out/H19-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H19-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H19-3_R2.fastq.gz -l 300   -w pandaseq.out/H19-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H24-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H24-1_R2.fastq.gz -l 300   -w pandaseq.out/H24-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H24-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H24-2_R2.fastq.gz -l 300   -w pandaseq.out/H24-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H24-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H24-3_R2.fastq.gz -l 300   -w pandaseq.out/H24-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H53-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H53-1_R2.fastq.gz -l 300   -w pandaseq.out/H53-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H53-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H53-2_R2.fastq.gz -l 300   -w pandaseq.out/H53-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H53-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H53-3_R2.fastq.gz -l 300   -w pandaseq.out/H53-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H56-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H56-1_R2.fastq.gz -l 300   -w pandaseq.out/H56-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H56-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H56-2_R2.fastq.gz -l 300   -w pandaseq.out/H56-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H56-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H56-3_R2.fastq.gz -l 300   -w pandaseq.out/H56-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H57-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H57-1_R2.fastq.gz -l 300   -w pandaseq.out/H57-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H57-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H57-2_R2.fastq.gz -l 300   -w pandaseq.out/H57-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H57-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H57-3_R2.fastq.gz -l 300   -w pandaseq.out/H57-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H59-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H59-1_R2.fastq.gz -l 300   -w pandaseq.out/H59-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H59-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H59-2_R2.fastq.gz -l 300   -w pandaseq.out/H59-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H59-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H59-3_R2.fastq.gz -l 300   -w pandaseq.out/H59-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H60-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H60-1_R2.fastq.gz -l 300   -w pandaseq.out/H60-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H60-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H60-2_R2.fastq.gz -l 300   -w pandaseq.out/H60-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H60-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H60-3_R2.fastq.gz -l 300   -w pandaseq.out/H60-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H64-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H64-1_R2.fastq.gz -l 300   -w pandaseq.out/H64-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H64-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H64-2_R2.fastq.gz -l 300   -w pandaseq.out/H64-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H64-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H64-3_R2.fastq.gz -l 300   -w pandaseq.out/H64-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H65-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H65-1_R2.fastq.gz -l 300   -w pandaseq.out/H65-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H65-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H65-2_R2.fastq.gz -l 300   -w pandaseq.out/H65-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H65-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H65-3_R2.fastq.gz -l 300   -w pandaseq.out/H65-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H67-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H67-1_R2.fastq.gz -l 300   -w pandaseq.out/H67-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H67-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H67-2_R2.fastq.gz -l 300   -w pandaseq.out/H67-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H67-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H67-3_R2.fastq.gz -l 300   -w pandaseq.out/H67-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H72-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H72-1_R2.fastq.gz -l 300   -w pandaseq.out/H72-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H72-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H72-2_R2.fastq.gz -l 300   -w pandaseq.out/H72-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H72-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H72-3_R2.fastq.gz -l 300   -w pandaseq.out/H72-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H73-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H73-1_R2.fastq.gz -l 300   -w pandaseq.out/H73-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H73-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H73-2_R2.fastq.gz -l 300   -w pandaseq.out/H73-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H73-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H73-3_R2.fastq.gz -l 300   -w pandaseq.out/H73-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H76-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H76-1_R2.fastq.gz -l 300   -w pandaseq.out/H76-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H76-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H76-2_R2.fastq.gz -l 300   -w pandaseq.out/H76-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H76-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H76-3_R2.fastq.gz -l 300   -w pandaseq.out/H76-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H78-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H78-1_R2.fastq.gz -l 300   -w pandaseq.out/H78-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H78-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H78-2_R2.fastq.gz -l 300   -w pandaseq.out/H78-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H78-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H78-3_R2.fastq.gz -l 300   -w pandaseq.out/H78-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H91-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H91-1_R2.fastq.gz -l 300   -w pandaseq.out/H91-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H91-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H91-2_R2.fastq.gz -l 300   -w pandaseq.out/H91-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H91-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H91-3_R2.fastq.gz -l 300   -w pandaseq.out/H91-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H94-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H94-1_R2.fastq.gz -l 300   -w pandaseq.out/H94-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H94-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H94-2_R2.fastq.gz -l 300   -w pandaseq.out/H94-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H94-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H94-3_R2.fastq.gz -l 300   -w pandaseq.out/H94-3_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H99-1_R1.fastq.gz -r cutadapted_16S/filtered_R2/H99-1_R2.fastq.gz -l 300   -w pandaseq.out/H99-1_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H99-2_R1.fastq.gz -r cutadapted_16S/filtered_R2/H99-2_R2.fastq.gz -l 300   -w pandaseq.out/H99-2_merged.fasta >> LOG_pandaseq
pandaseq -f cutadapted_16S/filtered_R1/H99-3_R1.fastq.gz -r cutadapted_16S/filtered_R2/H99-3_R2.fastq.gz -l 300   -w pandaseq.out/H99-3_merged.fasta >> LOG_pandaseq
#END


#(qiime1) jhuang@hamburg:~/DATA/Data_Holger_Epidome_myData2/pandaseq.out$ grep ">" A2-1_merged.fasta | wc -l
#146133


## 5, create two QIIME mapping files
```sh
validate_mapping_file.py -m map3.txt
```

## 6, combine files into a labeled file
```sh
add_qiime_labels.py -i pandaseq.out -m map3_corrected.txt -c FileInput -o combined_fasta
```

## 7, remove chimeric sequences using usearch
```sh
cd combined_fasta
pyfasta split -n 100 combined_seqs.fna
for i in {000..099}; do echo "identify_chimeric_seqs.py -i combined_fasta/combined_seqs.fna.${i} -m usearch61 -o usearch_checked_combined.${i}/ -r ~/REFs/gg_97_otus_4feb2011_fw_rc.fasta --threads=14;" >> uchime_commands.sh; done
mv uchime_commands.sh ..
./uchime_commands.sh

cat usearch_checked_combined.000/chimeras.txt usearch_checked_combined.001/chimeras.txt usearch_checked_combined.002/chimeras.txt usearch_checked_combined.003/chimeras.txt usearch_checked_combined.004/chimeras.txt usearch_checked_combined.005/chimeras.txt usearch_checked_combined.006/chimeras.txt usearch_checked_combined.007/chimeras.txt usearch_checked_combined.008/chimeras.txt usearch_checked_combined.009/chimeras.txt usearch_checked_combined.010/chimeras.txt usearch_checked_combined.011/chimeras.txt usearch_checked_combined.012/chimeras.txt usearch_checked_combined.013/chimeras.txt usearch_checked_combined.014/chimeras.txt usearch_checked_combined.015/chimeras.txt usearch_checked_combined.016/chimeras.txt usearch_checked_combined.017/chimeras.txt usearch_checked_combined.018/chimeras.txt usearch_checked_combined.019/chimeras.txt usearch_checked_combined.020/chimeras.txt usearch_checked_combined.021/chimeras.txt usearch_checked_combined.022/chimeras.txt usearch_checked_combined.023/chimeras.txt usearch_checked_combined.024/chimeras.txt usearch_checked_combined.025/chimeras.txt usearch_checked_combined.026/chimeras.txt usearch_checked_combined.027/chimeras.txt usearch_checked_combined.028/chimeras.txt usearch_checked_combined.029/chimeras.txt usearch_checked_combined.030/chimeras.txt usearch_checked_combined.031/chimeras.txt usearch_checked_combined.032/chimeras.txt usearch_checked_combined.033/chimeras.txt usearch_checked_combined.034/chimeras.txt usearch_checked_combined.035/chimeras.txt usearch_checked_combined.036/chimeras.txt usearch_checked_combined.037/chimeras.txt usearch_checked_combined.038/chimeras.txt usearch_checked_combined.039/chimeras.txt usearch_checked_combined.040/chimeras.txt usearch_checked_combined.041/chimeras.txt usearch_checked_combined.042/chimeras.txt usearch_checked_combined.043/chimeras.txt usearch_checked_combined.044/chimeras.txt usearch_checked_combined.045/chimeras.txt usearch_checked_combined.046/chimeras.txt usearch_checked_combined.047/chimeras.txt usearch_checked_combined.048/chimeras.txt usearch_checked_combined.049/chimeras.txt usearch_checked_combined.050/chimeras.txt usearch_checked_combined.051/chimeras.txt usearch_checked_combined.052/chimeras.txt usearch_checked_combined.053/chimeras.txt usearch_checked_combined.054/chimeras.txt usearch_checked_combined.055/chimeras.txt usearch_checked_combined.056/chimeras.txt usearch_checked_combined.057/chimeras.txt usearch_checked_combined.058/chimeras.txt usearch_checked_combined.059/chimeras.txt usearch_checked_combined.060/chimeras.txt usearch_checked_combined.061/chimeras.txt usearch_checked_combined.062/chimeras.txt usearch_checked_combined.063/chimeras.txt usearch_checked_combined.064/chimeras.txt usearch_checked_combined.065/chimeras.txt usearch_checked_combined.066/chimeras.txt usearch_checked_combined.067/chimeras.txt usearch_checked_combined.068/chimeras.txt usearch_checked_combined.069/chimeras.txt usearch_checked_combined.070/chimeras.txt usearch_checked_combined.071/chimeras.txt usearch_checked_combined.072/chimeras.txt usearch_checked_combined.073/chimeras.txt usearch_checked_combined.074/chimeras.txt usearch_checked_combined.075/chimeras.txt usearch_checked_combined.076/chimeras.txt usearch_checked_combined.077/chimeras.txt usearch_checked_combined.078/chimeras.txt usearch_checked_combined.079/chimeras.txt usearch_checked_combined.080/chimeras.txt usearch_checked_combined.081/chimeras.txt usearch_checked_combined.082/chimeras.txt usearch_checked_combined.083/chimeras.txt usearch_checked_combined.084/chimeras.txt usearch_checked_combined.085/chimeras.txt usearch_checked_combined.086/chimeras.txt usearch_checked_combined.087/chimeras.txt usearch_checked_combined.088/chimeras.txt usearch_checked_combined.089/chimeras.txt usearch_checked_combined.090/chimeras.txt usearch_checked_combined.091/chimeras.txt usearch_checked_combined.092/chimeras.txt usearch_checked_combined.093/chimeras.txt usearch_checked_combined.094/chimeras.txt usearch_checked_combined.095/chimeras.txt usearch_checked_combined.096/chimeras.txt usearch_checked_combined.097/chimeras.txt usearch_checked_combined.098/chimeras.txt usearch_checked_combined.099/chimeras.txt > chimeras.txt
filter_fasta.py -f combined_fasta/combined_seqs.fna -o combined_fasta/combined_nonchimera_seqs.fna -s chimeras.txt -n;
rm -rf usearch_checked_combined.0*


#grep ">A2.1_" combined_nonchimera_seqs.fna | wc -l
#...


## 8, create OTU picking parameter file, and run the QIIME open reference picking pipeline
```sh
echo "pick_otus:similarity 0.97" > clustering_params.txt
echo "assign_taxonomy:similarity 0.97" >> clustering_params.txt
echo "parallel_align_seqs_pynast:template_fp /home/jhuang/REFs/SILVA_132_QIIME_release/core_alignment/80_core_alignment.fna" >> clustering_params.txt
echo "assign_taxonomy:reference_seqs_fp /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna" >> clustering_params.txt
echo "assign_taxonomy:id_to_taxonomy_fp /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt" >> clustering_params.txt
echo "alpha_diversity:metrics chao1,observed_otus,shannon,PD_whole_tree" >> clustering_params.txt
#with usearch61 for reference picking and usearch61_ref for de novo OTU picking
pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i combined_fasta/combined_nonchimera_seqs.fna -o clustering/ -p clustering_params.txt --parallel
```


## 9.1(optional), for control data
```sh
summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
mv usearch_checked_combined usearch_checked_combined_ctrl
mv combined_fasta combined_fasta_ctrl
mv clustering clustering_ctrl
mv plots plots_ctrl
```


## 9.2, for other data: core diversity analyses
```sh
core_diversity_analyses.py -o./core_diversity_e9893 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map3_corrected.txt -t./clustering/rep_set.tre -e9893 -p./clustering_params.txt
core_diversity_analyses.py -o./core_diversity_e18505 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map3_corrected.txt -t./clustering/rep_set.tre -e18505 -p./clustering_params.txt
core_diversity_analyses.py -o./core_diversity_e24132 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map3_corrected.txt -t./clustering/rep_set.tre -e24132 -p./clustering_params.txt
core_diversity_analyses.py -o./core_diversity_e31062 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map3_corrected.txt -t./clustering/rep_set.tre -e31062 -p./clustering_params.txt
```

#run, prepare the files in the header of Phyloseq.Rmd!
./Data_PaulBongarts_S.epidermidis_HDRNA/Data_Paul_and_Marc_Epidome_batch3/core_diversity_e33778/Phyloseq.Rmd

#geoBURST
https://pubmlst.org/bigsdb?db=pubmlst_sepidermidis_seqdef
	clonal complex
ST	arcC	aroE	gtr	mutS	pyrR	tpiA	yqiL

100,2,1,1,2,20,1,1
130,1,1,1,2,1,1,1
14,1,1,2,1,1,1,1
170,25,19,17,4,23,10,2
184,16,1,2,1,2,1,1
19,8,7,12,4,12,2,2
2,7,1,2,2,4,1,1
20,1,1,2,2,1,1,3
200,1,1,2,2,3,1,3
210,1,1,1,2,2,1,25
215,1,6,2,1,1,16,1
218,1,1,2,6,2,16,1
225,1,13,7,2,2,1,29
23,7,1,2,1,3,3,1
278,1,1,1,1,1,1,14
293,1,1,2,6,2,1,10
297,1,2,2,2,2,1,3
329,12,29,9,8,6,5,8
331,12,35,5,5,10,16,21
355,2,1,6,6,2,1,7
384,1,1,22,2,2,1,25
5,1,1,1,2,2,1,1
59,2,1,1,1,2,1,1
60,1,1,2,6,2,1,16
626,2,1,1,1,19,1,7
673,8,24,12,23,49,46,58
691,28,41,5,5,38,34,4
73,1,5,2,6,2,1,6
8,2,1,7,1,1,1,1
83,1,2,1,2,1,1,10
87,7,1,1,2,2,1,1
88,1,1,2,1,2,1,7

#TODO: based on the 16S+g216+yycH?
#Phylogenie von Sequenztypen aus A-&H-Proben


# --------------------------------------------- TRASH ---------------------------------------------------

TODO: clean the following text and write a post to xgenes!

aneurysm [ˊænjurizm]
hypophyis

Vorgabe:

Epidome Auswertung: A&H-Proben

- Bar plots (einzelner Proben) A&H getrennt!
- alpha-diversity
      * nach Zeitpunkt der Proben - Gemischung vergleichen (alle, .1, .2, .3 Collektive vergleichen, A&H getrennt)
      * Zeitpunkt .2 & .3 zwischen A&H-Probensammlung vergleichen
- ggf. dann Fokus auf zeinzelne MLSTs (2,5) signifikante Zu-/Abnahme während KH-Aufenthalt?
- für v3v4 (16S) alpha-diversity fua Klassenebene

*- Phylogenie von Sequenztypen aus A-&H-Proben

- geoBURST DLV-Analysis (klonale Cluster)


TODO_IMPORTANT: say using own statistical methods to performing the group comparisons, not Anna's methods!


Dear Anna and Holger,

I’ve finally wrapped up the updated DYNAMIP analysis and attached the new HTML report.

Here’s a quick summary of how I addressed Anna’s suggestions in the new script:

    * Alpha diversity (Section 8): Updated the longitudinal timepoint analysis (8.1 & 8.2) to include both paired t-tests and Paired Wilcoxon signed-rank tests. I also added a dedicated comparison for Group A vs. H only at the surgery timepoint (8.4).
    * Beta diversity (Section 9 - New!):* Refined this section and found some interesting shifts. Both cohorts showed significant community changes between surgery and discharge (A2 vs. A3: P~adj~ = 0.007; H2 vs. H3: P~adj~ = 0.002). We also see distinct baseline profiles between the A and H cohorts across all corresponding timepoints.
    * Differential abundance (Section 10): Ran the analysis to investigate which bacterial classes change across the admission/surgery/discharge timepoints.
    * ST Binary Prevalence (Section 13): Great suggestion on this one! I converted the data for ST2, ST5, and ST23 into a binary presence/absence matrix and ran Cochran’s Q + McNemar tests.

Best,
Jiabin

write a reply-mail based on the new attached scripts, say the point "-Points 5 and 6 in html doc: would it be possible to do a differential abundance or composititional data analysis to investigate which classes change in admission vs. surgery, surgery vs. discharge and admission vs. discharge?" were adressed in the point "10 Differential abundance analysis (16S)". The "-7.1: Could you compare group A and group H only at timepoint surgery? I think a possible difference gets diluted since admission and discharge show no difference." and "-7.2: When comparing alpha-diversity at the different timepoints, I think a paired t-test could be more appropriate (and also be more powerful)." is adressed with the updated "8 Alpha diversity (16S)" in the attache html-files.  The point "11: As we see no significant changes in S. epidermidis STs, I was wondering if it could be worth exploring converting the abundance data into binary presence/absence matrix for certain STs that have previously been associated with healthcare (ST2, ST5 and ST23) and analysing prevalence proportions over the three timepoints, possibly with an analysis based on Mc Nemar's?" was adressed by the point "13 Binary Prevalence Analysis of Target S. epidermidis STs (g216 & yycH)". Expect for the above-mentionedn "8 Alpha diversity (16S)", "10 Differential abundance analysis (16S)", the section "9 Beta diversity (16S)" is also new added. Notably, significant temporal shifts in community structure were observed between the surgery and discharge timepoints in both cohorts: **aneurysm patients (A2 vs. A3: R² = 0.068, *P*~adj~ = 0.007)** and **hypophysis patients (H2 vs. H3: R² = 0.055, *P*~adj~ = 0.002)**, indicating measurable changes in the nasal microbiome during the post-surgical recovery period. Additionally, significant differences were detected between the two patient cohorts at corresponding timepoints (**A1 vs. H1: *P*~adj~ = 0.007; A2 vs. H2: *P*~adj~ = 0.001; A3 vs. H3: *P*~adj~ = 0.049**), suggesting distinct baseline microbiome profiles between aneurysm and hypophysis surgery patients.

Hi Holger and Anna,

Sorry for the delay. I will put more focus on the DYNAMIP project this week. I hope to get the updated analysis done and send over the results by next week.

Best,
Jiabin


On 6/9/26 11:32, Holger Rohde wrote:
>
> Hi Jiabin,
>
>
>
> I was wondering if you had time to analyze the data according to Anna´s email? Many thanks!
>
>
>
> Holger
>
>
>
>
>
>
>
> Prof. Dr. Holger Rohde
>
> Oberarzt
>
> ESCMID Fellow
>
> ESCMID AMR Action Subcommittee (Director)
>
>
>
> Institut für Medizinische Mikrobiologie, Virologie und Hygiene
>
> Universitätsklinikum Hamburg-Eppendorf
>
> Martinistrasse 52
>
> D-20246 Hamburg
>
>
>
> Wie Sie uns finden:
>
> Gebäude N25
>
>
>
>
>
> Von: Both, Anna <a.both@uke.de>
> Gesendet: Donnerstag, 9. April 2026 18:12
> An: Huang, Jiabin <j.huang@uke.de>
> Cc: Rohde, Holger <rohde@uke.de>
> Betreff: DYNAMIP
>
>
>
> Dear Jiabin,
>
>
>
> Holger has sent me the analysis and script from the DYNAMIP study to see if I had any additional ideas. It looks really nice, I also really like that we can see the code to learn, as well.
>
> I have a couple of questionsor suggestions, I am curious what you think about them.
>
>
>
> Regarding the 16S-analysis:
>
> -Points 5 and 6 in html doc: would it be possible to do a differential abundance or composititional data analysis to investigate which classes change in admission vs. surgery, surgery vs. discharge and admission vs. discharge?
>
>
>
> -7.1: Could you compare group A and group H only at timepoint surgery? I think a possible difference gets diluted since admission and discharge show no difference.
>
>
>
> -7.2: When comparing alpha-diversity at the different timepoints, I think a paired t-test could be more appropriate (and also be more powerful).
>
>
>
> -11: As we see no significant changes in S. epidermidis STs, I was wondering if it could be worth exploring converting the abundance data into binary presence/absence matrix for certain STs that have previously been associated with healthcare (ST2, ST5 and ST23) and analysing prevalence proportions over the three timepoints, possibly with an analysis based on Mc Nemar's?
>
>
>
> I'm curious to know what you think about this.
>
>
>
> Talk to you soon,
>
> Anna


Based on this information, here's an improved description for your Methods section that incorporates the sample cohort details:

---

## Sample Collection and Study Design

Nasal swab samples were collected from two patient cohorts: patients undergoing aneurysm surgery (cohort A, n = 15) and patients undergoing hypophysis surgery (cohort H, n = 20). For each patient, samples were collected at three timepoints: admission (A1/H1), surgery (A2/H2), and discharge (A3/H3), resulting in a longitudinal study design to assess temporal changes in the nasal microbiome across surgical interventions.

---

## Updated Beta Diversity Text

Group differences in microbial community composition (beta-diversity) were assessed using Bray-Curtis dissimilarity matrices. Statistical significance was evaluated using permutational multivariate analysis of variance (PERMANOVA; `adonis2` function in the vegan R package v2.7.2) with 999 permutations. Pairwise comparisons were conducted between all group combinations, and *P*-values were adjusted for multiple testing using the Benjamini-Hochberg false discovery rate (FDR) procedure. Principal coordinates analysis (PCoA) was performed on the Bray-Curtis distance matrix to visualize sample clustering patterns. The first two principal coordinates explained 10.84% (PCo1) and 9.41% (PCo2) of the total variation in community composition. Notably, significant temporal shifts in community structure were observed between the surgery and discharge timepoints in both cohorts: **hypophysis patients (H2 vs. H3: R² = 0.055, *P*~adj~ = 0.00175)** and **aneurysm patients (A2 vs. A3: R² = 0.068, *P*~adj~ = 0.0072)**, indicating measurable changes in the nasal microbiome during the post-surgical recovery period. Additionally, significant differences were detected between the two patient cohorts at corresponding timepoints (e.g., A1 vs. H1: *P*~adj~ = 0.0066; A2 vs. H2: *P*~adj~ = 0.0009), suggesting distinct baseline microbiome profiles between aneurysm and hypophysis surgery patients.

---

## Updated Differential Abundance Text

#DELETE: , as well as cross-cohort comparisons between corresponding timepoints

Differential abundance analysis was performed to identify operational taxonomic units (OTUs) with significant abundance alterations across three clinical stages (admission, surgery, and discharge). Six independent pairwise comparisons were conducted within each patient cohort (aneurysm and hypophysis). Statistical testing was performed using the DESeq2 R package (v1.46.0) based on a negative binomial generalized linear model (Wald test). *P*-values were adjusted for multiple testing using the Benjamini-Hochberg false discovery rate (FDR) procedure, and OTUs with an adjusted *P*-value < 0.05 were considered statistically significant.


# Project: Epidome Luise Brakert: current status – additional data – analysis strategy (see http://xgenes.com/course/ro/)
#   - Dynamics in colonizing S. epidermidis populations - a prospective cohort study in neurosurgical patients.
Figure 2: Prospective analysis of nasal S. epidermidis populations during hospitalisation.
(A) Schematic overview of the study protocol. Patients (n=12) admitted to the UKE for elective neurosurgery for vascular aneurysm were included.
Patients had no other underlying medical conditions and had not received antibiotics for at least six months prior to admission.
Nasal swabs were taken on admission (swab #1), immediately after surgery (swab #2), and immediately before discharge (swab #3).
DNA was extracted from the swab material and subjected to epidome analysis (Rendboe et al., 2020, https://doi.org/10.1186/s12866-020-02041-w).
(B) Relative distribution of STs in nasal swabs according to epidome analysis.
Reads assigned to specific STs were used to assess the relative abundance of STs in relation to absolute read counts.

Sequence types vs ST2,ST5...

(B) Relative distribution of STs in nasal swabs according to epidome analysis. Reads assigned to specific STs were used to assess the relative abundance of STs in relation to absolute read counts.

#H-Gruppe direkt uses the Antibiotics in the Nose-position. A-Gruppe uses the antibiotics in the Venen --> H-Gruppe weniger Variable than A-Gruppe.
#TODO: 1, für A&H-Gruppe separat?
#      2, Signifikante Zu- oder Abnahme einzelner STs oder ST-Gruppen von admission --> discharge?
# Darf ich ein solche Statistik machen? Nein, use ChatG to confer a reply!


~/DATA_B/Data_Luise_Epidome_longitudinal_nose/core_diversity_e9893


Hi Jiabin,

as promised, here's a summary of the next steps for the epidome project!

# 20 --> 21 patients!

- The A and H sample collections are nose swabs from two patient cohorts, A from aneurysm patients (15 patients) and H from hypophysis surgery patients (20 patients).
- Yes that's correct, in cases with more than three time points you can ignore all the data from the 4th time point onwards!
- The fist number is the patient number, and the second number the time of swab acquistion: .1 at admission, .2 at surgery, .3 at discharge.
Within both group A and group H, alpha diversity of MLSTs (?=ST) and 16s should be compared between time .1, .2 and .3.
- Also, each point in time should be compared between the A and H group (A*.1 vs. H*.1, A*.2 vs. H*.2 and A*.3 vs H*.3).
- Lastly, the three time points should be compared in both groups combined (A+H*.1 vs. A+H*.2 vs. A+H*.3).

- In addition, a goeBURST DLV-analysis (of all MLSTs in A and H samples) should be done to identify clonal clusters of S. epidermidis strains.

- Most of the A samples had already been sequenced in 2022, the remaining A samples were sequenced together with all the H samples in October, before my vacation.
- Here's the link to the H samples and new A samples: https://public.leibniz-liv.de/sharing/3cyImAWZ7
- The sample list with some coded information on the patients is attached.


#The samples Aachen 1-22 don't belong to the A and H sample collection, so you can ignore them as well :)
#I'm not sure which sample collection was called "noses from patients HDRNA 1-20"... as far as I know, they don't correspond with the Aachen 1-22 samples.

# The beta-diversity please see
Diversity analyses, ordination, and statistics

Alpha diversity (Observed features/OTU richness, Shannon diversity, and Faith's phylogenetic diversity) was calculated and compared between groups using nonparametric rank-based tests (Wilcoxon rank-sum for two-group comparisons; Kruskal–Wallis for ≥3 groups with post-hoc pairwise Wilcoxon tests).

Group differences in community composition (beta-diversity) were assessed using Bray-Curtis dissimilarities (computed after Hellinger transformation of abundance profiles) and tested using PERMANOVA (adonis2 in vegan [v2.7.2]) with permutation-based significance. Ordination was performed by principal coordinates analysis (PCoA) on the Bray-Curtis distance matrix to visualize sample-to-sample relationships.

Differential abundance testing was performed using DESeq2 [v1.46.0] (negative binomial GLM). Across analyses involving multiple hypothesis tests, p-values were adjusted using the Benjamini-Hochberg false discovery rate procedure.

# Differential abundance analysis

Group differences in microbial community composition (beta-diversity) were assessed using Bray-Curtis dissimilarity matrices. Statistical significance was evaluated using permutational multivariate analysis of variance (PERMANOVA; adonis2 function in the vegan R package v2.7.2) with 999 permutations. Pairwise PERMANOVA comparisons were conducted between all group combinations, and P-values were adjusted for multiple testing using the Benjamini-Hochberg false discovery rate (FDR) procedure. Principal coordinates analysis (PCoA) was performed on the Bray-Curtis distance matrix to visualize sample clustering patterns in ordination space. Ordination space represents a reduced-dimensional projection of the complex, multi-dimensional relationships between samples, where distances between points reflect their dissimilarity in microbial community composition. The first two principal coordinates explained 10.84% (PCo1) and 9.41% (PCo2) of the total variation in community composition, totaling approximately 20%. While this relatively low percentage indicates that microbial community variation is distributed across many dimensions and the two-dimensional plot captures only a portion of the total variation, statistically significant differences were still detected. Notably, significant temporal shifts in community composition were observed between surgery and discharge timepoints in both anatomical sites: *hypophysis samples (H2_Surgery vs. H3_Discharge: R² = 0.055, P~adj~ = 0.00175)* and **aneurysm samples (A2_Surgery vs. A3_Discharge: R² = 0.068, P~adj~ = 0.0072)**, suggesting that surgical intervention is followed by measurable changes in microbial community structure during the recovery period.

Delete: " in ordination space. Ordination space represents a reduced-dimensional projection of the complex, multi-dimensional relationships between samples, where distances between points reflect their dissimilarity in microbial community composition"
"surgical intervention is followed by"

Key Improvements Made:

    1. Fixed the Grammatical Error at the End:
    Your original draft ended with "suggesting that measurable changes in microbial community structure during the recovery period," which is missing a verb. I changed it to "indicating measurable changes in the microbiome during the post-surgical recovery period."
    2. Removed the "Defensive" Sentence:
    I deleted the sentence: "While this relatively low percentage indicates that microbial community variation is distributed across many dimensions..."
    Why? In microbiome research, the first two PCoA axes explaining ~20% of the variance is actually completely normal and standard because microbiome data is highly sparse and high-dimensional. By calling it "relatively low," you might inadvertently draw a reviewer's attention to it as a weakness. It is much stronger to simply state the variance explained and confidently move on to your significant biological findings.
    3. Standardized Statistical Formatting:
    Changed P-values to P-values and P~adj~ to P~adj~, which is the standard typographical requirement for most scientific journals.
    4. Improved Flow and Conciseness:
    Changed "Pairwise PERMANOVA comparisons" to "Pairwise comparisons" (since PERMANOVA was already established in the previous sentence) to avoid repetition. Changed "microbial community structure" to "microbiome" in the final clause to avoid repeating the exact same phrase twice in one sentence.


Microbiome analysis

Sample collection and storage
Fecal samples were collected from mice between 9–10 am at the indicated timepoints and immediately frozen at −80 °C until use.

DNA extraction
Genomic DNA was extracted from fecal pellets using the QIAamp Fast DNA Stool Mini Kit (Qiagen, #51604) following a protocol optimized for murine stool. Briefly, pellets were mechanically homogenized in a Precellys® 24 homogenizer (Bertin Technologies) using Soil grinding tubes (Bertin Technologies, #SK38), and DNA was extracted according to the manufacturer's instructions. DNA yield was measured with NanoDrop, and the concentration was adjusted to 10 ng/µl.
16S rRNA gene amplicon library preparation and sequencing
16S rRNA amplicons (V3–V4 region) were generated using degenerate primers that contain Illumina adapter overhang sequences: forward (5′-TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG-3′) and reverse (5′-GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAATCC-3′), following established Illumina 16S library preparation guidance. Samples were multiplexed using the Nextera XT Index Kit (Illumina, #FC-131-1001) and sequenced by 500-cycle paired-end sequencing (2×250 bp) on an Illumina MiSeq (Illumina, #MS-102-2003).

Bioinformatics processing and QC
Raw FASTQ files were inspected with FastQC [v0.11.9]. Sequence processing was performed in QIIME 2 [v2023.9.1]. Demultiplexed paired-end reads were imported using a Phred33 manifest, and per-base quality profiles were reviewed to guide trimming/truncation. Amplicon Sequence Variants (ASVs) were inferred using the QIIME 2 DADA2 workflow [v1.26.0]. Primer/adapter-proximal bases were removed using fixed 5′ trimming (17 bp forward; 21 bp reverse), reads were filtered using expected-error thresholds, and multiple forward/reverse truncation length combinations were evaluated to optimize read retention and paired-end merging for the ~460 bp V3–V4 amplicon. Final denoising parameters were selected based on denoising statistics (successful merging and proportion of non-chimeric reads across samples), and the resulting feature table and representative ASV sequences were used for downstream analyses.
Taxonomy was assigned by VSEARCH-based [v2.22.1] consensus classification against SILVA (release 132) at 97% identity. Phylogenetic trees were constructed using MAFFT [v7.520] for multiple sequence alignment and FastTree [v2.1.11] for tree inference within the QIIME 2 pipeline. Downstream visualization and statistics were performed in R [v4.4.3] using phyloseq [v1.50.0].


#Quantification of short chain fatty acids by means of ultra-high-performance-liquid chromatography-mass spectrometry (UHPLC-MS/MS)
#SCFAs were quantified as previously described. Serum samples (25 µL) were homogenized in methanol (125 µL) for 20 min at 10 °C using a Thermomix (Eppendorf, Hamburg, Germany). After centrifugation, the clear supernatant was subjected to derivatization.


Alpha diversity (Shannon):
I ran Wilcoxon rank-sum tests for all pairwise comparisons with BH correction (see Shannon_Group1_2_5_6_pairwise_wilcox_BH.xlsx). None of the pairwise contrasts reach p-adj < 0.05, but three comparisons show a clear trend (p-adj = 0.056):

    Group1 vs Group2 (n=4 vs n=6): raw p=0.025, p-adj=0.056

    Group2 vs Group5 (n=6 vs n=3): raw p=0.028, p-adj=0.056

    Group5 vs Group6 (n=3 vs n=8): raw p=0.019, p-adj=0.056

Although these results do not pass the adjusted 0.05 threshold, the differences may still be biologically meaningful. Increasing the sample size would improve statistical power and help confirm whether these groups truly differ.

Beta diversity (Bray–Curtis / PERMANOVA):
See Bray_pairwise_PERMANOVA.xlsx.

PCoA plots:

    PCoA3_Group1_2_5_6.png / .svg

    PCoA3_Group1_vs_Group5.png / .svg


你好，Jiabin，
按照约定，这里是 Epidome 项目后续步骤的总结！
20 --> 21 名患者！

    A 和 H 样本集是来自两个患者队列的鼻拭子，A 组来自动脉瘤患者（15名患者），H 组来自垂体手术患者（20名患者）。
    没错，如果有超过三个时间点的情况，你可以忽略从第4个时间点开始的所有数据！

    第一个数字是患者编号，第二个数字是拭子采集的时间点：.1 代表入院时，.2 代表手术时，.3 代表出院时。
    在 A 组和 H 组内部，应比较时间点 .1、.2 和 .3 之间 MLSTs（?=ST）和 16S 的 α 多样性（alpha diversity）。
    此外，每个时间点都应在 A 组和 H 组之间进行比较（A*.1 对比 H*.1，A*.2 对比 H*.2，以及 A*.3 对比 H*.3）。

    最后，应将两组数据合并后比较这三个时间点（A+H*.1 对比 A+H*.2 对比 A+H*.3）。

    此外，还需要对 A 和 H 样本中的所有 MLST 进行 goeBURST DLV 分析，以鉴定表皮葡萄球菌（S. epidermidis）菌株的克隆簇（clonal clusters）。
    大部分 A 样本在 2022 年就已经完成了测序，剩余的 A 样本与所有 H 样本一起在我休假前的 10 月份进行了测序。
    这是 H 样本和新 A 样本的链接：https://public.leibniz-liv.de/sharing/3cyImAWZ7
    附件是样本列表，其中包含一些关于患者的编码信息。

亚琛（Aachen）1-22 号样本不属于 A 和 H 样本集，所以你也可以忽略它们 :)
我不确定哪个样本集被称为“HDRNA 1-20 号患者的鼻子”……据我所知，它们与亚琛 1-22 号样本并不对应。

#15
"A1.1","A1.2","A1.3",
"A2.1","A2.2","A2.3",
"A3.1","A3.2","A3.3",
"A4.1","A4.2","A4.3",
"A5.1","A5.2","A5.3",
"A10.1","A10.2","A10.3",
"A12.1","A12.2","A12.3",
"A17.1","A17.2","A17.3",
"A20.1","A20.2","A20.3",
"A21.1","A21.2","A21.3",

"A22.1","A22.2","A22.3",
"A24.1","A24.2","A24.3",
"A25.1","A25.2","A25.3",
"A27.1","A27.2","A27.3",
"A28.1","A28.2","A28.3",

#21
"H11.1","H11.2","H11.3",
"H13.1","H13.2","H13.3",
"H14.1","H14.2","H14.3",
"H16.1","H16.2","H16.3",
"H19.1","H19.2","H19.3",

"H24.1","H24.2","H24.3",
"H53.1","H53.2","H53.3",
"H56.1","H56.2","H56.3",
"H57.1","H57.2","H57.3",
"H59.1","H59.2","H59.3",

"H60.1","H60.2","H60.3",
"H64.1","H64.2","H64.3",
"H65.1","H65.2","H65.3",
"H67.1","H67.2","H67.3",
"H72.1","H72.2","H72.3",

"H73.1","H73.2","H73.3",
"H76.1","H76.2","H76.3",
"H78.1","H78.2","H78.3",
"H91.1","H91.2","H91.3",
"H94.1","H94.2","H94.3",
"H99.1","H99.2","H99.3"


sigtab = cbind(as(sigtab, "data.frame"), as(phyloseq::tax_table(ps_deseq_sel2)[rownames(sigtab), ], "matrix"))
# file base name
fname <- "DEGs_admission_vs_surgery_in_aneurysm"
write.xlsx(sigtab, file = paste0(fname, ".xlsx"), rowNames = TRUE)

kable(sigtab) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab$log2FoldChange, sigtab$Order, function(x) max(x))
x = sort(x)
sigtab$Order = factor(as.character(sigtab$Order), levels=names(x))
x = tapply(sigtab$log2FoldChange, sigtab$Family, function(x) max(x))
x = sort(x)
sigtab$Family = factor(as.character(sigtab$Family), levels=names(x))

#ggplot(sigtab, aes(x=log2FoldChange, y=Family, color=Order)) + geom_point(aes(size=padj)) + scale_size_continuous(name="padj",range=c(8,4))+
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust=0.5))
# build the plot
p <- ggplot(sigtab, aes(x = log2FoldChange, y = Family, color = Class)) +
  geom_point(aes(size = padj)) +
  scale_size_continuous(name = "padj", range = c(8, 4)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust = 0.5))
# SVG (svglite gives crisp text)
if (!requireNamespace("svglite", quietly = TRUE)) install.packages("svglite")
ggplot2::ggsave(paste0(fname, ".svg"), plot = p, device = svglite::svglite,
                width = 8, height = 6, units = "in", dpi = 300)
# PNG
ggplot2::ggsave(paste0(fname, ".png"), plot = p, device = "png",
                width = 8, height = 6, units = "in", dpi = 300)


# --

Hi Holger and Anna,

Sorry for the delay. I will put more focus on the DYNAMIP project this week. I hope to get the updated analysis done and send over the results by next week.

Best,
Jiabin


write a mail to Anna, say the
你好，Jiabin，
Holger 把 DYNAMIP 研究的分析和脚本发给了我，想看看我有没有什么额外的想法。看起来非常棒，我也很高兴能看到代码，这对学习也很有帮助。
我有一些问题和建议，很好奇你的想法。
关于 16S 分析：

    1. html 文档中的第 5 和第 6 点：是否可以进行差异丰度分析或成分数据分析（compositional data analysis），以探究在入院与手术、手术与出院、以及入院与出院的对比中，哪些纲（classes）的丰度发生了变化？

    2.：能否只比较 A 组和 H 组在手术时间点的差异？我觉得潜在的差异可能被稀释了，因为入院和出院时并没有显示出差异。

    3：在比较不同时间点的 α 多样性时，我认为配对 t 检验可能更合适（而且统计功效也更高）。

    4：由于我们没有观察到表皮葡萄球菌 ST（序列型）有显著变化，我在想是否值得尝试将丰度数据转换为二元（存在/缺失）矩阵，专门针对那些此前被认为与医疗相关的特定 ST（如 ST2、ST5 和 ST23），并分析这三个时间点的检出率比例，或许可以基于 McNemar 检验进行分析？

我很想知道你对这些想法的看法。
期待很快和你交流，
Anna


```{r, echo=FALSE, warning=FALSE}
# --- DESCRIPTIVE TEXT FOR THE HTML REPORT ---
cat("### Statistical Analysis of Healthcare-Associated ST Prevalence\n\n")
cat("To evaluate temporal changes in the colonization of specific healthcare-associated Sequence Types (STs), we analyzed the binary presence/absence data across three clinical timepoints (admission, surgery, discharge). \n\n")
cat("- **Overall Assessment**: Cochran's Q test was used to determine if there was a significant overall change in prevalence across the three timepoints.\n")
cat("- **Pairwise Comparisons**: For specific timepoint transitions, paired McNemar tests were conducted. P-values were adjusted for multiple comparisons using the Benjamini-Hochberg (BH) method to control the False Discovery Rate (FDR).\n")
cat("- **Note on Results**: An adjusted p-value of `1.0` or `ns` (not significant) indicates that the ST showed no variation (e.g., it was either always absent or always present) between the two compared timepoints, meaning no statistical change occurred.\n\n")
```


for (st in available_sts) {
  cat(sprintf("**%s**: ", st))
  res <- results_list[[st]]$Cochran_Q
  if (!is.na(res$p.value)) {
    cat(sprintf("Q = %.3f, df = %d, p = %.4f\n", res$statistic, res$parameter, res$p.value))
  } else {
    cat("Could not be computed (insufficient variation or missing data).\n")
  }
}

# ==============================================================================
# 3. Statistical Analysis: Cochran's Q test + Paired McNemar test (with BH adjustment)
# ==============================================================================
results_list <- list()

for (st in available_sts) {
  st_data <- df_long %>% filter(ST == st)

  # 3.1 Overall test: Cochran's Q test (assesses differences across all 3 timepoints)
  cochran_res <- tryCatch({
    RVAideMemoire::cochran.qtest(Presence ~ Timepoint | PatientID, data = st_data)
  }, error = function(e) {
    return(data.frame(p.value = NA, note = "Incomplete data or zero variance, cannot compute"))
  })

  # 3.2 Pairwise comparisons: Paired McNemar test + BH adjustment
  pairwise_res <- tryCatch({
    st_data %>%
      rstatix::pairwise_mcnemar_test(Presence ~ Timepoint, subject = PatientID, p.adjust.method = "BH") %>%
      mutate(ST = st) # Add ST column for merging results
  }, error = function(e) {
    return(data.frame(ST = st, group1 = NA, group2 = NA, p = NA, p.adj = NA, p.signif = NA))
  })

  results_list[[st]] <- list(
    Cochran_Q = cochran_res,
    Pairwise_McNemar = pairwise_res
  )
}

# ==============================================================================
# 4. Results Presentation
# ==============================================================================
# Combine pairwise comparison results for all STs
final_pairwise_results <- bind_rows(lapply(results_list, function(x) x$Pairwise_McNemar))

# Print a formatted statistical results table
knitr::kable(final_pairwise_results, digits = 4,
             col.names = c("ST", "Timepoint 1", "Timepoint 2", "Raw p-value", "Adjusted p-value (BH)", "Significance")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

# Print the overall Cochran's Q test results for each ST
for (st in available_sts) {
  cat(sprintf("\n--- Overall Test: %s (Cochran's Q Test) ---\n", st))
  print(results_list[[st]]$Cochran_Q)
}

#----


library(rstatix)
library(dplyr)
library(knitr)
library(kableExtra)

# 2. Post-hoc pairwise comparisons using paired t-tests
pw_t_paired <- div.df2 %>%
  pairwise_t_test(Shannon ~ Group, paired = TRUE, p.adjust.method = "BH") %>%
  # Overwrite the p.adj.signif column with your custom rules
  mutate(
    p.adj.signif = case_when(
      p.adj < 0.001 ~ "***",
      p.adj < 0.01  ~ "**",
      p.adj < 0.05  ~ "*",
      TRUE          ~ "ns"
    )
  )

knitr::kable(pw_t_paired, digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))


# 3. Robustness check: Paired Wilcoxon signed-rank test (non-parametric alternative)
pw_wilcox_paired <- div.df2 %>%
  pairwise_wilcox_test(Shannon ~ Group, paired = TRUE, p.adjust.method = "BH") %>%
  # Overwrite the p.adj.signif column with your custom rules
  mutate(
    p.adj.signif = case_when(
      p.adj < 0.001 ~ "***",
      p.adj < 0.01  ~ "**",
      p.adj < 0.05  ~ "*",
      TRUE          ~ "ns"
    )
  )

knitr::kable(pw_wilcox_paired, digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))


1. For the "Materials and Methods" Section
This text explains the rationale and execution of your statistical tests.

    Statistical Analysis of Longitudinal Alpha Diversity

    To evaluate temporal changes in microbial alpha diversity (Shannon index) within the aneurysm cohort across three clinical timepoints (admission, surgery, and discharge), a Friedman test was employed. The Friedman test is a non-parametric alternative to one-way repeated-measures ANOVA, specifically designed for comparing three or more paired or matched groups, making it ideal for longitudinal microbiome data that often deviates from normality.

    To identify specific temporal shifts between individual timepoints, post-hoc pairwise comparisons were conducted using paired t-tests.

    To ensure the robustness of these findings and account for any potential violations of normality in the pairwise differences, paired Wilcoxon signed-rank tests were simultaneously performed as a non-parametric confirmatory analysis. All post-hoc P-values were adjusted for multiple hypothesis testing using the Benjamini-Hochberg (BH) false discovery rate (FDR) procedure. Statistical significance was defined as an adjusted P-value (P~adj~) < 0.05. Data were visualized using boxplots, and statistical significance brackets were overlaid to denote the pairwise comparison P-values derived from the paired t-tests.

2. For the "Results" Section
This text interprets the specific data outputs from your tables.

    Longitudinal Shifts in Alpha Diversity Within the Aneurysm Cohort
    We assessed the longitudinal dynamics of the nasal microbiome in 15 aneurysm patients across three clinical stages. The overall variation in Shannon diversity across the three timepoints (admission, surgery, and discharge) did not reach strict statistical significance, though it exhibited a strong trend (Friedman test, P = 0.08).
    To dissect this trend, post-hoc pairwise comparisons were performed. Both parametric (paired t-test) and non-parametric (paired Wilcoxon) frameworks consistently identified a notable reduction in microbial diversity from admission to the time of surgery (raw P = 0.02 for both tests). Following Benjamini-Hochberg correction for multiple comparisons, this difference remained borderline significant (BH-adjusted P = 0.05). Furthermore, a similar directional trend was observed when comparing the surgery and discharge timepoints, indicating a potential shift or recovery phase post-surgery (paired t-test: raw P = 0.06, P~adj~ = 0.09; paired Wilcoxon: P~adj~ = 0.10). No significant difference in diversity was observed between the baseline (admission) and the final recovery stage (discharge) for either test (both P~adj~ > 0.89), suggesting that the microbiome diversity ultimately returned to near-baseline levels by the time of discharge.

3. Key Statistical Takeaways (For your own understanding / presentation)
If you need to present this in a meeting or defend your statistical choices, here is the breakdown of why your results look the way they do:

    Why is the Friedman test P = 0.08 (not significant), but the pairwise P = 0.02 (significant)?
        The Friedman test evaluates the global variance across all three groups simultaneously. It is a conservative omnibus test.
        The paired t-test isolates two specific groups (Admission vs. Surgery). Because it focuses only on that specific contrast, it has higher statistical power to detect a difference between those two exact timepoints, even if the overall 3-group variance isn't massive.
    Why use both Paired t-test and Paired Wilcoxon?
        Reviewers often question parametric tests (t-tests) on microbiome data because diversity indices aren't always perfectly normally distributed.
        By showing that both tests yield the exact same raw P-value (0.02 for Admission vs. Surgery; 0.06 for Surgery vs. Discharge), you prove that your finding is highly robust and not an artifact of assuming a normal distribution.
    *The Multiple Testing Correction (P~adj~):*
        Your raw P-value of 0.02 is very strong. However, because you ran 3 separate pairwise tests, the BH correction penalizes the P-value slightly, pushing it to exactly 0.05. Describing this as a "notable reduction... borderline significant after correction" is the most scientifically accurate and honest way to report it without overstating the findings.


### 1. Important Statistical Clarification: Independent vs. Paired t-test

Before updating the code, there is a crucial statistical distinction to make regarding your request for a **"paired t-test"**.

*   **Paired t-test** is used when you are comparing **the same subjects** under two different conditions (e.g., comparing Admission vs. Surgery *within* the Aneurysm cohort, which we did in the previous steps).
*   **Independent two-sample t-test (Student's t-test)** is used when you are comparing **two completely different groups of subjects** (e.g., comparing the Aneurysm cohort vs. the Hypophysis cohort).

Since the Aneurysm patients and Hypophysis patients are different individuals, **you must use an independent t-test, NOT a paired t-test.** Using a paired t-test here would result in a statistical error because the software wouldn't know how to pair a patient from group A with a patient from group H.

I have updated the code below to use the **independent t-test** (`method = "t.test"`) alongside your existing Wilcoxon rank-sum test.

---

### 2. Answering your question about the "Surgery ONLY" code

You asked: *"In the second block, do we really only used the timpoint surgery? I didn't find corresponding code."*

**Yes, the code is absolutely there!** It is this specific line in the second block:
```r
div.df_surgery <- div.df %>% filter(SampleType == "surgery")
```
*   `div.df` contains **all** timepoints (admission, surgery, discharge).
*   The `filter(SampleType == "surgery")` function strictly deletes all rows except those where the `SampleType` is exactly `"surgery"`.
*   Therefore, the new dataframe `div.df_surgery` (which is then assigned to `div.df2` for the rest of the block) **only contains surgery timepoint data**. The statistical test and the plot are then run exclusively on this filtered data.

---

### 3. Updated Code for Both Blocks

Here is the updated code. I have added the independent t-test calculations and tables to both blocks, and updated the plots to display the t-test p-values (though you can easily change it back to `wilcox.test` in the plot if you prefer).

### Summary:
1.  **Corrected Statistical Test:** Added `compare_means(..., method = "t.test")` to perform the **independent two-sample t-test** (which is the correct parametric test for two separate cohorts).
2.  **Dual Reporting:** The code prints both the Wilcoxon (non-parametric) and t-test (parametric) tables.
3.  **Plot Update:** The `stat_compare_means` in the plots was updated to `method = "t.test"` so the visual asterisks match the new parametric test. (If you prefer the plot to show the Wilcoxon p-value, simply change `method = "t.test"` back to `method = "wilcox.test"` in the plot code).


Here is the fully updated explanation and R code in English, incorporating your co-author's suggestions and ensuring the correct statistical tests are applied.

1. Statistical Rationale

Two-group comparisons (Wilcoxon vs. Kruskal-Wallis):
Yes, using the Wilcoxon rank-sum test (also known as the Mann-Whitney U test) for the two-group comparison (Aneurysm vs. Hypophysis) is perfectly fine and is the standard practice. When comparing only two independent groups, the Kruskal-Wallis test is mathematically equivalent to the Wilcoxon test, but the Wilcoxon test is the conventional and preferred choice in microbiome research.

2. Addressing your co-author's suggestions:

    * Suggestion 7.1 (Comparing A vs. H at the surgery timepoint only): This is a highly valid and insightful point. Comparing all timepoints together might dilute the acute differences caused by the specific surgical sites. By isolating the "surgery" timepoint, you can capture the peak divergence between the aneurysm and hypophysis cohorts without the "noise" from the admission and discharge timepoints where they might be more similar.
    * Suggestion 7.2 (Using paired tests for longitudinal data): This is also statistically sound. Since the same patients are sampled longitudinally (admission, surgery, discharge), the data has a repeated-measures design. Using paired tests accounts for individual baseline differences (e.g., some patients naturally have higher diversity than others), thereby significantly increasing statistical power.
        For the overall 3-timepoint comparison, the Friedman test (the non-parametric equivalent of repeated-measures ANOVA) is the correct choice.
        For the post-hoc pairwise comparisons, we will use the paired t-test (as requested by your co-author) and the paired Wilcoxon signed-rank test (as a robust non-parametric alternative).


---

### 3. Suggested Text for the Manuscript Methods Section

Since you have adopted your co-author's suggestions, you should update the statistical methods description in your manuscript to accurately reflect these rigorous choices:

> "Alpha diversity (Shannon index) was compared between the aneurysm and hypophysis cohorts using the **Wilcoxon rank-sum test**. To evaluate temporal changes within each cohort across the three clinical stages (admission, surgery, and discharge), **Friedman tests** (the non-parametric equivalent of repeated-measures ANOVA) were performed to account for the longitudinal, paired study design. Post-hoc pairwise comparisons were conducted using **paired t-tests** (and verified with **paired Wilcoxon signed-rank tests**), with *P*-values adjusted for multiple testing using the Benjamini-Hochberg false discovery rate (FDR) procedure. Additionally, to isolate the acute impact of the surgical site and prevent dilution of the signal by baseline or recovery timepoints, Shannon diversity was specifically compared between the two cohorts **at the surgery timepoint only** using the Wilcoxon rank-sum test."


#DELETED_CODE:

## Alpha diversity between aneurysm and hypophysis (All timepoints)

```{r, echo=FALSE, warning=FALSE}
hmp.div_qiime <- read.csv("adiv_even.txt", sep="\t")
colnames(hmp.div_qiime) <- c("sam_name", "chao1", "observed_otus", "shannon", "PD_whole_tree")
div.df <- merge(hmp.div_qiime, hmp.meta, by = "sam_name")

# Extract only Shannon values and grouping variable
div.df2 <- div.df[, c("sam_name", "PatientType", "shannon")]
colnames(div.df2) <- c("Sample", "Group", "Shannon")
```

Two-group comparison: Wilcoxon rank-sum test (Mann-Whitney U test)

```{r, echo=FALSE, warning=FALSE}
stat.test.Shannon <- compare_means(Shannon ~ Group, data = div.df2, method = "wilcox.test")
knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

# 2. Create boxplot
p <- ggboxplot(div.df2, x = "Group", y = "Shannon", fill = "Group",
               palette = c("aneurysm" = "#E69F00", "hypophysis" = "#56B4E9"),
               width = 0.5, legend = "none") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "Shannon Diversity: Aneurysm vs Hypophysis (All timepoints)") +
  stat_compare_means(comparisons = list(c("aneurysm", "hypophysis")),
                     method = "wilcox.test", label = "p.signif", size = 5)

print(p)
ggsave("./figures/alpha_diversity_Shannon_PatientType.png", device="png", height = 6, width = 6)
ggsave("./figures/alpha_diversity_Shannon_PatientType.svg", device="svg", height = 6, width = 6)
```


## NEW: Aneurysm vs. Hypophysis AT SURGERY ONLY (Addressing Suggestion 7.1)
*New block: Specifically filters for the surgery timepoint to compare the two cohorts without dilution.*

```{r, echo=TRUE, warning=FALSE}
hmp.div_qiime <- read.csv("adiv_even.txt", sep="\t")
colnames(hmp.div_qiime) <- c("sam_name", "chao1", "observed_otus", "shannon", "PD_whole_tree")
div.df <- merge(hmp.div_qiime, hmp.meta, by = "sam_name")

# Core modification: Filter strictly for the surgery timepoint samples
div.df_surgery <- div.df %>% filter(SampleType == "surgery")

div.df2 <- div.df_surgery[, c("sam_name", "PatientType", "shannon")]
colnames(div.df2) <- c("Sample", "Group", "Shannon")

# 1. Two-group comparison: Wilcoxon rank-sum test
stat.test.Shannon <- compare_means(Shannon ~ Group, data = div.df2, method = "wilcox.test")
knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

# 2. Plotting
p <- ggboxplot(div.df2, x = "Group", y = "Shannon", fill = "Group",
               palette = c("aneurysm" = "#E69F00", "hypophysis" = "#56B4E9"),
               width = 0.5, legend = "none") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "Shannon Diversity: Aneurysm vs Hypophysis (Surgery timepoint ONLY)") +
  stat_compare_means(comparisons = list(c("aneurysm", "hypophysis")),
                     method = "wilcox.test", label = "p.signif", size = 5)

print(p)
ggsave("./figures/alpha_diversity_Shannon_Surgery_ONLY.png", device="png", height = 6, width = 6)
ggsave("./figures/alpha_diversity_Shannon_Surgery_ONLY.svg", device="svg", height = 6, width = 6)
```


#TODO_AFTERNOON:
1. 今天先看一下工作量，如果可以就回信，明天 说希望下个星期前把结果给发回去。

#DEBUG:
R --no-restore
rmarkdown::render('Phyloseq.Rmd', output_file='Phyloseq.html')

install.packages("BSDA")


#-->92-301
#yycH: 476 + 44      -->minLen=200 (76)
#g216: 448 + 44      -->minLen=185 (78)
#16S: 410-427 + 44   -->minLen=170 (70)
#360/2=180 *
#200 and 200 *
>seq1;
TGGGTATGGCAATCACTTTACA
AGAATTCTATATTAAAGATGTTCTAATTGTGGAAAAGGGATCCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTGTCTGGAAATAATTACGCTCAATACATTTCATTAGTAGCTGAGCATTACAATCTAAATGTCAAAACAAATACCAATGTTTCACGTGTAACATACATAGATGGTATATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTTTCATATGGACGTCATTACAGTGAGATTCGAGCGTTCACTCAATTAAACGGTGACGCCTTTACAATTATTGGA GGTAATGAGAGTGCTTTTGATGC

>M03701:292:000000000-K9M88:1:1101:10277:1358:AAGAGGCA+TATCCTCT
TGGGTATGRCAATCACTTTACA
AGAATTCAATATTAAAGATGTTCTAATTGTTGAAAAGGGAACCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTATCTGGAAAACGTTATGCTGAATACCTCTCACTAGTAGCTACGCATTACAATCTAAATGGCAAAACAAACACCAATGTTTCACGTGTAACATACATAGATGGTGTATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTATCATATGGACGTCATTACAGTGAAATTCAAACATTCACTCAATTAAAAGGTGATGCTTTTACAATCATTGGT GGTAATGAGAGTGCTTTTGATGC