Workflow using QIIME2 for Data_Karoline_16S_2025

Install and test qiime2-docker

 #Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose

 docker pull quay.io/qiime2/core:2023.9

 docker run -it --rm \
 -v /mnt/md1/DATA/Data_Marius_16S_2025:/data \
 -v /home/jhuang/REFs:/home/jhuang/REFs \
 quay.io/qiime2/core:2023.9 bash
 cd /data

Import the fastq-files to paired-end-demux.qza

 #https://docs.qiime2.org/2018.8/tutorials/importing/

 qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
 #--> 1095204304 Mai 27 15:11 paired-end-demux.qza

 qiime demux summarize \
 --i-data paired-end-demux.qza \
 --o-visualization demux_pe.qzv

 #https://view.qiime2.org
 #qiime tools view demux_pe.qzv

Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

 #Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
 #464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!

 #Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
 ./dada2_batch_test.sh

         #!/bin/bash

         # Set your base inputs
         INPUT=paired-end-demux.qza
         TRIM_LEFT_F=17
         TRIM_LEFT_R=21

         # Output base
         OUTPUT_DIR=dada2_tests
         mkdir -p $OUTPUT_DIR

         # Loop over trunc-len-f and trunc-len-r combinations
         # Forward: from 220 to 240
         # Reverse: from 210 to 230
         i=1
         for f in 240 235 230 225; do
         for r in 225 220 215; do
             OUT=test_${i}_f${f}_r${r}
             echo "Running: $OUT"
             mkdir -p $OUTPUT_DIR/$OUT

             qiime dada2 denoise-paired \
             --i-demultiplexed-seqs $INPUT \
             --p-trim-left-f $TRIM_LEFT_F \
             --p-trim-left-r $TRIM_LEFT_R \
             --p-max-ee-f 3 --p-max-ee-r 5 \
             --p-trunc-len-f $f \
             --p-trunc-len-r $r \
             --p-n-threads 32 \
             --o-table $OUTPUT_DIR/$OUT/table.qza \
             --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
             --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
             --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1

             ((i++))
         done
         done

 for f in dada2_tests2/test_*/denoising-stats.qza; do
 qiime metadata tabulate \
     --m-input-file $f \
     --o-visualization ${f%.qza}.qzv
 done

 #pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165

 # The best choice is f251_r251, since the first 17 and 21 bases with bad quality are anyway removed!
 #f251_r251: sample-A1   18299   10989   60.05   10609   7129    38.96   6535    35.71;  sample-A10  18736   12249   65.38   11778   7444    39.73   6978    37.24
 #f251_r250: sample-A1   18299   11855   64.78   11435   7431    40.61   6823    37.29;  sample-A10  18736   13092   69.88   12590   7714    41.17   7206    38.46
 #f251_r245: sample-A1   18299   12642   69.09   12180   7860    42.95   7193    39.31;  sample-A10  18736   13981   74.62   13457   8218    43.86   7649    40.83
 #f251_r240: sample-A1   18299   12678   69.28   12214   8060    44.05   7387    40.37;  sample-A10  18736   14018   74.82   13498   8412    44.9    7758    41.41

 #f250_r240: sample-A1   18299   13705   74.89   13191   8796    48.07   7984    43.63
 #f250_r235: sample-A1   18299   13716   74.95   13198   8793    48.05   7969    43.55
 #f250_r230: sample-A1   18299   13739   75.08   13217   9023    49.31   8113    44.34;  sample-A10  18736   14838   79.2    14159   8895    47.48   8101    43.24
 #f245_r240: sample-A1   18299   14513   79.31   14019   9472    51.76   8739    47.76;  sample-A10  18736   15609   83.31   15102   9605    51.26   8880    47.4
 #f245_r235: sample-A1   18299   14524   79.37   14026   9485    51.83   8746    47.79;  sample-A10  18736   15634   83.44   15127   9685    51.69   8869    47.34
 #f245_r230: sample-A1   18299   14547   79.5    14045   8845    48.34   8058    44.04;  sample-A10  18736   15664   83.6    15156   8812    47.03   7999    42.69
 #f240_r240: sample-A1   18299   14647   80.04   14164   9869    53.93   8932    48.81;  sample-A10  18736   15728   83.95   15213   10488   55.98   9081    48.47 *
 #f240_r235: sample-A1   18299   14661   80.12   14172   9125    49.87   8194    44.78;  sample-A10  18736   15755   84.09   15242   9579    51.13   8105    43.26
 #f240_r230: sample-A1   18299   14686   80.26   14191   4952    27.06   4666    25.5;   sample-A10  18736   15785   84.25   15267   3489    18.62   3341    17.83

 #f240_r225: sample-A1   18299   14701   80.34   14206   4575    25  4297    23.48
 #f240_r220: sample-A1   18299   14720   80.44   14223   4588    25.07   4310    23.55
 #f240_r225: sample-A1   18299   14747   80.59   14250   3976    21.73   3758    20.54
 #f230_r220: sample-A1   18299   14972   81.82   14514   3   0.02    3   0.02

 #The output of the optimized denoising: (*) dada2_tests2/test_7_f240_r240/table.qza and dada2_tests2/test_7_f240_r240/rep-seqs.qza.

Visualize outputs

 qiime feature-table summarize \
 --i-table dada2_tests2/test_7_f240_r240/table.qza \
 --o-visualization table.qzv \
 --m-sample-metadata-file qiime2_metadata.tsv

 #Table summary
 #Metric Sample
 #Number of samples  137
 #Number of features 3,039
 #Total frequency    1,641,484
 #
 #Frequency per sample
 #Minimum frequency  413.0
 #1st quartile   10,319.0
 #Median frequency   11,530.0
 #3rd quartile   13,146.0
 #Maximum frequency  40,022.0
 #Mean frequency 11,981.635036496351
 #
 #Frequency per feature
 #Minimum frequency  1.0
 #1st quartile   3.0
 #Median frequency   8.0
 #3rd quartile   95.5
 #Maximum frequency  56,472.0
 #Mean frequency 540.1395195788089

 #qiime tools peek table.qza
 #qiime tools peek qiime2_metadata.tsv

 qiime feature-table tabulate-seqs \
 --i-data dada2_tests2/test_7_f240_r240/rep-seqs.qza \
 --o-visualization rep-seqs.qzv

 qiime metadata tabulate \
 --m-input-file dada2_tests2/test_7_f240_r240/denoising-stats.qza \
 --o-visualization denoising-stats.qzv

Import reference sequences and taxonomy (SILVA 132)

 qiime tools import \
 --type 'FeatureData[Sequence]' \
 --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
 --output-path silva_132_99_otus.qza \
 --input-format DNAFASTAFormat

 qiime tools import \
 --type 'FeatureData[Taxonomy]' \
 --input-format HeaderlessTSVTaxonomyFormat \
 --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
 --output-path silva_132_99_taxonomy.qza

Assign taxonomy

 qiime feature-classifier classify-consensus-vsearch \
 --i-query dada2_tests2/test_7_f240_r240/rep-seqs.qza \
 --i-reference-reads silva_132_99_otus.qza \
 --i-reference-taxonomy silva_132_99_taxonomy.qza \
 --p-perc-identity 0.97 \
 --p-threads 64 \
 --o-classification taxonomy.qza \
 --o-search-results search-results.qza

Visualize taxonomy

 qiime taxa barplot \
 --i-table dada2_tests2/test_7_f240_r240/table.qza \
 --i-taxonomy taxonomy.qza \
 --m-metadata-file qiime2_metadata.tsv \
 --o-visualization taxa-bar-plots.qzv

Build phylogenetic tree

 qiime alignment mafft \
 --i-sequences dada2_tests2/test_7_f240_r240/rep-seqs.qza \
 --o-alignment aligned-rep-seqs.qza

 qiime alignment mask \
 --i-alignment aligned-rep-seqs.qza \
 --o-masked-alignment masked-aligned-rep-seqs.qza

 qiime phylogeny fasttree \
 --i-alignment masked-aligned-rep-seqs.qza \
 --o-tree unrooted-tree.qza

 (*) qiime phylogeny midpoint-root \
 --i-tree unrooted-tree.qza \
 --o-rooted-tree rooted-tree.qza

Core diversity analysis

 #The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
 #All samples will be rarefied to 4,753 reads.
 #Samples with fewer reads are excluded.
 qiime diversity core-metrics-phylogenetic \
 --i-phylogeny rooted-tree.qza \
 --i-table dada2_tests2/test_7_f240_r240/table.qza \
 --p-sampling-depth 6389 \
 --m-metadata-file qiime2_metadata.tsv \
 --output-dir core_metrics_results

 qiime diversity alpha \
 --i-table table.qza \
 --p-metric chao1 \
 --o-alpha-diversity core_metrics_results/chao1_vector.qza

 qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
 qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
 qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
 qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1

 qiime tools export \
 --input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
 --output-path exported_unweighted_unifrac
 qiime tools export \
 --input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
 --output-path exported_weighted_unifrac

 qiime diversity beta-group-significance \
 --i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
 --m-metadata-file qiime2_metadata.tsv \
 --m-metadata-column Group \
 --p-pairwise \
 --p-method permanova \
 --o-visualization beta_group_significance.qzv

 qiime tools export \
 --input-path beta_group_significance.qzv \
 --output-path exported_beta_group

 #✅ Group 1 / Group 2 — The pairwise comparisons.
 #✅ Sample size — Number of samples used in the test.
 #✅ Permutations — Number of permutations in the PERMANOVA test.
 #✅ pseudo-F — The test statistic from PERMANOVA.
 #✅ p-value — The unadjusted p-value.
 #✅ q-value — The adjusted p-value (Bonferroni in QIIME2).
 #The q-value column (also sometimes called p-adj) is the multiple-testing corrected p-value.
 #👉 q < 0.05 means the difference is statistically significant between those two groups, even after correction.
 #“There is a significant difference in community composition between Group 1 and Group 2 (p=0.002).”

 #The --p-sampling-depth 6389 parameter is directly equivalent to QIIME 1’s -e 6389!
 #The QIIME 2 command will compute:
 #✅ Alpha diversity metrics (Observed OTUs, Shannon, Faith PD, Evenness)
 #✅ Beta diversity distance matrices (UniFrac, Bray-Curtis)
 #✅ PCoA plots
 #✅ Excludes samples with fewer than 6,389 reads.

 #📦 Output Folders and Files
 #Output Description
 #table.qzv  Visual of feature table (sample counts)
 #rep-seqs.qzv   Sequences per ASV
 #denoising-stats.qzv    DADA2 read tracking
 #taxonomy.qza/.qzv  Taxonomic classification of ASVs
 #taxa-bar-plots.qzv Interactive bar plots
 #core_metrics_results/  Alpha/beta diversity metrics + PCoA plots

Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

 # Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
 qiime feature-table summarize \
 --i-table core_metrics_results/rarefied_table.qza \
 --o-visualization rarefied_table.qzv \
 --m-sample-metadata-file qiime2_metadata.tsv

 #Table summary
 #Metric Sample
 #Number of samples  136
 #Number of features 2,781
 #Total frequency    868,904

 # In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
 qiime tools export \
 --input-path core_metrics_results/rarefied_table.qza \
 --output-path exported_rarefied_table
 #-->
 exported_rarefied_table/feature-table.biom

 biom convert \
 -i exported_rarefied_table/feature-table.biom \
 -o exported_rarefied_table/feature-table.tsv \
 --to-tsv

 #✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
 #✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).

 # Visulaize the taxonomy.qza
 qiime tools export \
 --input-path taxonomy.qza \
 --output-path exported-taxonomy

 #Feature ID    Taxon                               Confidence
 #0b4383...     k__Bacteria; p__Proteobacteria...   0.98
 #dfa833...     k__Bacteria; p__Firmicutes...       0.87
 #...

 # ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----

 #-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.

 qiime tools export \
 --input-path dada2_tests2/test_7_f240_r240/table.qza \
 --output-path exported_table
 #--> exported_table/feature-table.biom

 #-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1

 qiime tools export \
 --input-path rooted-tree.qza \
 --output-path exported-tree
 #--> exported-tree/tree.nwk

 # ---- The code in Phyloseq.Rmd ----

 #install.packages("remotes")
 #remotes::install_github("jbisanz/qiime2R")
 #"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
 library(qiime2R)
 ps.ng.tax <- qza_to_phyloseq(
     features =  "dada2_tests2/test_7_f240_r240/table.qza",
     tree = "rooted-tree.qza",
     metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
 )
 # or
 #biom convert \
 #      -i ./exported_table/feature-table.biom \
 #      -o ./exported_table/feature-table-v1.biom \
 #      --to-json
 #ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")

 #Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.

Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

The following files can be found under server.

 ./Phyloseq.Rmd (Result Phyloseq.html)
 ./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
 ./MicrobiotaProcess_Group1_vs_Group2.R
 ./MicrobiotaProcess_Group3_vs_Group4.R
 ./MicrobiotaProcess_PCA_Group1-4.R
 ./MicrobiotaProcess_PCA_Group9-14.R

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Workflow using QIIME2 for Data_Karoline_16S_2025

Leave a Reply Cancel reply