Workflow using QIIME2 for Data_Karoline_16S_2025

  1. Install and test qiime2-docker

     #Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose
    
     docker pull quay.io/qiime2/core:2023.9
    
     docker run -it --rm \
     -v /mnt/md1/DATA/Data_Marius_16S_2025:/data \
     -v /home/jhuang/REFs:/home/jhuang/REFs \
     quay.io/qiime2/core:2023.9 bash
     cd /data
  2. Import the fastq-files to paired-end-demux.qza

     #https://docs.qiime2.org/2018.8/tutorials/importing/
    
     qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
     #--> 1095204304 Mai 27 15:11 paired-end-demux.qza
    
     qiime demux summarize \
     --i-data paired-end-demux.qza \
     --o-visualization demux_pe.qzv
    
     #https://view.qiime2.org
     #qiime tools view demux_pe.qzv
  3. Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

     #Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
     #464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!
    
     #Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
     ./dada2_batch_test.sh
    
             #!/bin/bash
    
             # Set your base inputs
             INPUT=paired-end-demux.qza
             TRIM_LEFT_F=17
             TRIM_LEFT_R=21
    
             # Output base
             OUTPUT_DIR=dada2_tests
             mkdir -p $OUTPUT_DIR
    
             # Loop over trunc-len-f and trunc-len-r combinations
             # Forward: from 220 to 240
             # Reverse: from 210 to 230
             i=1
             for f in 240 235 230 225; do
             for r in 225 220 215; do
                 OUT=test_${i}_f${f}_r${r}
                 echo "Running: $OUT"
                 mkdir -p $OUTPUT_DIR/$OUT
    
                 qiime dada2 denoise-paired \
                 --i-demultiplexed-seqs $INPUT \
                 --p-trim-left-f $TRIM_LEFT_F \
                 --p-trim-left-r $TRIM_LEFT_R \
                 --p-max-ee-f 3 --p-max-ee-r 5 \
                 --p-trunc-len-f $f \
                 --p-trunc-len-r $r \
                 --p-n-threads 32 \
                 --o-table $OUTPUT_DIR/$OUT/table.qza \
                 --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
                 --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
                 --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1
    
                 ((i++))
             done
             done
    
     for f in dada2_tests2/test_*/denoising-stats.qza; do
     qiime metadata tabulate \
         --m-input-file $f \
         --o-visualization ${f%.qza}.qzv
     done
    
     #pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165
    
     # The best choice is f251_r251, since the first 17 and 21 bases with bad quality are anyway removed!
     #f251_r251: sample-A1   18299   10989   60.05   10609   7129    38.96   6535    35.71;  sample-A10  18736   12249   65.38   11778   7444    39.73   6978    37.24
     #f251_r250: sample-A1   18299   11855   64.78   11435   7431    40.61   6823    37.29;  sample-A10  18736   13092   69.88   12590   7714    41.17   7206    38.46
     #f251_r245: sample-A1   18299   12642   69.09   12180   7860    42.95   7193    39.31;  sample-A10  18736   13981   74.62   13457   8218    43.86   7649    40.83
     #f251_r240: sample-A1   18299   12678   69.28   12214   8060    44.05   7387    40.37;  sample-A10  18736   14018   74.82   13498   8412    44.9    7758    41.41
    
     #f250_r240: sample-A1   18299   13705   74.89   13191   8796    48.07   7984    43.63
     #f250_r235: sample-A1   18299   13716   74.95   13198   8793    48.05   7969    43.55
     #f250_r230: sample-A1   18299   13739   75.08   13217   9023    49.31   8113    44.34;  sample-A10  18736   14838   79.2    14159   8895    47.48   8101    43.24
     #f245_r240: sample-A1   18299   14513   79.31   14019   9472    51.76   8739    47.76;  sample-A10  18736   15609   83.31   15102   9605    51.26   8880    47.4
     #f245_r235: sample-A1   18299   14524   79.37   14026   9485    51.83   8746    47.79;  sample-A10  18736   15634   83.44   15127   9685    51.69   8869    47.34
     #f245_r230: sample-A1   18299   14547   79.5    14045   8845    48.34   8058    44.04;  sample-A10  18736   15664   83.6    15156   8812    47.03   7999    42.69
     #f240_r240: sample-A1   18299   14647   80.04   14164   9869    53.93   8932    48.81;  sample-A10  18736   15728   83.95   15213   10488   55.98   9081    48.47 *
     #f240_r235: sample-A1   18299   14661   80.12   14172   9125    49.87   8194    44.78;  sample-A10  18736   15755   84.09   15242   9579    51.13   8105    43.26
     #f240_r230: sample-A1   18299   14686   80.26   14191   4952    27.06   4666    25.5;   sample-A10  18736   15785   84.25   15267   3489    18.62   3341    17.83
    
     #f240_r225: sample-A1   18299   14701   80.34   14206   4575    25  4297    23.48
     #f240_r220: sample-A1   18299   14720   80.44   14223   4588    25.07   4310    23.55
     #f240_r225: sample-A1   18299   14747   80.59   14250   3976    21.73   3758    20.54
     #f230_r220: sample-A1   18299   14972   81.82   14514   3   0.02    3   0.02
    
     #The output of the optimized denoising: (*) dada2_tests2/test_7_f240_r240/table.qza and dada2_tests2/test_7_f240_r240/rep-seqs.qza.
  4. Visualize outputs

     qiime feature-table summarize \
     --i-table dada2_tests2/test_7_f240_r240/table.qza \
     --o-visualization table.qzv \
     --m-sample-metadata-file qiime2_metadata.tsv
    
     #Table summary
     #Metric Sample
     #Number of samples  137
     #Number of features 3,039
     #Total frequency    1,641,484
     #
     #Frequency per sample
     #Minimum frequency  413.0
     #1st quartile   10,319.0
     #Median frequency   11,530.0
     #3rd quartile   13,146.0
     #Maximum frequency  40,022.0
     #Mean frequency 11,981.635036496351
     #
     #Frequency per feature
     #Minimum frequency  1.0
     #1st quartile   3.0
     #Median frequency   8.0
     #3rd quartile   95.5
     #Maximum frequency  56,472.0
     #Mean frequency 540.1395195788089
    
     #qiime tools peek table.qza
     #qiime tools peek qiime2_metadata.tsv
    
     qiime feature-table tabulate-seqs \
     --i-data dada2_tests2/test_7_f240_r240/rep-seqs.qza \
     --o-visualization rep-seqs.qzv
    
     qiime metadata tabulate \
     --m-input-file dada2_tests2/test_7_f240_r240/denoising-stats.qza \
     --o-visualization denoising-stats.qzv
  5. Import reference sequences and taxonomy (SILVA 132)

     qiime tools import \
     --type 'FeatureData[Sequence]' \
     --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
     --output-path silva_132_99_otus.qza \
     --input-format DNAFASTAFormat
    
     qiime tools import \
     --type 'FeatureData[Taxonomy]' \
     --input-format HeaderlessTSVTaxonomyFormat \
     --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
     --output-path silva_132_99_taxonomy.qza
  6. Assign taxonomy

     qiime feature-classifier classify-consensus-vsearch \
     --i-query dada2_tests2/test_7_f240_r240/rep-seqs.qza \
     --i-reference-reads silva_132_99_otus.qza \
     --i-reference-taxonomy silva_132_99_taxonomy.qza \
     --p-perc-identity 0.97 \
     --p-threads 64 \
     --o-classification taxonomy.qza \
     --o-search-results search-results.qza
  7. Visualize taxonomy

     qiime taxa barplot \
     --i-table dada2_tests2/test_7_f240_r240/table.qza \
     --i-taxonomy taxonomy.qza \
     --m-metadata-file qiime2_metadata.tsv \
     --o-visualization taxa-bar-plots.qzv
  8. Build phylogenetic tree

     qiime alignment mafft \
     --i-sequences dada2_tests2/test_7_f240_r240/rep-seqs.qza \
     --o-alignment aligned-rep-seqs.qza
    
     qiime alignment mask \
     --i-alignment aligned-rep-seqs.qza \
     --o-masked-alignment masked-aligned-rep-seqs.qza
    
     qiime phylogeny fasttree \
     --i-alignment masked-aligned-rep-seqs.qza \
     --o-tree unrooted-tree.qza
    
     (*) qiime phylogeny midpoint-root \
     --i-tree unrooted-tree.qza \
     --o-rooted-tree rooted-tree.qza
  9. Core diversity analysis

     #The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
     #All samples will be rarefied to 4,753 reads.
     #Samples with fewer reads are excluded.
     qiime diversity core-metrics-phylogenetic \
     --i-phylogeny rooted-tree.qza \
     --i-table dada2_tests2/test_7_f240_r240/table.qza \
     --p-sampling-depth 6389 \
     --m-metadata-file qiime2_metadata.tsv \
     --output-dir core_metrics_results
    
     qiime diversity alpha \
     --i-table table.qza \
     --p-metric chao1 \
     --o-alpha-diversity core_metrics_results/chao1_vector.qza
    
     qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
     qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
     qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
     qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1
    
     qiime tools export \
     --input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
     --output-path exported_unweighted_unifrac
     qiime tools export \
     --input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
     --output-path exported_weighted_unifrac
    
     qiime diversity beta-group-significance \
     --i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
     --m-metadata-file qiime2_metadata.tsv \
     --m-metadata-column Group \
     --p-pairwise \
     --p-method permanova \
     --o-visualization beta_group_significance.qzv
    
     qiime tools export \
     --input-path beta_group_significance.qzv \
     --output-path exported_beta_group
    
     #✅ Group 1 / Group 2 — The pairwise comparisons.
     #✅ Sample size — Number of samples used in the test.
     #✅ Permutations — Number of permutations in the PERMANOVA test.
     #✅ pseudo-F — The test statistic from PERMANOVA.
     #✅ p-value — The unadjusted p-value.
     #✅ q-value — The adjusted p-value (Bonferroni in QIIME2).
     #The q-value column (also sometimes called p-adj) is the multiple-testing corrected p-value.
     #👉 q < 0.05 means the difference is statistically significant between those two groups, even after correction.
     #“There is a significant difference in community composition between Group 1 and Group 2 (p=0.002).”
    
     #The --p-sampling-depth 6389 parameter is directly equivalent to QIIME 1’s -e 6389!
     #The QIIME 2 command will compute:
     #✅ Alpha diversity metrics (Observed OTUs, Shannon, Faith PD, Evenness)
     #✅ Beta diversity distance matrices (UniFrac, Bray-Curtis)
     #✅ PCoA plots
     #✅ Excludes samples with fewer than 6,389 reads.
    
     #📦 Output Folders and Files
     #Output Description
     #table.qzv  Visual of feature table (sample counts)
     #rep-seqs.qzv   Sequences per ASV
     #denoising-stats.qzv    DADA2 read tracking
     #taxonomy.qza/.qzv  Taxonomic classification of ASVs
     #taxa-bar-plots.qzv Interactive bar plots
     #core_metrics_results/  Alpha/beta diversity metrics + PCoA plots
  10. Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

     # Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
     qiime feature-table summarize \
     --i-table core_metrics_results/rarefied_table.qza \
     --o-visualization rarefied_table.qzv \
     --m-sample-metadata-file qiime2_metadata.tsv
    
     #Table summary
     #Metric Sample
     #Number of samples  136
     #Number of features 2,781
     #Total frequency    868,904
    
     # In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
     qiime tools export \
     --input-path core_metrics_results/rarefied_table.qza \
     --output-path exported_rarefied_table
     #-->
     exported_rarefied_table/feature-table.biom
    
     biom convert \
     -i exported_rarefied_table/feature-table.biom \
     -o exported_rarefied_table/feature-table.tsv \
     --to-tsv
    
     #✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
     #✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).
    
     # Visulaize the taxonomy.qza
     qiime tools export \
     --input-path taxonomy.qza \
     --output-path exported-taxonomy
    
     #Feature ID    Taxon                               Confidence
     #0b4383...     k__Bacteria; p__Proteobacteria...   0.98
     #dfa833...     k__Bacteria; p__Firmicutes...       0.87
     #...
    
     # ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----
    
     #-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.
    
     qiime tools export \
     --input-path dada2_tests2/test_7_f240_r240/table.qza \
     --output-path exported_table
     #--> exported_table/feature-table.biom
    
     #-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1
    
     qiime tools export \
     --input-path rooted-tree.qza \
     --output-path exported-tree
     #--> exported-tree/tree.nwk
    
     # ---- The code in Phyloseq.Rmd ----
    
     #install.packages("remotes")
     #remotes::install_github("jbisanz/qiime2R")
     #"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
     library(qiime2R)
     ps.ng.tax <- qza_to_phyloseq(
         features =  "dada2_tests2/test_7_f240_r240/table.qza",
         tree = "rooted-tree.qza",
         metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
     )
     # or
     #biom convert \
     #      -i ./exported_table/feature-table.biom \
     #      -o ./exported_table/feature-table-v1.biom \
     #      --to-json
     #ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")
    
     #Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.
  11. Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

    The following files can be found under server.

     ./Phyloseq.Rmd (Result Phyloseq.html)
     ./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
     ./MicrobiotaProcess_Group1_vs_Group2.R
     ./MicrobiotaProcess_Group3_vs_Group4.R
     ./MicrobiotaProcess_PCA_Group1-4.R
     ./MicrobiotaProcess_PCA_Group9-14.R

Leave a Reply

Your email address will not be published. Required fields are marked *