Main workflow using QIIME2 for Data_Childrensclinic_16S_2025

  1. Install and test qiime2-docker

    #Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose
    
    docker pull quay.io/qiime2/core:2023.9
    
    docker run -it --rm \
    -v /mnt/md1/DATA/Data_Childrensclinic_16S_2025:/data \
    -v /home/jhuang/REFs:/home/jhuang/REFs \
    quay.io/qiime2/core:2023.9 bash
    cd /data
  2. Import the fastq-files to paired-end-demux.qza

    #https://docs.qiime2.org/2018.8/tutorials/importing/
    
    #Moving the following fastq.gz to raw_data_NOT_USED:
    643805249,Kinderklinik Lauf 2 Pl.B,B04,N720-B,CGGAGCCT,S503-B,TATCCTCT,nan,nan
    7909160377,Kinderkllinik Lauf 2 Pl.D,D11,N728-D,TGCAGCTA,S517-D,GCGTAAGA,nan,nan
    7909188256,Kinderklinik Lauf 2 Pl.C,C10,N712-C,GTAGAGGA,S516-C,CCTAGAGT,nan,nan
    O231000101,Kinderkllinik Lauf 2 Pl.D,F11,N728-D,TGCAGCTA,S520-D,AAGGCTAT,nan,nan
    O231000102,Kinderkllinik Lauf 2 Pl.D,C09,N726-D,CCTAAGAC,S516-D,CCTAGAGT,nan,nan
    O231000103,Kinderkllinik Lauf 2 Pl.D,C11,N728-D,TGCAGCTA,S516-D,CCTAGAGT,nan,nan
    O231000902,Kinderkllinik Lauf 2 Pl.D,C12,N729-D,TCGACGTC,S516-D,CCTAGAGT,nan,nan
    O23100201,Kinderklinik Lauf 2 Pl.B,D10,N727-B,CGATCAGT,S506-B,ACTGCATA,nan,nan
    O23100805,Kinderklinik Lauf 2 Pl.B,F10,N727-B,CGATCAGT,S508-B,CTAAGCCT,nan,nan
    O23100901,Kinderkllinik Lauf 2 Pl.D,F08,N724-D,ACTGAGCG,S520-D,AAGGCTAT,nan,nan
    O23100903,Kinderkllinik Lauf 2 Pl.D,B08,N724-D,ACTGAGCG,S515-D,TTCTAGCT,nan,nan
    O23100904,Kinderkllinik Lauf 2 Pl.D,C08,N724-D,ACTGAGCG,S516-D,CCTAGAGT,nan,nan
    O23100905,Kinderkllinik Lauf 2 Pl.D,D10,N727-D,CGATCAGT,S517-D,GCGTAAGA,nan,nan
    O23111503,Kinderkllinik Lauf 2 Pl.D,A12,N729-D,TCGACGTC,S513-D,TCGACTAG,nan,nan
    O23120902,Kinderklinik Lauf 2 Pl.C,C03,N703-C,AGGCAGAA,S516-C,CCTAGAGT,nan,nan
    O24022002,Kinderklinik Lauf 2 Pl.C,H12,N715-C,ATCTCAGG,S522-C,TTATGCGA,nan,nan
    U23071801,Kinderklinik Lauf 2 Pl.B,B02,N718-B,GGAGCTAC,S503-B,TATCCTCT,nan,nan
    
    rename -n 's/^([A-Z]\d+)_S\d+_L\d+_R([12])_001\.fastq\.gz$/$1_R$2.fastq.gz/' *.fastq.gz
    rename 's/^([A-Z]\d+)_S\d+_L\d+_R([12])_001\.fastq\.gz$/$1_R$2.fastq.gz/' *.fastq.gz
    
    #for file in *.fastq.gz; do echo "mv $file $(echo $file | cut -d'_' -f1 | cut -d'-' -f1-1)_$(echo $file | cut -d'_' -f4).fastq.gz"; done
    for file in *.fastq.gz; do echo "mv $file $(echo $file | cut -d'_' -f1)_$(echo $file | cut -d'_' -f4).fastq.gz"; done
    #MANUALLY correct several filename errors and performing in the generated commands above
    #MANUALLY correct several filename errors and performing in the generated commands below
    
    NTC_2
    NTC_3
    NTC_4
    NTC_5
    NTC_6
    NTC_7
    NTC_8
    NTC_9
    NTC_10
    NTC_11
    NTC_12
    NTC_13
    NTC_14
    NTC_15
    NTC_16
    
    mv NTC-1_R1.fastq.gz NTC_1_R1.fastq.gz
    mv NTC-1_R2.fastq.gz NTC_1_R2.fastq.gz
    mv NTC-2_R1.fastq.gz NTC_2_R1.fastq.gz
    mv NTC-2_R2.fastq.gz NTC_2_R2.fastq.gz
    mv NTC-3_R1.fastq.gz NTC_3_R1.fastq.gz
    mv NTC-3_R2.fastq.gz NTC_3_R2.fastq.gz
    mv NTC-4_R1.fastq.gz NTC_4_R1.fastq.gz
    mv NTC-4_R2.fastq.gz NTC_4_R2.fastq.gz
    mv NTC-5_R1.fastq.gz NTC_5_R1.fastq.gz
    mv NTC-5_R2.fastq.gz NTC_5_R2.fastq.gz
    mv NTC-6_R1.fastq.gz NTC_6_R1.fastq.gz
    mv NTC-6_R2.fastq.gz NTC_6_R2.fastq.gz
    mv NTC-7_R1.fastq.gz NTC_7_R1.fastq.gz
    mv NTC-7_R2.fastq.gz NTC_7_R2.fastq.gz
    mv NTC-8_R1.fastq.gz NTC_8_R1.fastq.gz
    mv NTC-8_R2.fastq.gz NTC_8_R2.fastq.gz
    mv NTC-9_R1.fastq.gz NTC_9_R1.fastq.gz
    mv NTC-9_R2.fastq.gz NTC_9_R2.fastq.gz
    mv NTC-10_R1.fastq.gz NTC_10_R1.fastq.gz
    mv NTC-10_R2.fastq.gz NTC_10_R2.fastq.gz
    mv NTC-11_R1.fastq.gz NTC_11_R1.fastq.gz
    mv NTC-11_R2.fastq.gz NTC_11_R2.fastq.gz
    mv NTC-12_R1.fastq.gz NTC_12_R1.fastq.gz
    mv NTC-12_R2.fastq.gz NTC_12_R2.fastq.gz
    mv NTC-13_R1.fastq.gz NTC_13_R1.fastq.gz
    mv NTC-13_R2.fastq.gz NTC_13_R2.fastq.gz
    mv NTC-14_R1.fastq.gz NTC_14_R1.fastq.gz
    mv NTC-14_R2.fastq.gz NTC_14_R2.fastq.gz
    mv NTC-15_R1.fastq.gz NTC_15_R1.fastq.gz
    mv NTC-15_R2.fastq.gz NTC_15_R2.fastq.gz
    mv NTC-16_R1.fastq.gz NTC_16_R1.fastq.gz
    mv NTC-16_R2.fastq.gz NTC_16_R2.fastq.gz
    
    PC_1
    PC_2
    PC_3
    PC_4
    PC_5
    PC_6
    PC_7
    PC_8
    
    mv PC-1_R1.fastq.gz PC_1_R1.fastq.gz
    mv PC-1_R2.fastq.gz PC_1_R2.fastq.gz
    mv PC-2_R1.fastq.gz PC_2_R1.fastq.gz
    mv PC-2_R2.fastq.gz PC_2_R2.fastq.gz
    mv PC-3_R1.fastq.gz PC_3_R1.fastq.gz
    mv PC-3_R2.fastq.gz PC_3_R2.fastq.gz
    mv PC-4_R1.fastq.gz PC_4_R1.fastq.gz
    mv PC-4_R2.fastq.gz PC_4_R2.fastq.gz
    mv PC-5_R1.fastq.gz PC_5_R1.fastq.gz
    mv PC-5_R2.fastq.gz PC_5_R2.fastq.gz
    mv PC-6_R1.fastq.gz PC_6_R1.fastq.gz
    mv PC-6_R2.fastq.gz PC_6_R2.fastq.gz
    mv PC-7_R1.fastq.gz PC_7_R1.fastq.gz
    mv PC-7_R2.fastq.gz PC_7_R2.fastq.gz
    mv PC-8_R1.fastq.gz PC_8_R1.fastq.gz
    mv PC-8_R2.fastq.gz PC_8_R2.fastq.gz
    
    #MOVE the used fastq.gz to the directory raw_data
    #PREPARE the conf-file --> "-rw-rw-r-- 1 jhuang jhuang 9,3K Nov 20 12:22  pe-33-manifest"
    qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
    #--> "-rw-r--r-- 1 root   root   3,5G Nov 20 12:24  paired-end-demux.qza"
    
    qiime demux summarize \
    --i-data paired-end-demux.qza \
    --o-visualization demux_pe.qzv
    #--> "-rw-r--r-- 1 root   root   320K Nov 20 12:34  demux_pe.qzv"
    
    #qiime tools view demux_pe.qzv --> Error: Visualization viewing is currently not supported in headless environments. You can view Visualizations (and Artifacts) at https://view.qiime2.org, or move the Visualization to an environment with a display and view it with `qiime tools view`.
    #Open demux_pe.gzv on https://view.qiime2.org
  3. Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

    #Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
    #464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!
    
    #Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
    (qiime2-amplicon-2023.9) root@4379fea45cf7:/data# ./dada2_batch_test.sh
    
            #!/bin/bash
    
            # Set your base inputs
            INPUT=paired-end-demux.qza
            TRIM_LEFT_F=17
            TRIM_LEFT_R=21
    
            # Output base
            OUTPUT_DIR=dada2_tests
            mkdir -p $OUTPUT_DIR
    
            # Loop over trunc-len-f and trunc-len-r combinations
            # Forward: from 220 to 260
            # Reverse: from 210 to 260
            #225 220 215
            i=1
            for f in 260 255 250 245 240 235 230 225 220; do
                    for r in 260 255 250 245 240 235 230 225 220 215 210; do
                    OUT=test_${i}_f${f}_r${r}
                    echo "Running: $OUT"
                    mkdir -p $OUTPUT_DIR/$OUT
    
                    qiime dada2 denoise-paired \
                    --i-demultiplexed-seqs $INPUT \
                    --p-trim-left-f $TRIM_LEFT_F \
                    --p-trim-left-r $TRIM_LEFT_R \
                    --p-max-ee-f 3 --p-max-ee-r 5 \
                    --p-trunc-len-f $f \
                    --p-trunc-len-r $r \
                    --p-n-threads 32 \
                    --o-table $OUTPUT_DIR/$OUT/table.qza \
                    --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
                    --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
                    --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1
    
                    ((i++))
                    done
            done
    
    for f in dada2_tests2/test_*/denoising-stats.qza; do
    qiime metadata tabulate \
        --m-input-file $f \
        --o-visualization ${f%.qza}.qzv
    done
    
    #Manually convert denoising-stats.qza to denoising-stats.tsv using https://view.qiime2.org
    #Downloads to jhuang@WS-2290C:~/DATA/Data_Childrensclinic_16S_2025/dada2_tests/test_59_f235_r245$ mv ~/Downloads/data_stats.tsv .
    #NTC01  33463   1732    5.18    1723    1650    4.93    1589    4.75
    #O23091304  54831   7477    13.64   7409    7133    13.01   7101    12.95
    #NTC01  33463   1719    5.14    1711    1648    4.92    1588    4.75
    #O23091304  54831   7419    13.53   7345    7068    12.89   7046    12.85
    
    #pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165
    
    sudo chown -R jhuang:jhuang dada2_tests
    cd dada2_tests
    python3 rank_dada2_params.py
    
    # Top parameter sets by % input non-chimeric (overall):
    # test_69_f230_r250: f=230, r=250, %non-chimeric=81.75, median%=84.57, merged%=83.86, samples_nonchim=253/253
    # * (CHOOSEN) test_59_f235_r245: f=235, r=245, %non-chimeric=81.57, median%=84.46, merged%=83.79, samples_nonchim=253/253
    # test_58_f235_r250: f=235, r=250, %non-chimeric=81.49, median%=84.35, merged%=83.61, samples_nonchim=253/253
    # test_49_f240_r240: f=240, r=240, %non-chimeric=81.30, median%=84.02, merged%=83.46, samples_nonchim=253/253
    
    # test_48_f240_r245: f=240, r=245, %non-chimeric=81.21, median%=83.95, merged%=83.33, samples_nonchim=253/253
    # test_47_f240_r250: f=240, r=250, %non-chimeric=81.12, median%=83.94, merged%=83.19, samples_nonchim=253/253
    # test_39_f245_r235: f=245, r=235, %non-chimeric=80.84, median%=83.59, merged%=82.98, samples_nonchim=253/253
    # test_38_f245_r240: f=245, r=240, %non-chimeric=80.78, median%=83.52, merged%=82.91, samples_nonchim=253/253
    # test_37_f245_r245: f=245, r=245, %non-chimeric=80.72, median%=83.36, merged%=82.81, samples_nonchim=253/253
    # test_36_f245_r250: f=245, r=250, %non-chimeric=80.62, median%=83.41, merged%=82.68, samples_nonchim=253/253
  4. Visualize outputs (Using pandaseq.out, since qiime2_metadata.tsv contains the pathway of pandaseq.out)

    4.1. mkdir fastqc_out fastqc -t 4 raw_data/* -o fastqc_out/ 4.2. mkdir trim_data trimmed_unpaired cd raw_data for file in O24021602_R1.fastq.gz O24021601_R1.fastq.gz O23100705_R1.fastq.gz PC_5_R1.fastq.gz O24010402_R1.fastq.gz U23090801_R1.fastq.gz O24012402_R1.fastq.gz NTC_7_R1.fastq.gz O23121301_R1.fastq.gz O24013104_R1.fastq.gz U25020701_R1.fastq.gz O23100401_R1.fastq.gz O24020901_R1.fastq.gz NTC_1_R1.fastq.gz O24020903_R1.fastq.gz O23092002_R1.fastq.gz UR009768_R1.fastq.gz O23100402_R1.fastq.gz O23091304_R1.fastq.gz O23100703_R1.fastq.gz O23110904_R1.fastq.gz O23110903_R1.fastq.gz O23121304_R1.fastq.gz O23083101_R1.fastq.gz O23100502_R1.fastq.gz O23110301_R1.fastq.gz O24020805_R1.fastq.gz A23051102_R1.fastq.gz O23102604_R1.fastq.gz A23051103_R1.fastq.gz O23102504_R1.fastq.gz O24011205_R1.fastq.gz O23111602_R1.fastq.gz O24020905_R1.fastq.gz O23090701_R1.fastq.gz O24020702_R1.fastq.gz NTC_4_R1.fastq.gz A24072901_R1.fastq.gz NTC_10_R1.fastq.gz O23100704_R1.fastq.gz NTC_8_R1.fastq.gz O24011901_R1.fastq.gz O23111501_R1.fastq.gz O23112303_R1.fastq.gz O23112902_R1.fastq.gz O24011203_R1.fastq.gz O23092006_R1.fastq.gz O24010404_R1.fastq.gz O23090803_R1.fastq.gz O23091501_R1.fastq.gz U25022101_R1.fastq.gz O23121305_R1.fastq.gz U24080201_R1.fastq.gz U24111801_R1.fastq.gz A24040201_R1.fastq.gz O23120702_R1.fastq.gz NTC_5_R1.fastq.gz O24013105_R1.fastq.gz O23100706_R1.fastq.gz O23083102_R1.fastq.gz O23091305_R1.fastq.gz PC_7_R1.fastq.gz O24011005_R1.fastq.gz PC01_R1.fastq.gz O24011004_R1.fastq.gz O23110902_R1.fastq.gz O24020904_R1.fastq.gz A24071701_R1.fastq.gz O24021501_R1.fastq.gz O23120101_R1.fastq.gz U25011702_R1.fastq.gz O23092103_R1.fastq.gz UR009909_R1.fastq.gz O23091403_R1.fastq.gz NTC_2_R1.fastq.gz PC_1_R1.fastq.gz O24013102_R1.fastq.gz O23102704_R1.fastq.gz O24010401_R1.fastq.gz O24012501_R1.fastq.gz U24121801_R1.fastq.gz O24031304_R1.fastq.gz O23090802_R1.fastq.gz NTC_12_R1.fastq.gz O23100702_R1.fastq.gz O24011202_R1.fastq.gz O23100601_R1.fastq.gz O23100802_R1.fastq.gz O24021404_R1.fastq.gz O23112301_R1.fastq.gz O24011103_R1.fastq.gz O24030601_R1.fastq.gz U23052201_R1.fastq.gz O23100803_R1.fastq.gz O23102502_R1.fastq.gz O24011206_R1.fastq.gz O23091401_R1.fastq.gz O23092001_R1.fastq.gz O24020801_R1.fastq.gz O23082302_R1.fastq.gz O23112204_R1.fastq.gz O23100701_R1.fastq.gz O23112304_R1.fastq.gz A24031201_R1.fastq.gz O24022801_R1.fastq.gz PC_3_R1.fastq.gz O24022901_R1.fastq.gz O23092005_R1.fastq.gz O23121502_R1.fastq.gz O24021401_R1.fastq.gz U25011701_R1.fastq.gz NTC01_R1.fastq.gz O23082401_R1.fastq.gz O24022202_R1.fastq.gz NTC_6_R1.fastq.gz O24031302_R1.fastq.gz NTC_15_R1.fastq.gz NTC_13_R1.fastq.gz O24022302_R1.fastq.gz O23121303_R1.fastq.gz O24021502_R1.fastq.gz O24020704_R1.fastq.gz O23092202_R1.fastq.gz A23112002_R1.fastq.gz O23092803_R1.fastq.gz O23112203_R1.fastq.gz NTC_11_R1.fastq.gz O23091302_R1.fastq.gz A24062801_R1.fastq.gz O23121302_R1.fastq.gz O23092104_R1.fastq.gz O24020802_R1.fastq.gz O24021403_R1.fastq.gz O23112201_R1.fastq.gz O23082402_R1.fastq.gz U24070401_R1.fastq.gz O24020902_R1.fastq.gz O24030602_R1.fastq.gz NTC_3_R1.fastq.gz O24011201_R1.fastq.gz O23091301_R1.fastq.gz O23112202_R1.fastq.gz O24021503_R1.fastq.gz O23102601_R1.fastq.gz O24011002_R1.fastq.gz O23121503_R1.fastq.gz O23092004_R1.fastq.gz O23091402_R1.fastq.gz O23092701_R1.fastq.gz O24031301_R1.fastq.gz O24020703_R1.fastq.gz O24013103_R1.fastq.gz A24080201_R1.fastq.gz U23071201_R1.fastq.gz U23052401_R1.fastq.gz U23071901_R1.fastq.gz O24011101_R1.fastq.gz U23091101_R1.fastq.gz O23110204_R1.fastq.gz O23102702_R1.fastq.gz O24020101_R1.fastq.gz O24012401_R1.fastq.gz O23100501_R1.fastq.gz O24020705_R1.fastq.gz A23060602_R1.fastq.gz O23102701_R1.fastq.gz O24011001_R1.fastq.gz O24022102_R1.fastq.gz O23110302_R1.fastq.gz O23100503_R1.fastq.gz O24020803_R1.fastq.gz O23110203_R1.fastq.gz A24030402_R1.fastq.gz O23092102_R1.fastq.gz O24011003_R1.fastq.gz O23121306_R1.fastq.gz O23110101_R1.fastq.gz O24020804_R1.fastq.gz O23112901_R1.fastq.gz A23072501_R1.fastq.gz O24030604_R1.fastq.gz O23112302_R1.fastq.gz O24010301_R1.fastq.gz O24021402_R1.fastq.gz O24011104_R1.fastq.gz O23102602_R1.fastq.gz O23102703_R1.fastq.gz O23091303_R1.fastq.gz O24011204_R1.fastq.gz A24071901_R1.fastq.gz O23102603_R1.fastq.gz O23082303_R1.fastq.gz O24011801_R1.fastq.gz O24030603_R1.fastq.gz O23092802_R1.fastq.gz O23090801_R1.fastq.gz A24030401_R1.fastq.gz O23083001_R1.fastq.gz A23060601_R1.fastq.gz O23092203_R1.fastq.gz NTC_14_R1.fastq.gz O23090602_R1.fastq.gz A23111301_R1.fastq.gz O23120103_R1.fastq.gz O24011105_R1.fastq.gz U23071701_R1.fastq.gz O23111601_R1.fastq.gz O24010501_R1.fastq.gz O23110202_R1.fastq.gz O23121501_R1.fastq.gz O24010302_R1.fastq.gz O24011207_R1.fastq.gz O23092702_R1.fastq.gz O23112206_R1.fastq.gz PC_2_R1.fastq.gz O23102503_R1.fastq.gz O23092201_R1.fastq.gz NTC_9_R1.fastq.gz A23080101_R1.fastq.gz PC_6_R1.fastq.gz O23120701_R1.fastq.gz U24101801_R1.fastq.gz A23112001_R1.fastq.gz O23120104_R1.fastq.gz O23100804_R1.fastq.gz O24031305_R1.fastq.gz O23111502_R1.fastq.gz O24020103_R1.fastq.gz O23082301_R1.fastq.gz NTC_16_R1.fastq.gz PC_8_R1.fastq.gz O24022101_R1.fastq.gz O24020102_R1.fastq.gz O23081801_R1.fastq.gz O24022902_R1.fastq.gz O23112205_R1.fastq.gz O24021603_R1.fastq.gz O23102501_R1.fastq.gz O24010403_R1.fastq.gz O24012403_R1.fastq.gz O23092003_R1.fastq.gz O24013101_R1.fastq.gz O24011802_R1.fastq.gz O24011102_R1.fastq.gz PC_4_R1.fastq.gz O23100801_R1.fastq.gz O23092101_R1.fastq.gz O24020701_R1.fastq.gz O23092801_R1.fastq.gz O24022201_R1.fastq.gz O23110901_R1.fastq.gz O23090601_R1.fastq.gz O24022301_R1.fastq.gz; do java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 16 $file ${file/_R1/_R2} ../trim_data/$file ../trimmed_unpaired/$file ../trim_data/${file/_R1/_R2} ../trimmed_unpaired/${file/_R1/_R2} ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log

    4.3. mkdir pandaseq.out conda activate /home/jhuang/miniconda3/envs/qiime1 for file in trim_data/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/R2.fastq.gz} -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -w pandaseq.out/$(echo $file | cut -d’/’ -f2 | cut -d’‘ -f1-3)_merged.fasta >> LOG_pandaseq; done conda deactivate

    4.4. prepare qiime2_metadata.tsv

    4.5. run qiime feature-table summarize \ –i-table dada2_tests/test_59_f235_r245/table.qza \ –o-visualization table.qzv \ –m-sample-metadata-file qiime2_metadata.tsv

    #Table summary
    #Metric Sample
    #Number of samples  137-->96
    #Number of features 3,039-->21,893
    #Total frequency    1,641,484-->9,246,546
    #
    #Frequency per sample
    #Minimum frequency  413.0-->41,764.0
    #1st quartile   10,319.0-->100,017.5
    #Median frequency   11,530.0-->100,017.5
    #3rd quartile   13,146.0-->110,183.5
    #Maximum frequency  40,022.0-->143,563.0
    #Mean frequency 11,981.635036496351-->96,318.1875
    #
    #Frequency per feature
    #Minimum frequency  1.0
    #1st quartile   3.0
    #Median frequency   8.0-->4.0
    #3rd quartile   95.5-->14.0
    #Maximum frequency  56,472.0-->983,499.0
    #Mean frequency 540.1395195788089-->422.35
    
    #qiime tools peek dada2_tests/test_59_f235_r245/table.qza
    #qiime tools peek qiime2_metadata.tsv
    
    qiime feature-table tabulate-seqs \
    --i-data dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --o-visualization rep-seqs.qzv
    
    qiime metadata tabulate \
    --m-input-file dada2_tests/test_59_f235_r245/denoising-stats.qza \
    --o-visualization denoising-stats.qzv
  5. Import reference sequences and taxonomy (SILVA 132)

    qiime tools import \
    --type 'FeatureData[Sequence]' \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
    --output-path silva_132_99_otus.qza \
    --input-format DNAFASTAFormat
    
    qiime tools import \
    --type 'FeatureData[Taxonomy]' \
    --input-format HeaderlessTSVTaxonomyFormat \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
    --output-path silva_132_99_taxonomy.qza
  6. Assign taxonomy

    qiime feature-classifier classify-consensus-vsearch \
    --i-query  dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --i-reference-reads silva_132_99_otus.qza \
    --i-reference-taxonomy silva_132_99_taxonomy.qza \
    --p-perc-identity 0.97 \
    --p-threads 64 \
    --o-classification taxonomy.qza \
    --o-search-results search-results.qza
  7. Visualize taxonomy

    qiime taxa barplot \
    --i-table  dada2_tests/test_59_f235_r245/table.qza \
    --i-taxonomy taxonomy.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --o-visualization taxa-bar-plots.qzv
  8. Build phylogenetic tree

    qiime alignment mafft \
    --i-sequences  dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --o-alignment aligned-rep-seqs.qza
    
    qiime alignment mask \
    --i-alignment aligned-rep-seqs.qza \
    --o-masked-alignment masked-aligned-rep-seqs.qza
    
    qiime phylogeny fasttree \
    --i-alignment masked-aligned-rep-seqs.qza \
    --o-tree unrooted-tree.qza
    
    # (*) The rooted-tree is generated from unrooted-tree, and will be used in the next step!
    qiime phylogeny midpoint-root \
    --i-tree unrooted-tree.qza \
    --o-rooted-tree rooted-tree.qza
  9. Core diversity analysis

    #The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
    #All samples will be rarefied to 4,753 reads.
    #Samples with fewer reads are excluded.
    qiime diversity core-metrics-phylogenetic \
    --i-phylogeny rooted-tree.qza \
    --i-table  dada2_tests/test_59_f235_r245/table.qza \
    --p-sampling-depth 6389 \
    --m-metadata-file qiime2_metadata.tsv \
    --output-dir core_metrics_results
    
    qiime diversity alpha \
    --i-table dada2_tests/test_59_f235_r245/table.qza \
    --p-metric chao1 \
    --o-alpha-diversity core_metrics_results/chao1_vector.qza
    
    qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
    qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
    qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
    qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1
    
    qiime tools export \
    --input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
    --output-path exported_unweighted_unifrac
    qiime tools export \
    --input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --output-path exported_weighted_unifrac
    
    qiime diversity beta-group-significance \
    --i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --m-metadata-column Group \
    --p-pairwise \
    --p-method permanova \
    --o-visualization beta_group_significance.qzv
    
    qiime tools export \
    --input-path beta_group_significance.qzv \
    --output-path exported_beta_group
  10. Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

    # Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
    qiime feature-table summarize \
    --i-table core_metrics_results/rarefied_table.qza \
    --o-visualization rarefied_table.qzv \
    --m-sample-metadata-file qiime2_metadata.tsv
    
    #Table summary
    #Metric Sample
    #Number of samples  136
    #Number of features 2,781
    #Total frequency    868,904
    
    # In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
    qiime tools export \
    --input-path core_metrics_results/rarefied_table.qza \
    --output-path exported_rarefied_table
    #--> exported_rarefied_table/feature-table.biom
    
    biom convert \
    -i exported_rarefied_table/feature-table.biom \
    -o exported_rarefied_table/feature-table.tsv \
    --to-tsv
    
    #✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
    #✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).
    
    # Visulaize the taxonomy.qza
    qiime tools export \
    --input-path taxonomy.qza \
    --output-path exported-taxonomy
    
    #Feature ID    Taxon                               Confidence
    #0b4383...     k__Bacteria; p__Proteobacteria...   0.98
    #dfa833...     k__Bacteria; p__Firmicutes...       0.87
    #...
    
    # ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----
    
    #-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.
    
    qiime tools export \
    --input-path  dada2_tests/test_59_f235_r245/table.qza \
    --output-path exported_table
    #--> exported_table/feature-table.biom
    
    #-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1
    
    qiime tools export \
    --input-path rooted-tree.qza \
    --output-path exported-tree
    #--> exported-tree/tree.nwk
    
    # END
    
    # ---- The code in Phyloseq.Rmd ----
    
    #install.packages("remotes")
    #remotes::install_github("jbisanz/qiime2R")
    #"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
    library(qiime2R)
    ps.ng.tax <- qza_to_phyloseq(
        features =  "dada2_tests/test_59_f235_r245/table.qza",
        tree = "rooted-tree.qza",
        metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
    )
    # or
    #biom convert \
    #      -i ./exported_table/feature-table.biom \
    #      -o ./exported_table/feature-table-v1.biom \
    #      --to-json
    #ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")
    
    #Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.
  11. Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

    The following files can be found under server.

    ./Phyloseq.Rmd (Result Phyloseq.html)
    ./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
    ./MicrobiotaProcess_Group1_vs_Group2.R
    ./MicrobiotaProcess_Group3_vs_Group4.R
    ./MicrobiotaProcess_PCA_Group1-4.R
    ./MicrobiotaProcess_PCA_Group9-14.R
  12. Generating relative_abundance_phylum_order_family.xls Convert the Excel file for Phylum, Order, and Family relative abundances for all samples to csv file after correcting the family due to the two redundance species “=SUM(EG10+EG18)” Edit the csv file, merge then to a file.

    ~/Tools/csv2xls-0.4/csv_to_xls.py relative_abundance_phylum.csv relative_abundance_order.csv relative_abundance_family.csv -d$'\t' -o relative_abundance_phylum_order_family.xls;

Leave a Reply

Your email address will not be published. Required fields are marked *