Main workflow using QIIME2 for Data_Childrensclinic_16S_2025

  1. Install and test qiime2-docker

    #Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose
    
    docker pull quay.io/qiime2/core:2023.9
    
    docker run -it --rm \
    -v /mnt/md1/DATA/Data_Childrensclinic_16S_2025:/data \
    -v /home/jhuang/REFs:/home/jhuang/REFs \
    quay.io/qiime2/core:2023.9 bash
    cd /data
  2. Import the fastq-files to paired-end-demux.qza

    #https://docs.qiime2.org/2018.8/tutorials/importing/
    
    #Moving the following fastq.gz to raw_data_NOT_USED:
    643805249,Kinderklinik Lauf 2 Pl.B,B04,N720-B,CGGAGCCT,S503-B,TATCCTCT,nan,nan
    7909160377,Kinderkllinik Lauf 2 Pl.D,D11,N728-D,TGCAGCTA,S517-D,GCGTAAGA,nan,nan
    7909188256,Kinderklinik Lauf 2 Pl.C,C10,N712-C,GTAGAGGA,S516-C,CCTAGAGT,nan,nan
    O231000101,Kinderkllinik Lauf 2 Pl.D,F11,N728-D,TGCAGCTA,S520-D,AAGGCTAT,nan,nan
    O231000102,Kinderkllinik Lauf 2 Pl.D,C09,N726-D,CCTAAGAC,S516-D,CCTAGAGT,nan,nan
    O231000103,Kinderkllinik Lauf 2 Pl.D,C11,N728-D,TGCAGCTA,S516-D,CCTAGAGT,nan,nan
    O231000902,Kinderkllinik Lauf 2 Pl.D,C12,N729-D,TCGACGTC,S516-D,CCTAGAGT,nan,nan
    O23100201,Kinderklinik Lauf 2 Pl.B,D10,N727-B,CGATCAGT,S506-B,ACTGCATA,nan,nan
    O23100805,Kinderklinik Lauf 2 Pl.B,F10,N727-B,CGATCAGT,S508-B,CTAAGCCT,nan,nan
    O23100901,Kinderkllinik Lauf 2 Pl.D,F08,N724-D,ACTGAGCG,S520-D,AAGGCTAT,nan,nan
    O23100903,Kinderkllinik Lauf 2 Pl.D,B08,N724-D,ACTGAGCG,S515-D,TTCTAGCT,nan,nan
    O23100904,Kinderkllinik Lauf 2 Pl.D,C08,N724-D,ACTGAGCG,S516-D,CCTAGAGT,nan,nan
    O23100905,Kinderkllinik Lauf 2 Pl.D,D10,N727-D,CGATCAGT,S517-D,GCGTAAGA,nan,nan
    O23111503,Kinderkllinik Lauf 2 Pl.D,A12,N729-D,TCGACGTC,S513-D,TCGACTAG,nan,nan
    O23120902,Kinderklinik Lauf 2 Pl.C,C03,N703-C,AGGCAGAA,S516-C,CCTAGAGT,nan,nan
    O24022002,Kinderklinik Lauf 2 Pl.C,H12,N715-C,ATCTCAGG,S522-C,TTATGCGA,nan,nan
    U23071801,Kinderklinik Lauf 2 Pl.B,B02,N718-B,GGAGCTAC,S503-B,TATCCTCT,nan,nan
    
    rename -n 's/^([A-Z]\d+)_S\d+_L\d+_R([12])_001\.fastq\.gz$/$1_R$2.fastq.gz/' *.fastq.gz
    rename 's/^([A-Z]\d+)_S\d+_L\d+_R([12])_001\.fastq\.gz$/$1_R$2.fastq.gz/' *.fastq.gz
    
    #for file in *.fastq.gz; do echo "mv $file $(echo $file | cut -d'_' -f1 | cut -d'-' -f1-1)_$(echo $file | cut -d'_' -f4).fastq.gz"; done
    for file in *.fastq.gz; do echo "mv $file $(echo $file | cut -d'_' -f1)_$(echo $file | cut -d'_' -f4).fastq.gz"; done
    #MANUALLY correct several filename errors and performing in the generated commands above
    #MANUALLY correct several filename errors and performing in the generated commands below
    
    NTC_2
    NTC_3
    NTC_4
    NTC_5
    NTC_6
    NTC_7
    NTC_8
    NTC_9
    NTC_10
    NTC_11
    NTC_12
    NTC_13
    NTC_14
    NTC_15
    NTC_16
    
    mv NTC-1_R1.fastq.gz NTC_1_R1.fastq.gz
    mv NTC-1_R2.fastq.gz NTC_1_R2.fastq.gz
    mv NTC-2_R1.fastq.gz NTC_2_R1.fastq.gz
    mv NTC-2_R2.fastq.gz NTC_2_R2.fastq.gz
    mv NTC-3_R1.fastq.gz NTC_3_R1.fastq.gz
    mv NTC-3_R2.fastq.gz NTC_3_R2.fastq.gz
    mv NTC-4_R1.fastq.gz NTC_4_R1.fastq.gz
    mv NTC-4_R2.fastq.gz NTC_4_R2.fastq.gz
    mv NTC-5_R1.fastq.gz NTC_5_R1.fastq.gz
    mv NTC-5_R2.fastq.gz NTC_5_R2.fastq.gz
    mv NTC-6_R1.fastq.gz NTC_6_R1.fastq.gz
    mv NTC-6_R2.fastq.gz NTC_6_R2.fastq.gz
    mv NTC-7_R1.fastq.gz NTC_7_R1.fastq.gz
    mv NTC-7_R2.fastq.gz NTC_7_R2.fastq.gz
    mv NTC-8_R1.fastq.gz NTC_8_R1.fastq.gz
    mv NTC-8_R2.fastq.gz NTC_8_R2.fastq.gz
    mv NTC-9_R1.fastq.gz NTC_9_R1.fastq.gz
    mv NTC-9_R2.fastq.gz NTC_9_R2.fastq.gz
    mv NTC-10_R1.fastq.gz NTC_10_R1.fastq.gz
    mv NTC-10_R2.fastq.gz NTC_10_R2.fastq.gz
    mv NTC-11_R1.fastq.gz NTC_11_R1.fastq.gz
    mv NTC-11_R2.fastq.gz NTC_11_R2.fastq.gz
    mv NTC-12_R1.fastq.gz NTC_12_R1.fastq.gz
    mv NTC-12_R2.fastq.gz NTC_12_R2.fastq.gz
    mv NTC-13_R1.fastq.gz NTC_13_R1.fastq.gz
    mv NTC-13_R2.fastq.gz NTC_13_R2.fastq.gz
    mv NTC-14_R1.fastq.gz NTC_14_R1.fastq.gz
    mv NTC-14_R2.fastq.gz NTC_14_R2.fastq.gz
    mv NTC-15_R1.fastq.gz NTC_15_R1.fastq.gz
    mv NTC-15_R2.fastq.gz NTC_15_R2.fastq.gz
    mv NTC-16_R1.fastq.gz NTC_16_R1.fastq.gz
    mv NTC-16_R2.fastq.gz NTC_16_R2.fastq.gz
    
    PC_1
    PC_2
    PC_3
    PC_4
    PC_5
    PC_6
    PC_7
    PC_8
    
    mv PC-1_R1.fastq.gz PC_1_R1.fastq.gz
    mv PC-1_R2.fastq.gz PC_1_R2.fastq.gz
    mv PC-2_R1.fastq.gz PC_2_R1.fastq.gz
    mv PC-2_R2.fastq.gz PC_2_R2.fastq.gz
    mv PC-3_R1.fastq.gz PC_3_R1.fastq.gz
    mv PC-3_R2.fastq.gz PC_3_R2.fastq.gz
    mv PC-4_R1.fastq.gz PC_4_R1.fastq.gz
    mv PC-4_R2.fastq.gz PC_4_R2.fastq.gz
    mv PC-5_R1.fastq.gz PC_5_R1.fastq.gz
    mv PC-5_R2.fastq.gz PC_5_R2.fastq.gz
    mv PC-6_R1.fastq.gz PC_6_R1.fastq.gz
    mv PC-6_R2.fastq.gz PC_6_R2.fastq.gz
    mv PC-7_R1.fastq.gz PC_7_R1.fastq.gz
    mv PC-7_R2.fastq.gz PC_7_R2.fastq.gz
    mv PC-8_R1.fastq.gz PC_8_R1.fastq.gz
    mv PC-8_R2.fastq.gz PC_8_R2.fastq.gz
    
    #MOVE the used fastq.gz to the directory raw_data
    #PREPARE the conf-file --> "-rw-rw-r-- 1 jhuang jhuang 9,3K Nov 20 12:22  pe-33-manifest"
    qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
    #--> "-rw-r--r-- 1 root   root   3,5G Nov 20 12:24  paired-end-demux.qza"
    
    qiime demux summarize \
    --i-data paired-end-demux.qza \
    --o-visualization demux_pe.qzv
    #--> "-rw-r--r-- 1 root   root   320K Nov 20 12:34  demux_pe.qzv"
    
    #qiime tools view demux_pe.qzv --> Error: Visualization viewing is currently not supported in headless environments. You can view Visualizations (and Artifacts) at https://view.qiime2.org, or move the Visualization to an environment with a display and view it with `qiime tools view`.
    #Open demux_pe.gzv on https://view.qiime2.org
  3. Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

    #Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
    #464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!
    
    #Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
    (qiime2-amplicon-2023.9) root@4379fea45cf7:/data# ./dada2_batch_test.sh
    
            #!/bin/bash
    
            # Set your base inputs
            INPUT=paired-end-demux.qza
            TRIM_LEFT_F=17
            TRIM_LEFT_R=21
    
            # Output base
            OUTPUT_DIR=dada2_tests
            mkdir -p $OUTPUT_DIR
    
            # Loop over trunc-len-f and trunc-len-r combinations
            # Forward: from 220 to 260
            # Reverse: from 210 to 260
            #225 220 215
            i=1
            for f in 260 255 250 245 240 235 230 225 220; do
                    for r in 260 255 250 245 240 235 230 225 220 215 210; do
                    OUT=test_${i}_f${f}_r${r}
                    echo "Running: $OUT"
                    mkdir -p $OUTPUT_DIR/$OUT
    
                    qiime dada2 denoise-paired \
                    --i-demultiplexed-seqs $INPUT \
                    --p-trim-left-f $TRIM_LEFT_F \
                    --p-trim-left-r $TRIM_LEFT_R \
                    --p-max-ee-f 3 --p-max-ee-r 5 \
                    --p-trunc-len-f $f \
                    --p-trunc-len-r $r \
                    --p-n-threads 32 \
                    --o-table $OUTPUT_DIR/$OUT/table.qza \
                    --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
                    --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
                    --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1
    
                    ((i++))
                    done
            done
    
    for f in dada2_tests2/test_*/denoising-stats.qza; do
    qiime metadata tabulate \
        --m-input-file $f \
        --o-visualization ${f%.qza}.qzv
    done
    
    #Manually convert denoising-stats.qza to denoising-stats.tsv using https://view.qiime2.org
    #Downloads to jhuang@WS-2290C:~/DATA/Data_Childrensclinic_16S_2025/dada2_tests/test_59_f235_r245$ mv ~/Downloads/data_stats.tsv .
    #NTC01  33463   1732    5.18    1723    1650    4.93    1589    4.75
    #O23091304  54831   7477    13.64   7409    7133    13.01   7101    12.95
    #NTC01  33463   1719    5.14    1711    1648    4.92    1588    4.75
    #O23091304  54831   7419    13.53   7345    7068    12.89   7046    12.85
    
    #pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165
    
    sudo chown -R jhuang:jhuang dada2_tests
    cd dada2_tests
    python3 rank_dada2_params.py
    
    # Top parameter sets by % input non-chimeric (overall):
    # test_69_f230_r250: f=230, r=250, %non-chimeric=81.75, median%=84.57, merged%=83.86, samples_nonchim=253/253
    # * (CHOOSEN) test_59_f235_r245: f=235, r=245, %non-chimeric=81.57, median%=84.46, merged%=83.79, samples_nonchim=253/253
    # test_58_f235_r250: f=235, r=250, %non-chimeric=81.49, median%=84.35, merged%=83.61, samples_nonchim=253/253
    # test_49_f240_r240: f=240, r=240, %non-chimeric=81.30, median%=84.02, merged%=83.46, samples_nonchim=253/253
    
    # test_48_f240_r245: f=240, r=245, %non-chimeric=81.21, median%=83.95, merged%=83.33, samples_nonchim=253/253
    # test_47_f240_r250: f=240, r=250, %non-chimeric=81.12, median%=83.94, merged%=83.19, samples_nonchim=253/253
    # test_39_f245_r235: f=245, r=235, %non-chimeric=80.84, median%=83.59, merged%=82.98, samples_nonchim=253/253
    # test_38_f245_r240: f=245, r=240, %non-chimeric=80.78, median%=83.52, merged%=82.91, samples_nonchim=253/253
    # test_37_f245_r245: f=245, r=245, %non-chimeric=80.72, median%=83.36, merged%=82.81, samples_nonchim=253/253
    # test_36_f245_r250: f=245, r=250, %non-chimeric=80.62, median%=83.41, merged%=82.68, samples_nonchim=253/253
  4. Visualize outputs (Using pandaseq.out, since qiime2_metadata.tsv contains the pathway of pandaseq.out)

    4.1. mkdir fastqc_out fastqc -t 4 raw_data/* -o fastqc_out/ 4.2. mkdir trim_data trimmed_unpaired cd raw_data for file in O24021602_R1.fastq.gz O24021601_R1.fastq.gz O23100705_R1.fastq.gz PC_5_R1.fastq.gz O24010402_R1.fastq.gz U23090801_R1.fastq.gz O24012402_R1.fastq.gz NTC_7_R1.fastq.gz O23121301_R1.fastq.gz O24013104_R1.fastq.gz U25020701_R1.fastq.gz O23100401_R1.fastq.gz O24020901_R1.fastq.gz NTC_1_R1.fastq.gz O24020903_R1.fastq.gz O23092002_R1.fastq.gz UR009768_R1.fastq.gz O23100402_R1.fastq.gz O23091304_R1.fastq.gz O23100703_R1.fastq.gz O23110904_R1.fastq.gz O23110903_R1.fastq.gz O23121304_R1.fastq.gz O23083101_R1.fastq.gz O23100502_R1.fastq.gz O23110301_R1.fastq.gz O24020805_R1.fastq.gz A23051102_R1.fastq.gz O23102604_R1.fastq.gz A23051103_R1.fastq.gz O23102504_R1.fastq.gz O24011205_R1.fastq.gz O23111602_R1.fastq.gz O24020905_R1.fastq.gz O23090701_R1.fastq.gz O24020702_R1.fastq.gz NTC_4_R1.fastq.gz A24072901_R1.fastq.gz NTC_10_R1.fastq.gz O23100704_R1.fastq.gz NTC_8_R1.fastq.gz O24011901_R1.fastq.gz O23111501_R1.fastq.gz O23112303_R1.fastq.gz O23112902_R1.fastq.gz O24011203_R1.fastq.gz O23092006_R1.fastq.gz O24010404_R1.fastq.gz O23090803_R1.fastq.gz O23091501_R1.fastq.gz U25022101_R1.fastq.gz O23121305_R1.fastq.gz U24080201_R1.fastq.gz U24111801_R1.fastq.gz A24040201_R1.fastq.gz O23120702_R1.fastq.gz NTC_5_R1.fastq.gz O24013105_R1.fastq.gz O23100706_R1.fastq.gz O23083102_R1.fastq.gz O23091305_R1.fastq.gz PC_7_R1.fastq.gz O24011005_R1.fastq.gz PC01_R1.fastq.gz O24011004_R1.fastq.gz O23110902_R1.fastq.gz O24020904_R1.fastq.gz A24071701_R1.fastq.gz O24021501_R1.fastq.gz O23120101_R1.fastq.gz U25011702_R1.fastq.gz O23092103_R1.fastq.gz UR009909_R1.fastq.gz O23091403_R1.fastq.gz NTC_2_R1.fastq.gz PC_1_R1.fastq.gz O24013102_R1.fastq.gz O23102704_R1.fastq.gz O24010401_R1.fastq.gz O24012501_R1.fastq.gz U24121801_R1.fastq.gz O24031304_R1.fastq.gz O23090802_R1.fastq.gz NTC_12_R1.fastq.gz O23100702_R1.fastq.gz O24011202_R1.fastq.gz O23100601_R1.fastq.gz O23100802_R1.fastq.gz O24021404_R1.fastq.gz O23112301_R1.fastq.gz O24011103_R1.fastq.gz O24030601_R1.fastq.gz U23052201_R1.fastq.gz O23100803_R1.fastq.gz O23102502_R1.fastq.gz O24011206_R1.fastq.gz O23091401_R1.fastq.gz O23092001_R1.fastq.gz O24020801_R1.fastq.gz O23082302_R1.fastq.gz O23112204_R1.fastq.gz O23100701_R1.fastq.gz O23112304_R1.fastq.gz A24031201_R1.fastq.gz O24022801_R1.fastq.gz PC_3_R1.fastq.gz O24022901_R1.fastq.gz O23092005_R1.fastq.gz O23121502_R1.fastq.gz O24021401_R1.fastq.gz U25011701_R1.fastq.gz NTC01_R1.fastq.gz O23082401_R1.fastq.gz O24022202_R1.fastq.gz NTC_6_R1.fastq.gz O24031302_R1.fastq.gz NTC_15_R1.fastq.gz NTC_13_R1.fastq.gz O24022302_R1.fastq.gz O23121303_R1.fastq.gz O24021502_R1.fastq.gz O24020704_R1.fastq.gz O23092202_R1.fastq.gz A23112002_R1.fastq.gz O23092803_R1.fastq.gz O23112203_R1.fastq.gz NTC_11_R1.fastq.gz O23091302_R1.fastq.gz A24062801_R1.fastq.gz O23121302_R1.fastq.gz O23092104_R1.fastq.gz O24020802_R1.fastq.gz O24021403_R1.fastq.gz O23112201_R1.fastq.gz O23082402_R1.fastq.gz U24070401_R1.fastq.gz O24020902_R1.fastq.gz O24030602_R1.fastq.gz NTC_3_R1.fastq.gz O24011201_R1.fastq.gz O23091301_R1.fastq.gz O23112202_R1.fastq.gz O24021503_R1.fastq.gz O23102601_R1.fastq.gz O24011002_R1.fastq.gz O23121503_R1.fastq.gz O23092004_R1.fastq.gz O23091402_R1.fastq.gz O23092701_R1.fastq.gz O24031301_R1.fastq.gz O24020703_R1.fastq.gz O24013103_R1.fastq.gz A24080201_R1.fastq.gz U23071201_R1.fastq.gz U23052401_R1.fastq.gz U23071901_R1.fastq.gz O24011101_R1.fastq.gz U23091101_R1.fastq.gz O23110204_R1.fastq.gz O23102702_R1.fastq.gz O24020101_R1.fastq.gz O24012401_R1.fastq.gz O23100501_R1.fastq.gz O24020705_R1.fastq.gz A23060602_R1.fastq.gz O23102701_R1.fastq.gz O24011001_R1.fastq.gz O24022102_R1.fastq.gz O23110302_R1.fastq.gz O23100503_R1.fastq.gz O24020803_R1.fastq.gz O23110203_R1.fastq.gz A24030402_R1.fastq.gz O23092102_R1.fastq.gz O24011003_R1.fastq.gz O23121306_R1.fastq.gz O23110101_R1.fastq.gz O24020804_R1.fastq.gz O23112901_R1.fastq.gz A23072501_R1.fastq.gz O24030604_R1.fastq.gz O23112302_R1.fastq.gz O24010301_R1.fastq.gz O24021402_R1.fastq.gz O24011104_R1.fastq.gz O23102602_R1.fastq.gz O23102703_R1.fastq.gz O23091303_R1.fastq.gz O24011204_R1.fastq.gz A24071901_R1.fastq.gz O23102603_R1.fastq.gz O23082303_R1.fastq.gz O24011801_R1.fastq.gz O24030603_R1.fastq.gz O23092802_R1.fastq.gz O23090801_R1.fastq.gz A24030401_R1.fastq.gz O23083001_R1.fastq.gz A23060601_R1.fastq.gz O23092203_R1.fastq.gz NTC_14_R1.fastq.gz O23090602_R1.fastq.gz A23111301_R1.fastq.gz O23120103_R1.fastq.gz O24011105_R1.fastq.gz U23071701_R1.fastq.gz O23111601_R1.fastq.gz O24010501_R1.fastq.gz O23110202_R1.fastq.gz O23121501_R1.fastq.gz O24010302_R1.fastq.gz O24011207_R1.fastq.gz O23092702_R1.fastq.gz O23112206_R1.fastq.gz PC_2_R1.fastq.gz O23102503_R1.fastq.gz O23092201_R1.fastq.gz NTC_9_R1.fastq.gz A23080101_R1.fastq.gz PC_6_R1.fastq.gz O23120701_R1.fastq.gz U24101801_R1.fastq.gz A23112001_R1.fastq.gz O23120104_R1.fastq.gz O23100804_R1.fastq.gz O24031305_R1.fastq.gz O23111502_R1.fastq.gz O24020103_R1.fastq.gz O23082301_R1.fastq.gz NTC_16_R1.fastq.gz PC_8_R1.fastq.gz O24022101_R1.fastq.gz O24020102_R1.fastq.gz O23081801_R1.fastq.gz O24022902_R1.fastq.gz O23112205_R1.fastq.gz O24021603_R1.fastq.gz O23102501_R1.fastq.gz O24010403_R1.fastq.gz O24012403_R1.fastq.gz O23092003_R1.fastq.gz O24013101_R1.fastq.gz O24011802_R1.fastq.gz O24011102_R1.fastq.gz PC_4_R1.fastq.gz O23100801_R1.fastq.gz O23092101_R1.fastq.gz O24020701_R1.fastq.gz O23092801_R1.fastq.gz O24022201_R1.fastq.gz O23110901_R1.fastq.gz O23090601_R1.fastq.gz O24022301_R1.fastq.gz; do java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 16 $file ${file/_R1/_R2} ../trim_data/$file ../trimmed_unpaired/$file ../trim_data/${file/_R1/_R2} ../trimmed_unpaired/${file/_R1/_R2} ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log

    4.3. mkdir pandaseq.out conda activate /home/jhuang/miniconda3/envs/qiime1 for file in trim_data/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/R2.fastq.gz} -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -w pandaseq.out/$(echo $file | cut -d’/’ -f2 | cut -d’‘ -f1-3)_merged.fasta >> LOG_pandaseq; done conda deactivate

    4.4. prepare qiime2_metadata.tsv

    4.5. run qiime feature-table summarize \ –i-table dada2_tests/test_59_f235_r245/table.qza \ –o-visualization table.qzv \ –m-sample-metadata-file qiime2_metadata.tsv

    #Table summary
    #Metric Sample
    #Number of samples  137-->96
    #Number of features 3,039-->21,893
    #Total frequency    1,641,484-->9,246,546
    #
    #Frequency per sample
    #Minimum frequency  413.0-->41,764.0
    #1st quartile   10,319.0-->100,017.5
    #Median frequency   11,530.0-->100,017.5
    #3rd quartile   13,146.0-->110,183.5
    #Maximum frequency  40,022.0-->143,563.0
    #Mean frequency 11,981.635036496351-->96,318.1875
    #
    #Frequency per feature
    #Minimum frequency  1.0
    #1st quartile   3.0
    #Median frequency   8.0-->4.0
    #3rd quartile   95.5-->14.0
    #Maximum frequency  56,472.0-->983,499.0
    #Mean frequency 540.1395195788089-->422.35
    
    #qiime tools peek dada2_tests/test_59_f235_r245/table.qza
    #qiime tools peek qiime2_metadata.tsv
    
    qiime feature-table tabulate-seqs \
    --i-data dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --o-visualization rep-seqs.qzv
    
    qiime metadata tabulate \
    --m-input-file dada2_tests/test_59_f235_r245/denoising-stats.qza \
    --o-visualization denoising-stats.qzv
  5. Import reference sequences and taxonomy (SILVA 132)

    qiime tools import \
    --type 'FeatureData[Sequence]' \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
    --output-path silva_132_99_otus.qza \
    --input-format DNAFASTAFormat
    
    qiime tools import \
    --type 'FeatureData[Taxonomy]' \
    --input-format HeaderlessTSVTaxonomyFormat \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
    --output-path silva_132_99_taxonomy.qza
  6. Assign taxonomy

    qiime feature-classifier classify-consensus-vsearch \
    --i-query  dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --i-reference-reads silva_132_99_otus.qza \
    --i-reference-taxonomy silva_132_99_taxonomy.qza \
    --p-perc-identity 0.97 \
    --p-threads 64 \
    --o-classification taxonomy.qza \
    --o-search-results search-results.qza
  7. Visualize taxonomy

    qiime taxa barplot \
    --i-table  dada2_tests/test_59_f235_r245/table.qza \
    --i-taxonomy taxonomy.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --o-visualization taxa-bar-plots.qzv
  8. Build phylogenetic tree

    qiime alignment mafft \
    --i-sequences  dada2_tests/test_59_f235_r245/rep-seqs.qza \
    --o-alignment aligned-rep-seqs.qza
    
    qiime alignment mask \
    --i-alignment aligned-rep-seqs.qza \
    --o-masked-alignment masked-aligned-rep-seqs.qza
    
    qiime phylogeny fasttree \
    --i-alignment masked-aligned-rep-seqs.qza \
    --o-tree unrooted-tree.qza
    
    # (*) The rooted-tree is generated from unrooted-tree, and will be used in the next step!
    qiime phylogeny midpoint-root \
    --i-tree unrooted-tree.qza \
    --o-rooted-tree rooted-tree.qza
  9. Core diversity analysis

    #The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
    #All samples will be rarefied to 4,753 reads.
    #Samples with fewer reads are excluded.
    qiime diversity core-metrics-phylogenetic \
    --i-phylogeny rooted-tree.qza \
    --i-table  dada2_tests/test_59_f235_r245/table.qza \
    --p-sampling-depth 6389 \
    --m-metadata-file qiime2_metadata.tsv \
    --output-dir core_metrics_results
    
    qiime diversity alpha \
    --i-table dada2_tests/test_59_f235_r245/table.qza \
    --p-metric chao1 \
    --o-alpha-diversity core_metrics_results/chao1_vector.qza
    
    qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
    qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
    qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
    qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1
    
    qiime tools export \
    --input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
    --output-path exported_unweighted_unifrac
    qiime tools export \
    --input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --output-path exported_weighted_unifrac
    
    qiime diversity beta-group-significance \
    --i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --m-metadata-column Group \
    --p-pairwise \
    --p-method permanova \
    --o-visualization beta_group_significance.qzv
    
    qiime tools export \
    --input-path beta_group_significance.qzv \
    --output-path exported_beta_group
  10. Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

    # Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
    qiime feature-table summarize \
    --i-table core_metrics_results/rarefied_table.qza \
    --o-visualization rarefied_table.qzv \
    --m-sample-metadata-file qiime2_metadata.tsv
    
    #Table summary
    #Metric Sample
    #Number of samples  136
    #Number of features 2,781
    #Total frequency    868,904
    
    # In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
    qiime tools export \
    --input-path core_metrics_results/rarefied_table.qza \
    --output-path exported_rarefied_table
    #--> exported_rarefied_table/feature-table.biom
    
    biom convert \
    -i exported_rarefied_table/feature-table.biom \
    -o exported_rarefied_table/feature-table.tsv \
    --to-tsv
    
    #✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
    #✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).
    
    # Visulaize the taxonomy.qza
    qiime tools export \
    --input-path taxonomy.qza \
    --output-path exported-taxonomy
    
    #Feature ID    Taxon                               Confidence
    #0b4383...     k__Bacteria; p__Proteobacteria...   0.98
    #dfa833...     k__Bacteria; p__Firmicutes...       0.87
    #...
    
    # ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----
    
    #-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.
    
    qiime tools export \
    --input-path  dada2_tests/test_59_f235_r245/table.qza \
    --output-path exported_table
    #--> exported_table/feature-table.biom
    
    #-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1
    
    qiime tools export \
    --input-path rooted-tree.qza \
    --output-path exported-tree
    #--> exported-tree/tree.nwk
    
    # END
    
    # ---- The code in Phyloseq.Rmd ----
    
    #install.packages("remotes")
    #remotes::install_github("jbisanz/qiime2R")
    #"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
    library(qiime2R)
    ps.ng.tax <- qza_to_phyloseq(
        features =  "dada2_tests/test_59_f235_r245/table.qza",
        tree = "rooted-tree.qza",
        metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
    )
    # or
    #biom convert \
    #      -i ./exported_table/feature-table.biom \
    #      -o ./exported_table/feature-table-v1.biom \
    #      --to-json
    #ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")
    
    #Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.
  11. Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

    The following files can be found under server.

    ./Phyloseq.Rmd (Result Phyloseq.html)
    ./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
    ./MicrobiotaProcess_Group1_vs_Group2.R
    ./MicrobiotaProcess_Group3_vs_Group4.R
    ./MicrobiotaProcess_PCA_Group1-4.R
    ./MicrobiotaProcess_PCA_Group9-14.R
  12. Generating relative_abundance_phylum_order_family.xls Convert the Excel file for Phylum, Order, and Family relative abundances for all samples to csv file after correcting the family due to the two redundance species “=SUM(EG10+EG18)” Edit the csv file, merge then to a file.

    ~/Tools/csv2xls-0.4/csv_to_xls.py relative_abundance_phylum.csv relative_abundance_order.csv relative_abundance_family.csv -d$'\t' -o relative_abundance_phylum_order_family.xls;

Comparing MicrobiotaProcess.R to MicrobiotaProcess_Group9_10_11_small_PreFMT.R

The two scripts largely use the same MicrobiotaProcess functions; the main difference is *which `ps_` object you feed in for each task** (alpha, beta, composition plots).

What the “small” script currently does

  • Creates one MPSE (often called mpse_abund) from a taxa-filtered plotting object (e.g. ps.ng.tax_abund / your ps_abund) and then runs alpha diversity + beta diversity + composition plotting all on that same MPSE.
  • It also uses rarefied abundance (RareAbundance) as the input for some taxonomy abundance plots/heatmaps.

Implication: alpha/beta diversity are being computed on a taxa-filtered dataset, which can:

  • artificially reduce richness (Observed/Chao1),
  • shift Shannon/Simpson,
  • and change distance structure (especially for presence/absence metrics, but also sometimes Bray).

What the updated MicrobiotaProcess workflow (recommended) does

It splits the workflow into two MPSE objects, each built from the correct upstream ps_*:

1) Diversity MPSE (mpse_div)

Input: ps_filt (QC-filtered samples, full taxa set; not filtered “for plotting”)

  • Alpha diversity: do rarefaction inside MPSE (mp_rrarefy()) and compute alpha on RareAbundance.
  • Beta diversity (your current Bray+Hellinger): compute Hellinger from non-rarefied Abundance and then Bray/PCoA/PERMANOVA.

2) Plotting MPSE (mpse_plot)

Input: ps_abund_rel (taxa filtered for readability; relative abundance)

  • Use this MPSE only for composition plots (stacked bars, heatmaps).

Implication: you keep diversity analyses biologically faithful (no “plotting filter”) while still producing clean, readable composition plots.

Other (non-critical) differences

  • The updated script uses prune_samples() + prune_taxa() instead of overwriting otu_table() manually (safer / less error-prone).
  • Outputs are consistently written into a figures/ directory.

Bottom line

Same MicrobiotaProcess functions; the updated script mainly fixes the *recommended `ps_` input choice** for each analysis type (diversity vs. plotting).

Phyloseq objects used in the workflow (ps_*)

!!!!! Good candidate for the workshop for the clinicians !!!!!

“First, I generate ps_rarefied from ps_filt. Then, for cleaner composition plots, I create ps_abund / ps_abund_rel by filtering taxa for plotting (e.g., keeping taxa with mean relative abundance > 0.1%; resulting in ~95 taxa across 239 samples). Is it correct to compute alpha diversity and beta diversity using ps_abund (the taxa-filtered object)?”

Not really. ps_abund / ps_abund_rel (taxa filtered “for plotting”) is generally not the right object for alpha- or beta-diversity, unless you explicitly want diversity calculated only within that filtered subset. Luckily, I directly using alpha- and beta diversity from qiime2-result and MicrobiotaProcess –> which means we should feed MicrobiotaProcess with ps_filt (not rarefied) and for differential analysis also using ps_filt (not rarefied)

For that sentence, I mean these specific objects:

  • For Jaccard / unweighted UniFrac (presence–absence / detection-sensitive):
    ✅ compute beta diversity on ps_rarefied
    = the rarefied-count phyloseq object created from ps_filt (with the full taxon set, i.e., no plotting-only taxon filter).

  • For Bray–Curtis (abundance-based):
    ✅ compute beta diversity on a non-rarefied transformed object, typically:

    • ps_rel = relative abundances computed from ps_filt (not rarefied), or
    • a Hellinger-transformed version derived from ps_filt (if that is your chosen workflow).
  • For CLR / Aitchison distance:
    ✅ compute beta diversity on a CLR-transformed version of the non-rarefied counts from ps_filt
    (often named ps_clr / otu_clr, created from ps_filt + pseudocount), not on ps_abund.

The “plotting-filtered object” I’m warning against is:

  • ps_abund / ps_abund_rel = taxa filtered for visualization (e.g., mean relative abundance > 0.1%).

Why it’s not “correct” for diversity

Filtering taxa by mean relative abundance (e.g., >0.1%) removes many low-abundance / low-prevalence taxa and changes the community profile.

Alpha diversity

  • Observed richness will be artificially lower (because taxa were removed).
  • Shannon/Simpson can also change because you altered the abundance distribution. ✅ Therefore, compute alpha diversity on ps_rarefied (unfiltered taxa), not on ps_abund.

Beta diversity

This depends on the distance metric, but heavy taxon filtering can distort results:

  • Presence/absence distances (e.g., Jaccard, unweighted UniFrac) are very sensitive to removing taxa → filtering can strongly change distances and clustering.
  • Abundance-weighted distances (e.g., Bray–Curtis, weighted UniFrac) can also shift after filtering; mild filters (e.g., remove singletons) may be acceptable, but mean rel. abundance >0.1% can be quite aggressive (95 taxa suggests strong pruning). ✅ Best practice: compute beta diversity on ps_rarefied (for Jaccard/unweighted UniFrac) or on non-rarefied transformed data (for Bray/CLR workflows), not on a plotting-filtered object.

Recommended workflow (clean separation of purposes)

For analysis (diversity)

  • Alpha diversity: ps_rarefied (derived from ps_filt)
  • Beta diversity:
    • Jaccard / unweighted UniFrac: ps_rarefied
    • Bray–Curtis: usually from ps_rel (or Hellinger), often without rarefaction
    • Aitchison/CLR: CLR-transformed (non-rarefied) data (no rarefaction)

For visualization (composition plots only)

  • Optional cleaner composition plots: ps_abund / ps_abund_rel derived from ps_rel (taxa filtered to make stacked bars/heatmaps readable)

When would ps_abund be acceptable for beta diversity?

Only if your question is explicitly:

“How do samples differ considering only dominant taxa (≥0.1% mean abundance)?”

Then it’s a valid different analysis, but you should label it clearly as “dominant-taxa-only beta diversity.”

If you tell me which beta distance(s) you use (Bray, Jaccard, weighted/unweighted UniFrac, CLR/Aitchison), I can recommend the best object + transformation for your exact setup.


The correct answer is: beta diversity does not universally require rarefaction.

Why this confusion happens

Historically, many microbiome pipelines used rarefied counts for beta diversity to “standardize depth.” That’s why you often see “beta diversity → rarefy” in older workflows. But in modern practice, rarefaction for beta diversity is optional and sometimes not recommended, because it discards data and adds randomness.

When rarefaction is a good idea for beta diversity

Use ps_rarefied when your beta metric is very sensitive to library size, especially:

  • Presence/absence distances: Jaccard, unweighted UniFrac These are strongly influenced by detection of rare taxa, which depends on depth. Rarefaction is a common way to make them comparable.
  • If your field/reviewer expectations specifically require rarefied beta diversity for certain metrics.

When rarefaction is not necessary (often better avoided)

Prefer non-rarefied approaches when you use:

  • Bray–Curtis on relative abundance (or Hellinger-transformed data) Rarefaction can reduce power and add noise; relative abundance already removes the “sum” effect (though compositionality remains).
  • Aitchison distance / PCA via CLR transform (Euclidean on CLR) This is a standard compositional approach and typically does not use rarefaction.
  • DESeq2 VST / rlog for ordination-like PCA (count-model-based normalization) Again: no rarefaction.

Important note about your original code

Doing rarefaction → compositional (“relative abundance”) is usually redundant for beta diversity/ordination, because after rarefaction every sample has the same total, and compositional transformation just divides by that constant.

Corrected “rule of thumb”

  • Alpha diversity: ps_rarefied ✅ (common)
  • Beta diversity:
    • Unweighted UniFrac / Jaccard: ps_rarefied ✅ (often recommended)
    • Bray–Curtis / ordination on abundances: ps_rel or Hellinger ✅ (rarefaction optional)
    • Aitchison (CLR): CLR-transformed (non-rarefied) ✅ (no rarefaction)

If you tell me exactly which beta-diversity distance(s) you compute (Bray–Curtis? weighted/unweighted UniFrac? Jaccard? Aitchison/CLR?), I can recommend the cleanest “use this object + this transformation” setup.


Doing rarefaction and then compositional normalization (relative abundance) is often redundant.

Short answer: Don’t need both — rarefaction + compositional normalization is redundant in most cases. Doing rarefaction and then compositional normalization (relative abundance) is often redundant.

# RAREFACTION
set.seed(9242)  # This will help in reproducing the filtering and nomalisation.
ps.ng.tax <- rarefy_even_depth(ps.ng.tax, sample.size = 6389)
total <- 6389

# NORMALIZE number of reads in each sample using median sequencing depth.
total = median(sample_sums(ps.ng.tax))
#> total
#[1] 42369
standf = function(x, t=total) round(t * (x / sum(x)))
ps.ng.tax = transform_sample_counts(ps.ng.tax, standf)
ps.ng.tax_rel <- microbiome::transform(ps.ng.tax, "compositional")

saveRDS(ps.ng.tax, "./ps.ng.tax.rds")

What happens mathematically

1) Rarefaction forces every sample to the same library size (e.g., 6,389 reads). 2) Compositional normalization divides each sample by its total so it sums to 1.

After rarefaction, every sample has the same total, so compositional normalization is essentially: relative_abundance = rarefied_counts / constant_depth. That step is valid, but typically unnecessary.

Practical implication

  • If your goal is relative abundance–based visualization or many beta-diversity analyses, you can usually skip rarefaction and compute relative abundances (or other transformations) on the full data.
  • If your goal is alpha diversity, rarefaction is commonly used, but then you typically keep that rarefied object for alpha diversity only (not as a general-purpose normalized dataset).

So: rarefaction alone can be fine for specific tasks (especially alpha diversity), but rarefaction + compositional normalization together is usually not needed.


Mapping new ps_* objects to the old ps.ng.tax* objects (and overwrite notes)

ps_* object What it contains Old R-script equivalent (ps.ng.tax*) Overwrite notes for old object(s) Taxonomic composition Alpha diversity Beta diversity DESeq2 differential abundance
ps_raw Raw imported phyloseq object (integer counts; as imported) (no direct equivalent) ⚠️ only if you also apply sample filtering first
ps_base ps_raw + taxonomy + sample metadata aligned (still raw counts) (closest to) ps.ng.tax before overwrite Old ps.ng.tax BEFORE overwrite: integer count table (absolute abundance). ⚠️ only if you also apply sample filtering first
ps_pruned Optional subset of ps_base (e.g., remove unwanted samples by ID/pattern); still raw counts (subset of) ps.ng.tax before overwrite Old ps.ng.tax BEFORE overwrite: same as above, but without any sample subsetting unless you did it. ⚠️ only if you also apply low-depth filtering
ps_filt Filtered samples (low-depth samples removed) + taxa with nonzero totals; absolute counts ps.ng.tax before overwrite (but with explicit low-depth filtering) Old ps.ng.tax BEFORE overwrite: raw integer counts; new ps_filt is the “cleaned” version after sample-depth QC. ✅ as a starting point (but plot on ps_rel) ✅ as input to rarefaction ✅ as input to rarefaction ✅ as input to ps_deseq
ps_rel Relative abundance (compositional) computed from ps_filt ps.ng.tax_rel (conceptually) In the old script, ps.ng.tax_rel is “relative abundance of ps.ng.tax”, but its meaning depends on whether it was computed before or after ps.ng.tax was overwritten. ✅ primary
ps_abund Absolute counts after “plotting taxa filter” (e.g., mean rel. abundance > 0.1%), derived from ps_filt via ps_rel ps.ng.tax_abund Old ps.ng.tax_abund was created by filtering taxa using mean relative abundance (> 0.001) and then pruning counts. ✅ (if you want cleaner plots) (not recommended)
ps_abund_rel Relative abundance computed from ps_abund (filtered taxa set) ps.ng.tax_abund_rel Old ps.ng.tax_abund_rel was relative abundance of the filtered-taxa object. ✅ (clean composition plots)
ps_rarefied Rarefied counts from ps_filt (even depth) ps.ng.tax after overwrite Old ps.ng.tax was overwritten 1× by rarefy_even_depth(ps.ng.tax, sample.size = 41764) → after this, ps.ng.tax no longer meant raw counts; it meant rarefied counts. ✅ primary ✅ primary
ps_deseq Non-rarefied integer counts from ps_filt + optional count-based taxon prefilter (e.g., total ≥ 10) (no direct equivalent) Old ps.ng.tax_abund is not a good DESeq2 analogue because it used a mean-relative-abundance filter; ps_deseq uses count-based prefiltering (optional) and keeps integer counts. ✅ primary

Overwrite summary (old script):

  • ps.ng.tax was overwritten 1 time:
    • Before overwrite: ps.ng.tax = absolute abundance (raw integer counts).
    • After overwrite: ps.ng.tax = rarefied counts, produced by rarefy_even_depth(ps.ng.tax, sample.size = 41764).
ps_* object What it contains Taxonomic composition Alpha diversity Beta diversity DESeq2 differential abundance
ps_raw Raw imported phyloseq object (integer counts; as imported) ❌ (not recommended directly) ⚠️ only if you also apply sample filtering first
ps_base ps_raw + taxonomy + sample metadata aligned (still raw counts) ⚠️ only if you also apply sample filtering first
ps_pruned Optional subset of ps_base (e.g., remove unwanted samples by ID/pattern); still raw counts ⚠️ only if you also apply low-depth filtering
ps_filt Filtered samples (low-depth samples removed) + taxa with nonzero totals; absolute counts ✅ as a starting point (but plot on ps_rel) ✅ as the input to rarefaction ✅ as the input to rarefaction ✅ as the input to ps_deseq
ps_rel Relative abundance (compositional) computed from ps_filt ✅ primary
ps_abund Absolute counts after “plotting taxa filter” (e.g., mean rel. abundance > 0.1%), derived from ps_filt via ps_rel ✅ (if you want cleaner plots) ❌ (not recommended)
ps_abund_rel Relative abundance computed from ps_abund (filtered taxa set) ✅ (clean composition plots)
ps_rarefied Rarefied counts from ps_filt (even depth) ✅ primary ✅ primary
ps_deseq Non-rarefied integer counts from ps_filt + optional count-based taxon prefilter (e.g., total ≥ 10) ✅ primary

Why the ΔadeIJ Evidence Chain Is Not Fully “Closed”

efflux_logica_table

这里说“证据链没有闭环”,不是否定结果,而是从审稿人最严格的因果标准来看,ΔadeIJ 这条线目前更像“相关性很强 + 合理推断”,但还缺少几步能把“推断”锁死成“因果”的关键证据。

1) 目前已有的证据(强相关)

  • 表型:ΔadeIJ 的 ROS 更高,对 Cu²⁺ / SNP 更敏感
  • 转录组:ΔadeIJ 上调金属外排/解毒/应激相关基因

因此很自然会推断:

AdeIJ 参与金属/氧化应激稳态(redox & metal homeostasis)

这属于 “一致性证据(consistent with)”,说服力已经很不错。

2) 为什么说“还没闭环”:少了把“推断 → 因果”钉死的步骤

审稿人可能会问:
“你怎么证明是 缺失 AdeIJ 导致金属/ROS 失衡,而不是其他原因?”

通常闭环至少需要补上一类证据(不一定要做,但逻辑上缺):

A. 互补实验(complementation / rescue)

理想闭环:

  • ΔadeIJ 表型变差
  • 把 adeIJ(或 adeIJK)补回去 → ROS/Cu²⁺/SNP 表型恢复到 WT(rescue)

没有这一步,审稿人可能会担心:

  • 极性效应(polar effect)
  • 二次突变(secondary mutation)
  • 背景差异(background effects)

B. 直接测量“因果中间量”(mechanistic intermediate)

你们推断的是“金属/毒性底物积累 → ROS 上升 → Cu²⁺/SNP 敏感”。
但目前缺少对“中间量”的直接测量,例如:

  • 细胞内 铜含量 是否在 ΔadeIJ 增加(ICP-MS/比色法等)?
  • 是否有更明确的蛋白氧化损伤/Fe–S 破坏指标?
  • 是否存在呼吸链/膜电位异常导致 ROS 增加?

目前是“结果(ROS 高)+ 反应(相关基因上调)”,但还没直接证明“金属积累/底物积累”这一环节。

C. 排除替代解释(alternative explanations)

ΔadeIJ 的高 ROS 也可能来自:

  • 代谢状态改变导致呼吸链泄漏增加
  • 生长状态差异影响 ROS 测定
  • Cu²⁺ 敏感来自包膜改变而非金属外排不足

如果未排除,结论更适合写成 consistent with,而非 crucial determinant

3) “闭环”长什么样(最理想的因果链)

AdeIJ 缺失
细胞内金属/毒性底物积累(直接测到)
ROS 上升(你们已测到)
Cu²⁺/SNP 敏感(你们已测到)
补回 adeIJ 后表型恢复(rescue)

这就是“证据链闭环”。

4) 对写作的建议(不一定要加实验)

投稿时完全可以不补实验,但建议在文字上更稳健:

  • 避免:“AdeIJ is crucial/essential for maintaining …”
  • 推荐:
    • “These data support / are consistent with a role for AdeIJ in …”
    • “We suggest AdeIJ contributes to …”
    • “Further work (e.g., complementation or intracellular metal quantification) would be needed to establish causality.”

Arc 在海马与前额叶发育中的作用对比:空间学习关键期 vs 精神分裂症相关网络异常

方面 PNAS 2018:海马、空间学习与“关键期” J Neurosci 2019:前额叶、精神分裂症相关表型
研究脑区 海马 (Hippocampus) 前额叶皮层 (PFC)
核心问题 海马空间学习是否存在依赖 Arc 的发育关键期? Arc 缺失是否会导致精神分裂症样行为和 PFC 功能异常?
主要方法 – 不同时间窗 Arc KO 小鼠 (常规/早期 P7+/晚期 P21+)

– Morris 水迷宫、情境恐惧记忆
– 海马 LFP (theta, gamma, ripple)
– 同样 KO 小鼠
– PFC LFP (theta, gamma) + 切片电生理 (E/I 平衡、网络增益)
– 社交、工作记忆 (Y 迷宫)、PPI、开放场、安非他命反应、癫痫易感性等
主要结论 – Arc 在出生后首月海马高表达,形成发育关键期

– 早期敲除永久损害网络振荡及成年空间学习
– 晚期敲除不影响学习但仍需 Arc 存储长期记忆
一句话:Arc 决定海马空间学习网络的“发育窗口”
– 早期/全身 KO 削弱 PFC 振荡及突触功能 (网络“长歪”)
– 但无典型精神分裂症行为缺陷 (社交、工作记忆、PPI、多巴胺正常)
– Arc 删除扰乱 PFC 网络但不足以产生精神分裂症样行为
一句话:Arc 缺失单独不足以造成精神分裂症
结论导向 Arc 的必要性 + 关键期:特定发育阶段对海马网络成熟关键 Arc 删除为不充分条件:需 Arc 失调 + 其他因素组合
方法侧重 行为 + 海马 LFP (theta/gamma/ripple) PFC LFP + 切片电生理 + 全面精神分裂症行为/多巴胺/癫痫测试

Follow-up questions regarding Data_Karoline_16S_2025_cagefilter and Data_Marius_16S (Dec 2025)

  1. 针对第 9、10、11 组以及混合的 pre-FMT 组的 PCA 和多样性分析
    筛选条件:Group 9(J1–4, J10, J11)、Group 10(K1–K6)、Group 11(L2–L6)以及 pre-FMT(G1–G6, H1–H6, I1–I6)
    1.1 这些组之间在 α 多样性或 β 多样性方面是否存在显著差异?你能否为此生成一个 PCoA 图?
    1.2 在仅比较第 9、10 和 11 组时,α 多样性或 β 多样性之间是否存在显著差异?你能否也为此生成一个 PCoA 图?
    1.3 第 9 组与第 10 组之间有哪些差异表达的基因(DEGs)?
    1.4 第 9 组与第 10 组之间是否存在差异富集/差异表达的通路?
    1.5 你是否可以使用 R 绘制一棵系统聚类树(树状图),包含数据集中检测到的所有细菌科(bacterial families)?我附了一篇论文作为示例(Figure 1C)。

  2. Group 3 与 Group 4 之间的通路分析
    筛选条件:Group 3:C1–C7;Group 4:E1–E10。
    是否可以请你对 Group 3 vs Group 4 做一次通路分析?

  3. 2022 年数据集(第一组筛选条件)
    筛选条件:Group 1:1, 2, 7, 6;Group 5:29, 30, 31。
    这些组之间在 α 多样性或 β 多样性方面是否存在显著差异?你能否为此生成一个 PCoA 图?
    有哪些 DEGs?
    是否存在差异富集/差异表达的通路?

  4. 2022 年数据集(第二组筛选条件)
    筛选条件:Group 1:1, 2, 7, 6;Group 5:29, 30, 31;Group 2(不加筛选);Group 6(不加筛选)。
    这些组之间在 α 多样性或 β 多样性方面是否存在显著差异?你能否为此生成一个 PCoA 图?

  5. 微生物组分析的方法部分
    我们已经写好了材料和方法部分,其中包含微生物组分析的内容,我已将这一部分附在邮件中。你有没有什么意见?特别是你认为是否还需要补充描述 QC(质控)分析的内容?

    Microbiome analysis(微生物组分析)
    在所示时间点,于上午 9–10 点之间采集小鼠粪便样本,并立即在 -80°C 冻存直至使用。DNA 提取使用 QIAamp Fast DNA Stool Mini Kit(Qiagen, #51604),并根据已发表的小鼠粪便专用方案进行操作⁸。简而言之,粪便颗粒在 Precellys® 24 匀浆器(Bertin Technologies)中、使用 Soil grinding tubes(Bertin Technologies, #SK38)进行均质处理,随后按照试剂盒说明书提取 DNA。通过 Nanodrop 测定 DNA 产量,并将浓度调至 10 ng/µl。16S rRNA 扩增子(V3–V4 区域)采用带有 Illumina 接头通用序列的简并引物进行扩增:前向引物 F(5′- TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG-3′),反向引物 R(5′-GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGG-TATCTAATCC-3′),具体方法参考已发表的方案⁹,¹⁰。随后使用 Illumina Nextera XT Index Kit(Illumina, #FC-131-1001)对样本进行多重上样,构建带条形码的文库。文库在 MiSeq 平台(Illumina, #MS-102-2003)上进行 500PE 测序。

    QC
    不同组之间微生物群落组成的差异使用 PERMANOVA(置换多元方差分析)进行评估,基于 R 包 vegan 中的 adonis 函数,设置 9999 次置换。该分析基于 Bray–Curtis 距离矩阵,该矩阵综合考虑了物种的有无以及各物种在每个样本中的相对丰度。该方法用于评估不同组之间整体群落组成是否存在显著差异。
    群体间的差异丰度分析使用 R 包 DESeq2 完成,该方法通过负二项式广义线性模型对计数数据进行建模¹¹。在气泡图中,x 轴表示相对丰度的 log₂ 倍数变化,每个气泡代表一个 OTU,气泡颜色表示其所属的细菌目(bacterial order),气泡大小表示经过 Benjamini–Hochberg 校正后的调整 p 值。

  6. 关于 PCoA 和 PCA 的问题
    我们目前还不是完全确定你所做的分析是主坐标分析(PCoA)还是主成分分析(PCA)。你能否就此再给一些说明或评论?


  1. 第 9、10、11 组及混合 pre-FMT 组的 PCA 和多样性分析
    你之前已经向我们展示过,这些组在 α 多样性和 β 多样性方面存在显著差异。我们能否据此得出结论:每一组在 α 和 β 多样性上都与所有其他组显著不同?如果不能,是否可以做一个事后检验(例如 Bonferroni 校正的 post-hoc 分析),来查看哪些组在 α 或 β 多样性上存在显著差异?在这里,我们主要感兴趣的比较是:9 vs 10,9 vs 11,10 vs 11。

  2. 通路分析:第 9 组 vs 第 10 组
    是否可以预先筛选我们感兴趣的通路?在这里,我们特别想关注参与短链脂肪酸(SCFAs)生成的相关通路。这个在分析上可行吗?你对这样的做法有什么看法或建议?

  3. 2022 年数据集:第 1、2、5 和 6 组
    在第 1 组和第 5 组之间,是否能检测到任何差异表达基因(DEGs)?
    你之前已经向我们展示过,第 1、2、5 和 6 组之间在 β 多样性上存在显著差异。结合我在第一个问题中的疑问:这些组在 β 多样性上是否彼此之间都显著不同,还是说我们需要进行事后分析(post-hoc)来判断具体是哪些组之间存在显著差异?在这里,我们主要关心的比较是 1 vs 5 和 2 vs 6。你能帮我们一起解读这些结果吗?

Hamburg State Schools Abitur 2025: Average Grades Ranked

The table lists Hamburg state schools’ 2025 Abitur average grades and graduate counts, now with an index column as first column.1

# School (English [German]) Avg. grade Number of graduates
1 Oberalster Grammar School [Gymnasium Oberalster] 1.83 86
2 Johanneum Academic School [Gelehrtenschule des Johanneums] 1.91 104
3 Christianeum [Christianeum] 1.93 108
4 Second Educational Path Campus (former Hansa-Kolleg) [Campus Zweiter Bildungsweg (eh. Hansa-Kolleg)] 1.94 14
5 Buckhorn Grammar School [Gymnasium Buckhorn] 1.95 122
6 Helene-Lange Grammar School [Helene-Lange-Gymnasium] 1.96 123
7 Altona Grammar School [Gymnasium Altona] 2.00 109
8 Albert Schweitzer Grammar School [Albert-Schweitzer-Gymnasium] 2.05 109
9 Othmarschen Grammar School [Gymnasium Othmarschen] 2.05 88
10 Walddörfer Grammar School [Walddörfer-Gymnasium] 2.07 155
11 Ohlstedt Grammar School [Gymnasium Ohlstedt] 2.10 46
12 Rissen Grammar School [Gymnasium Rissen] 2.10 82
13 Blankenese Grammar School [Gymnasium Blankenese] 2.11 115
14 Hochrad Grammar School [Gymnasium Hochrad] 2.12 147
15 Winterhude Community School [Stadtteilschule Winterhude] 2.14 69
16 Alstertal Grammar School [Gymnasium Alstertal] 2.15 60
17 Emilie-Wüstenfeld Grammar School [Emilie-Wüstenfeld-Gymnasium] 2.16 121
18 Friedrich-Ebert Grammar School [Friedrich-Ebert-Gymnasium] 2.17 63
19 Grootmoor Grammar School [Gymnasium Grootmoor] 2.19 118
20 Lerchenfeld Grammar School [Gymnasium Lerchenfeld] 2.19 118
21 Alter Teichweg Community School [Grund- und Stadtteilschule Alter Teichweg] 2.20 125
22 Hummelsbüttel Grammar School [Gymnasium Hummelsbüttel] 2.20 74
23 Eppendorf Grammar School [Gymnasium Eppendorf] 2.21 91
24 Kaiser-Friedrich-Ufer Grammar School [Gymnasium Kaiser-Friedrich-Ufer] 2.21 95
25 Marion Dönhoff Grammar School [Marion Dönhoff Gymnasium] 2.21 75
26 Matthias-Claudius Grammar School [Matthias-Claudius-Gymnasium] 2.22 107
27 Bergedorf Community School [Stadtteilschule Bergedorf] 2.23 80
28 Hansa Grammar School Bergedorf [Hansa-Gymnasium Bergedorf] 2.24 104
29 Bornbrook Grammar School [Gymnasium Bornbrook] 2.25 72
30 Ohmoor Grammar School [Gymnasium Ohmoor] 2.25 156
31 Wilhelm Grammar School [Wilhelm-Gymnasium] 2.25 64
32 Klosterschule Grammar School [Gymnasium Klosterschule] 2.27 111
33 Lohbrügge Grammar School [Gymnasium Lohbrügge] 2.28 105
34 Meiendorf Grammar School [Gymnasium Meiendorf] 2.28 84
35 Carl-von-Ossietzky Grammar School [Carl-von-Ossietzky-Gymnasium] 2.29 92
36 Max Brauer School [Max-Brauer-Schule] 2.29 76
37 Vocational School for Plant and Construction Engineering at Inselpark [Berufliche Schule Anlagen- und Konstruktionstechnik am Inselpark] 2.32 13
38 Rahlstedt Grammar School [Gymnasium Rahlstedt] 2.32 110
39 Allee Grammar School [Gymnasium Allee] 2.33 107
40 Bondenwald Grammar School [Gymnasium Bondenwald] 2.33 93
41 Osterbek Grammar School [Gymnasium Osterbek] 2.33 45
42 Alexander-von-Humboldt Grammar School [Alexander-von-Humboldt-Gymnasium] 2.34 67
43 Vocational School Am Lämmermarkt [Berufliche Schule Am Lämmermarkt] 2.34 40
44 Heidberg Grammar School [Gymnasium Heidberg] 2.35 77
45 Albrecht-Thaer Grammar School [Albrecht-Thaer-Gymnasium] 2.37 92
46 Vocational School for Banking, Insurance and Law with Vocational Grammar School St. Pauli [Berufliche Schule für Banken, Versicherungen und Recht mit Beruflichem Gymnasium St. Pauli] 2.38 33
47 Corveystraße Grammar School [Gymnasium Corveystraße] 2.39 118
48 Finkenwerder Grammar School [Gymnasium Finkenwerder] 2.39 33
49 Blankenese Community School [Stadtteilschule Blankenese] 2.40 109
50 Erich Kästner School [Erich Kästner Schule] 2.41 57
51 Farmsen Grammar School [Gymnasium Farmsen] 2.41 62
52 Walddörfer Community School [Stadtteilschule Walddörfer] 2.41 61
53 Allermöhe Grammar School [Gymnasium Allermöhe] 2.42 61
54 Oldenfelde Grammar School [Gymnasium Oldenfelde] 2.42 73
55 Julius-Leber School [Julius-Leber-Schule] 2.42 112
56 Dörpsweg Grammar School [Gymnasium Dörpsweg] 2.43 80
57 Margaretha-Rothe Grammar School [Margaretha-Rothe-Gymnasium] 2.44 64
58 Meiendorf Community School [Stadtteilschule Meiendorf] 2.45 44
59 Gyula Trebitsch School Tonndorf (G8) [Gyula Trebitsch Schule Tonndorf (G8)] 2.45 24
60 Struensee Grammar School [Struensee Gymnasium] 2.47 74
61 Goethe School Harburg [Goethe-Schule-Harburg] 2.48 123
62 Heinrich-Hertz School (G8) [Heinrich-Hertz-Schule (G8)] 2.48 38
63 Louise Weiss Grammar School [Louise Weiss Gymnasium] 2.48 59
64 Nelson Mandela School in Kirchdorf [Nelson-Mandela-Schule im Stadtteil Kirchdorf] 2.49 59
65 Kurt-Körber Grammar School [Kurt-Körber-Gymnasium] 2.50 43
66 Poppenbüttel Community School [Stadtteilschule Poppenbüttel] 2.50 52
67 Lise-Meitner Grammar School [Lise-Meitner-Gymnasium] 2.53 83
68 Altona Community School [Stadtteilschule Altona] 2.54 38
69 Bergedorf Community School (dual-qualifying track) [Stadtteilschule Bergedorf (doppelt qualifizierender Bildungsgang)] 2.55 11
70 City Nord Vocational School [Berufliche Schule City Nord] 2.56 26
71 Hamburg-Harburg Vocational School [Berufliche Schule Hamburg-Harburg] 2.56 11
72 Ida Ehre School [Ida Ehre Schule] 2.57 86
73 Goethe Grammar School [Goethe-Gymnasium] 2.58 76
74 Charlotte-Paulsen Grammar School [Charlotte-Paulsen-Gymnasium] 2.59 62
75 Fritz-Schumacher School [Fritz-Schumacher-Schule] 2.59 46
76 Am Heidberg Community School [Stadtteilschule Am Heidberg] 2.59 60
77 Vocational School for Social Pedagogy – Anna Warburg School [Berufliche Schule für Sozialpädagogik – Anna-Warburg-Schule] 2.60 25
78 Eppendorf Community School [Grund- und Stadtteilschule Eppendorf] 2.60 55
79 Helmut Schmidt Grammar School [Helmut-Schmidt-Gymnasium] 2.60 83
80 Am Hafen Community School [Stadtteilschule Am Hafen] 2.60 66
81 Fischbek-Falkenberg Community School [Stadtteilschule Fischbek-Falkenberg] 2.60 42
82 Second Educational Path Campus [Campus Zweiter Bildungsweg] 2.61 58
83 Marienthal Grammar School [Gymnasium Marienthal] 2.63 67
84 Lessing Community School [Lessing-Stadtteilschule] 2.64 89
85 Horn Community School [Stadtteilschule Horn] 2.64 64
86 Lohbrügge Community School [Stadtteilschule Lohbrügge] 2.64 80
87 Niendorf Community School [Stadtteilschule Niendorf] 2.64 53
88 Mümmelmannsberg Community School [Stadtteilschule Mümmelmannsberg] 2.66 48
89 Oldenfelde Community School [Stadtteilschule Oldenfelde] 2.67 35
90 Irena Sendler School [Irena-Sendler-Schule] 2.68 89
91 Hamburg-Mitte Community School [Stadtteilschule Hamburg-Mitte] 2.68 45
92 Max-Schmeling Community School [Max-Schmeling-Stadtteilschule] 2.70 37
93 Esther Bejarano School [Esther Bejarano Schule] 2.70 52
94 Flottbek Community School [Stadtteilschule Flottbek] 2.70 13
95 Wilhelmsburg Community School [Stadtteilschule Wilhelmsburg (OSW)] 2.71 18
96 Elisabeth-Lange School [Elisabeth-Lange-Schule] 2.73 33
97 Helmuth Hübener Community School [Stadtteilschule Helmuth Hübener] 2.73 59
98 Richard-Linde-Weg Community School [Stadtteilschule Richard-Linde-Weg] 2.73 39
99 Otto Hahn School [Otto-Hahn-Schule] 2.77 62
100 Stübenhofer Weg Community School (OSW) [Stadtteilschule Stübenhofer Weg (OSW)] 2.77 11
101 Heinrich-Hertz School (G9) [Heinrich-Hertz-Schule (G9)] 2.76 74
102 Emil Krause School [Emil Krause Schule] 2.80 61
103 Gyula Trebitsch School Tonndorf (G9) [Gyula Trebitsch Schule Tonndorf (G9)] 2.80 67
104 Geschwister-Scholl Community School [Geschwister-Scholl-Stadtteilschule] 2.85 33
105 Lurup Community School [Stadtteilschule Lurup] 2.85 55
106 Gretel Bergmann School [Gretel-Bergmann-Schule] 2.90 35
107 Bramfeld Community School [Stadtteilschule Bramfeld] 2.92 25
108 Finkenwerder Community School [Stadtteilschule Finkenwerder] 2.97 16
109 Öjendorf Community School [Stadtteilschule Öjendorf] 2.96 13
110 Altrahlstedt Community School [Grund- und Stadtteilschule Altrahlstedt] 3.00 13

Filling SRA Metadata for Bulk RNA-Seq of Human Skin Organoids (SkOs) in the NCBI SRA depository

Screenshot from 2025-12-09 14-31-53

This guide explains how to complete the SRA metadata table for ten bulk RNA-seq samples derived from human skin organoids (SkOs). It includes one fully filled example row and shows the pattern to follow for the remaining samples.

Columns That Are the Same for All 10 Samples

According to the Methods section, all samples correspond to bulk RNA-seq using Lexogen RiboCop rRNA depletion and the Lexogen RNA-Seq V2 library kit. Sequencing was performed as single-end 1×83 bp reads on the Illumina NextSeq 500. Therefore, the following columns remain identical for all samples (untreated_rep1HSV_d8_rep2):

  • Library strategy: RNA-Seq
  • Library source: TRANSCRIPTOMIC
  • Library selection: RANDOM (rRNA-depleted total RNA; “RANDOM” is the standard choice)
  • Library layout: SINGLE
  • Platform: ILLUMINA
  • Instrument model: Illumina NextSeq 500
  • Filetype: fastq (use this if uploading *.fastq.gz; otherwise, select the matching file type)

Columns That Differ Per Sample

  1. Sample name This information comes from BioSample (e.g., untreated_rep1, HSV_d4_rep2, etc.).
  2. Library ID Each row must have a unique internal ID. You can use a simple sequential pattern:
untreated_rep1 → LIB01  
untreated_rep2 → LIB02  
HSV_d2_rep1   → LIB03  
…  
HSV_d8_rep2   → LIB10

Alternatively, a more descriptive format such as SkO_bulk_untreated_rep1 is acceptable. The key requirement is that all IDs are unique.

  1. Title Use the following pattern: Bulk RNA-seq of human skin organoid (SkO), CONDITION, replicate X

Examples:

untreated_rep1 → Bulk RNA-seq of human skin organoid (SkO), untreated, replicate 1  
untreated_rep2 → Bulk RNA-seq of human skin organoid (SkO), untreated, replicate 2  
HSV_d2_rep1   → Bulk RNA-seq of human skin organoid (SkO) infected with Human alphaherpesvirus 1 (HSV-1), 2 dpi, replicate 1  
HSV_d4_rep2   → Bulk RNA-seq of human skin organoid (SkO) infected with Human alphaherpesvirus 1 (HSV-1), 4 dpi, replicate 2  

Repeat this pattern for 6 dpi and 8 dpi samples.

  1. Design description Use this base text and modify it per condition:

    Bulk RNA-seq of human hair-bearing skin organoids (SkOs) derived from hiPSC line UKEi001-A. Total RNA from two organoids per condition was pooled, depleted of rRNA (Lexogen RiboCop rRNA Depletion Kit for Human/Mouse/Rat V2), and libraries were prepared with the Lexogen RNA-Seq V2 Library Prep Kit with UDIs (short insert variant). Libraries were sequenced as 1×83 bp single-end reads on an Illumina NextSeq 500.

Then append the condition information as follows:

  • For untreated rows: Condition: untreated control.
  • For HSV_d2 rows: Condition: skin organoids infected with Human alphaherpesvirus 1 (HSV-1) for 2 days (2 dpi).
  • For HSV_d4 rows: Condition: skin organoids infected with Human alphaherpesvirus 1 (HSV-1) for 4 days (4 dpi).
  • Continue the same pattern for 6 dpi and 8 dpi.
    1. Filename / filename2 Since this dataset is single-end, only the Filename column is required.
  • Filetype: fastq
  • Filename: must match the exact uploaded file name, e.g.:
untreated_rep1 → untreated_rep1.fastq.gz  
untreated_rep2 → untreated_rep2.fastq.gz  
HSV_d2_rep1   → HSV_d2_rep1.fastq.gz  
…
- **filename2:** leave blank (used only for paired-end libraries).

Example: Completed Row (untreated_rep1)

Column Example Entry
Sample name untreated_rep1
Library ID LIB01
Title Bulk RNA-seq of human skin organoid (SkO), untreated, replicate 1
Library strategy RNA-Seq
Library source TRANSCRIPTOMIC
Library selection RANDOM
Library layout SINGLE
Platform ILLUMINA
Instrument model Illumina NextSeq 500
Design description Bulk RNA-seq of human hair-bearing skin organoids (SkOs) derived from hiPSC line UKEi001-A. Total RNA from two organoids per condition was pooled, depleted of rRNA (Lexogen RiboCop rRNA Depletion Kit for Human/Mouse/Rat V2) and libraries were prepared with the Lexogen RNA-Seq V2 Library Prep Kit with UDIs (short insert variant). Libraries were sequenced as 1×83 bp single-end reads on an Illumina NextSeq 500. Condition: untreated control.
Filetype fastq
Filename untreated_rep1.fastq.gz
filename2 (blank)

After completing this first row, copy and adjust the fields (Library ID, Title, Design description condition, and Filename) for the other nine samples accordingly.