Processing Data_Patricia_Transposon_2025 v2 (Workflow for Structural Variant Calling in Nanopore Sequencing)

  1. Generate the HD46_Ctrol annotation

     mamba activate trycycler
     cd trycycler_HD46_Ctrl;
     trycycler cluster --threads 55 --assemblies assemblies/*.fasta --reads reads.fastq --out_dir trycycler;
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
     mv trycycler/cluster_001/1_contigs/J_ctg000010.fasta .
     mv trycycler/cluster_001/1_contigs/L_tig00000016.fasta .
     mv trycycler/cluster_001/1_contigs/R_tig00000001.fasta .
     mv trycycler/cluster_001/1_contigs/H_utg000001c.fasta .
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_002
     mv trycycler/cluster_002/1_contigs/*00000*.fasta .
     Error: unable to find a suitable common sequence
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_003
     mv trycycler/cluster_003/1_contigs/F_tig00000004.fasta .
     mv trycycler/cluster_003/1_contigs/L_tig00000003.fasta .
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_004
     mv trycycler/cluster_004/1_contigs/J_ctg000000.fasta .
     mv trycycler/cluster_004/1_contigs/P_ctg000000.fasta .
     mv trycycler/cluster_004/1_contigs/S_contig_2.fasta .
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_005
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_006
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_007
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_008
    
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_009
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_010
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_011
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_012
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_013
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_014
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_015
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_016
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_017
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_018
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_019
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_020
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_021
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_022
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_023
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_024
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_025
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_026
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_027
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_028
     trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_029
    
     trycycler msa --threads 55 --cluster_dir trycycler/cluster_001
     trycycler msa --threads 55 --cluster_dir trycycler/cluster_004
    
     trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_001
     trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_004
    
     trycycler consensus --threads 55 --cluster_dir trycycler/cluster_001
     trycycler consensus --threads 55 --cluster_dir trycycler/cluster_004
    
     #Polish --> TODO: Need to be Debugged!
     for c in trycycler/cluster_001 trycycler/cluster_004; do
         medaka_consensus -i "$c"/4_reads.fastq -d "$c"/7_final_consensus.fasta -o "$c"/medaka  -m r941_min_sup_g507 -t 12
         mv "$c"/medaka/consensus.fasta "$c"/8_medaka.fasta
         rm -r "$c"/medaka "$c"/*.fai "$c"/*.mmi  # clean up
     done
     # cat trycycler/cluster_*/8_medaka.fasta > trycycler/consensus.fasta
    
     cp trycycler/cluster_001/7_final_consensus.fasta HD46_Ctrl_chr.fasta
     cp trycycler/cluster_004/7_final_consensus.fasta HD46_Ctrl_plasmid.fasta
  2. install mambaforge https://conda-forge.org/miniforge/ (recommended)

     #download Mambaforge-24.9.2-0-Linux-x86_64.sh from website
     chmod +x Mambaforge-24.9.2-0-Linux-x86_64.sh
     ./Mambaforge-24.9.2-0-Linux-x86_64.sh
    
     To activate this environment, use:
         micromamba activate /home/jhuang/mambaforge
     Or to execute a single command in this environment, use:
         micromamba run -p /home/jhuang/mambaforge mycommand
     installation finished.
    
     Do you wish to update your shell profile to automatically initialize conda?
     This will activate conda on startup and change the command prompt when activated.
     If you'd prefer that conda's base environment not be activated on startup,
       run the following command when conda is activated:
    
     conda config --set auto_activate_base false
    
     You can undo this by running `conda init --reverse $SHELL`? [yes|no]
     [no] >>> yes
     no change     /home/jhuang/mambaforge/condabin/conda
     no change     /home/jhuang/mambaforge/bin/conda
     no change     /home/jhuang/mambaforge/bin/conda-env
     no change     /home/jhuang/mambaforge/bin/activate
     no change     /home/jhuang/mambaforge/bin/deactivate
     no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
     no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
     no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
     no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
     no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
     no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
     modified      /home/jhuang/.bashrc
     ==> For changes to take effect, close and re-open your current shell. <==
     no change     /home/jhuang/mambaforge/condabin/conda
     no change     /home/jhuang/mambaforge/bin/conda
     no change     /home/jhuang/mambaforge/bin/conda-env
     no change     /home/jhuang/mambaforge/bin/activate
     no change     /home/jhuang/mambaforge/bin/deactivate
     no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
     no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
     no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
     no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
     no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
     no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
     no change     /home/jhuang/.bashrc
     No action taken.
     WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
     WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
     Added mamba to /home/jhuang/.bashrc
     ==> For changes to take effect, close and re-open your current shell. <==
     Thank you for installing Mambaforge!
    
     Close your terminal window and open a new one, or run:
     #source ~/mambaforge/bin/activate
     conda --version
     mamba --version
    
     https://github.com/conda-forge/miniforge/releases
     Note
    
         * After installation, please make sure that you do not have the Anaconda default channels configured.
             conda config --show channels
             conda config --remove channels defaults
             conda config --add channels conda-forge
             conda config --show channels
             conda config --set channel_priority strict
             #conda clean --all
             conda config --remove channels biobakery
    
         * !!!!Do not install anything into the base environment as this might break your installation. See here for details.!!!!
    
     # --Deprecated method: mamba installing on conda--
     #conda install -n base --override-channels -c conda-forge mamba 'python_abi=*=*cp*'
     #    * Note that installing mamba into any other environment than base is not supported.
     #
     #conda activate base
     #conda install conda
     #conda uninstall mamba
     #conda install mamba

2: install required Tools on the mamba env

    * Sniffles2: Detect structural variants, including transposons, from long-read alignments.
    * RepeatModeler2: Identify and classify transposons de novo.
    * RepeatMasker: Annotate known transposable elements using transposon libraries.
    * SVIM: An alternative structural variant caller optimized for long-read sequencing, if needed.
    * SURVIVOR: Consolidate structural variants across samples for comparative analysis.

    mamba deactivate
    # Create a new conda environment
    mamba create -n transposon_long python=3.6 -y

    # Activate the environment
    mamba activate transposon_long

    mamba install -c bioconda sniffles
    mamba install -c bioconda repeatmodeler repeatmasker

    # configure repeatmasker database
    mamba info --envs
    cd /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker

    #mamba install python=3.6
    mamba install -c bioconda svim
    mamba install -c bioconda survivor
  1. Test the installed tools

     # Check versions
     sniffles --version
     RepeatModeler -h
     RepeatMasker -h
     svim --help
     SURVIVOR --help
     mamba install -c conda-forge perl r
  2. Data Preparation

     Raw Signal Data: Nanopore devices generate electrical signal data as DNA passes through the nanopore.
     Basecalling: Tools like Guppy or Dorado are used to convert raw signals into nucleotide sequences (FASTQ files).
  3. Preprocessing

     Quality Filtering: Remove low-quality reads using tools like Filtlong or NanoFilt.
     Adapter Trimming: Identify and remove sequencing adapters with tools like Porechop.
  4. (Optional) Variant Calling for SNP and Indel Detection:

     Tools like Medaka, Longshot, or Nanopolish analyze the aligned reads to identify SNPs and small indels.
  5. (OFFICIAL STARTING POINT) Alignment and Structural Variant Calling: Tools such as Sniffles or SVIM detect large insertions, deletions, and other structural variants. 使用长读长测序工具如 SVIM 或 Sniffles 检测结构变异(e.g. 散在性重复序列)。

       #NOTE that the ./batch1_depth25/trycycler_WT/reads.fastq and F24A430001437_BACctmoD/BGI_result/Separate/${sample}/1.Cleandata/${sample}.filtered_reads.fq.gz are the same!
    
       # -- PREPARING the input fastq-data, merge the fastqz and move the top-directory
    
       # Under raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass
       zcat ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_0.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_1.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_2.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_3.fastq.gz ... | gzip > HD46_1.fastq.gz
       mv ./raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass/HD46_1.fastq.gz ~/DATA/Data_Patricia_Transposon_2025
    
         #this are the corresponding sample names:
         #barcode 1: HD46-1
         #barcode 2: HD46-2
         #barcode 3: HD46-3
         #barcode 4: HD46-4
         mv barcode01.fastq.gz HD46_1.fastq.gz
         mv barcode02.fastq.gz HD46_2.fastq.gz
         mv barcode03.fastq.gz HD46_3.fastq.gz
         mv barcode04.fastq.gz HD46_4.fastq.gz
    
       # -- CALCULATE the coverages
         #!/bin/bash
    
         for bam in barcode*_minimap2.sorted.bam; do
             echo "Processing $bam ..."
             avg_cov=$(samtools depth -a "$bam" | awk '{sum+=$3; cnt++} END {if (cnt>0) print sum/cnt; else print 0}')
             echo -e "${bam}\t${avg_cov}" >> coverage_summary.txt
         done
    
       # ---- !!!! LOGIN the suitable environment !!!! ----
       # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
       mamba activate transposon_long
    
       # -- TODO: AFTERNOON_DEBUG_THIS: FAILED and not_USED: Alignment and Detect structural variants in each sample using SVIM which used aligner ngmlr or mimimap2
       #mamba install -c bioconda ngmlr
       mamba install -c bioconda svim
    
       #SEARCH FOR "HD46_Ctrl_chr_plasmid.fasta" for finding the insertion-calling-commands
       # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! for all 4 options #
       # ---- Option_1: minimap2 (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----
    
       for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
           #INS,INV,DUP:TANDEM,DUP:INT,BND
           svim reads --aligner minimap2 --nanopore minimap2+svim_${sample}    ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 20 --types INS --min_sv_size 100 --sequence_allele --insertion_sequences --read_names;
       done
    
       #svim alignment svim_alignment_minmap2_1_re 1.sorted.bam CP020463_.fasta --types INS --sequence_alleles --insertion_sequences --read_names
    
       # ---- Option_2: minamap2 (aligner) + Sniffles2 (structural variant caller) --> SUCCESSFUL ----
       #Minimap2: A commonly used aligner for nanopore sequencing data.
       #    Align Long Reads to the WT Reference using Minimap2
       #sniffles -m WT.sorted.bam -v WT.vcf -s 10 -l 50 -t 60
       #  -s 20: Requires at least 20 reads to support an SV for reporting. --> 10
       #  -l 50: Reports SVs that are at least 50 base pairs long.
       #  -t 60: Uses 60 threads for faster processing.
       for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
           #minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ./batch1_depth25/trycycler_${sample}/reads.fastq | samtools sort -o ${sample}.sorted.bam
           minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ${sample}.fastq.gz | samtools sort -o ${sample}_minimap2.sorted.bam
           samtools index ${sample}_minimap2.sorted.bam
           sniffles -m ${sample}_minimap2.sorted.bam -v ${sample}_minimap2+sniffles.vcf -s 10 -l 50 -t 60
           #QUAL < 20 ||
           bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_minimap2+sniffles.vcf > ${sample}_minimap2+sniffles_filtered.vcf
       done
    
         #Estimating parameter...
         #        Max dist between aln events: 44
         #        Max diff in window: 76
         #        Min score ratio: 2
         #        Avg DEL ratio: 0.0112045
         #        Avg INS ratio: 0.0364027
         #Start parsing... CP020463
         #                # Processed reads: 10000
         #                # Processed reads: 20000
         #        Finalizing  ..
         #Start genotype calling:
         #        Reopening Bam file for parsing coverage
         #        Finalizing  ..
         #Estimating parameter...
         #        Max dist between aln events: 28
         #        Max diff in window: 89
         #        Min score ratio: 2
         #        Avg DEL ratio: 0.013754
         #        Avg INS ratio: 0.17393
         #Start parsing... CP020463
         #                # Processed reads: 10000
         #                # Processed reads: 20000
         #                # Processed reads: 30000
         #                # Processed reads: 40000
    
         # Results:
         # * barcode01_minimap2+sniffles.vcf
         # * barcode01_minimap2+sniffles_filtered.vcf
         # * barcode02_minimap2+sniffles.vcf
         # * barcode02_minimap2+sniffles_filtered.vcf
         # * barcode03_minimap2+sniffles.vcf
         # * barcode03_minimap2+sniffles_filtered.vcf
         # * barcode04_minimap2+sniffles.vcf
         # * barcode04_minimap2+sniffles_filtered.vcf
    
       #ERROR: No MD string detected! Check bam file! Otherwise generate using e.g. samtools. --> No results!
       #for sample in barcode01 barcode02 barcode03 barcode04; do
       #    sniffles -m svim_reads_minimap2_${sample}/${sample}.fastq.minimap2.coordsorted.bam -v sniffles_minimap2_${sample}.vcf -s 10 -l 50 -t 60
       #    bcftools filter -e "INFO/SVTYPE != 'INS'" sniffles_minimap2_${sample}.vcf > sniffles_minimap2_${sample}_filtered.vcf
       #done
    
       # ---- Option_3: NGMLR (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----
       for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
           svim reads --aligner ngmlr --nanopore    ngmlr+svim_${sample}       ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 10;
       done
    
       # ---- Option_4: NGMLR (aligner) + sniffles (structural variant caller) --> SUCCESSFUL ----
       for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
           sniffles -m ngmlr+svim_${sample}/${sample}.fastq.ngmlr.coordsorted.bam -v ${sample}_ngmlr+sniffles.vcf -s 10 -l 50 -t 60
           bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_ngmlr+sniffles.vcf > ${sample}_ngmlr+sniffles_filtered.vcf
       done
    
       #END
  6. Compare and integrate all results produced by minimap2+sniffles and ngmlr+sniffles, and check them each position in IGV!

     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
     mv HD46_1_minimap2+sniffles_filtered.vcf    HD46-1_minimap2+sniffles_filtered.vcf
     mv HD46_1_ngmlr+sniffles_filtered.vcf       HD46-1_ngmlr+sniffles_filtered.vcf
     mv HD46_2_minimap2+sniffles_filtered.vcf    HD46-2_minimap2+sniffles_filtered.vcf
     mv HD46_2_ngmlr+sniffles_filtered.vcf       HD46-2_ngmlr+sniffles_filtered.vcf
     mv HD46_3_minimap2+sniffles_filtered.vcf    HD46-3_minimap2+sniffles_filtered.vcf
     mv HD46_3_ngmlr+sniffles_filtered.vcf       HD46-3_ngmlr+sniffles_filtered.vcf
     mv HD46_4_minimap2+sniffles_filtered.vcf    HD46-4_minimap2+sniffles_filtered.vcf
     mv HD46_4_ngmlr+sniffles_filtered.vcf       HD46-4_ngmlr+sniffles_filtered.vcf
     mv HD46_5_minimap2+sniffles_filtered.vcf    HD46-5_minimap2+sniffles_filtered.vcf
     mv HD46_5_ngmlr+sniffles_filtered.vcf       HD46-5_ngmlr+sniffles_filtered.vcf
     mv HD46_6_minimap2+sniffles_filtered.vcf    HD46-6_minimap2+sniffles_filtered.vcf
     mv HD46_6_ngmlr+sniffles_filtered.vcf       HD46-6_ngmlr+sniffles_filtered.vcf
     mv HD46_7_minimap2+sniffles_filtered.vcf    HD46-7_minimap2+sniffles_filtered.vcf
     mv HD46_7_ngmlr+sniffles_filtered.vcf       HD46-7_ngmlr+sniffles_filtered.vcf
     mv HD46_8_minimap2+sniffles_filtered.vcf    HD46-8_minimap2+sniffles_filtered.vcf
     mv HD46_8_ngmlr+sniffles_filtered.vcf       HD46-8_ngmlr+sniffles_filtered.vcf
     mv HD46_13_minimap2+sniffles_filtered.vcf   HD46-13_minimap2+sniffles_filtered.vcf
     mv HD46_13_ngmlr+sniffles_filtered.vcf      HD46-13_ngmlr+sniffles_filtered.vcf
  7. (NOT_USED) Filtering low-complexity insertions using RepeatMasker (TODO: how to use RepeatModeler to generate own lib?)

       python vcf_to_fasta.py variants.vcf variants.fasta
       #python filter_low_complexity.py variants.fasta filtered_variants.fasta retained_variants.fasta
       #Using RepeatMasker to filter the low-complexity fasta, the used h5 lib is
       /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5    #1.9G
       python /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/famdb.py -i /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5 names 'bacteria' | head
       Exact Matches
       =============
       2 bacteria (blast name), Bacteria 
    (scientific name), eubacteria (genbank common name), Monera (in-part), Procaryotae (in-part), Prokaryota (in-part), Prokaryotae (in-part), prokaryote (in-part), prokaryotes (in-part) Non-exact Matches ================= 1783272 Terrabacteria group (scientific name) 91061 Bacilli (scientific name), Bacilli Ludwig et al. 2010 (authority), Bacillus/Lactobacillus/Streptococcus group (synonym), Firmibacteria (synonym), Firmibacteria Murray 1988 (authority) 1239 Bacillaeota (synonym), Bacillaeota Oren et al. 2015 (authority), Bacillota (synonym), Bacillus/Clostridium group (synonym), clostridial firmicutes (synonym), Clostridium group firmicutes (synonym), Firmacutes (synonym), firmicutes (blast name), Firmicutes (scientific name), Firmicutes corrig. Gibbons and Murray 1978 (authority), Low G+C firmicutes (synonym), low G+C Gram-positive bacteria (common name), low GC Gram+ (common name) Summary of Classes within Firmicutes: * Bacilli (includes many common pathogenic and non-pathogenic Gram-positive bacteria, taxid=91061) * Bacillus (e.g., Bacillus subtilis, Bacillus anthracis) * Staphylococcus (e.g., Staphylococcus aureus, Staphylococcus epidermidis) * Streptococcus (e.g., Streptococcus pneumoniae, Streptococcus pyogenes) * Listeria (e.g., Listeria monocytogenes) * Clostridia (includes many anaerobic species like Clostridium and Clostridioides) * Erysipelotrichia (intestinal bacteria, some pathogenic) * Tissierellia (less-studied, veterinary relevance) * Mollicutes (cell wall-less, includes Mycoplasma species) * Negativicutes (includes some Gram-negative, anaerobic species) RepeatMasker -species Bacilli -pa 4 -xsmall variants.fasta python extract_unmasked_seq.py variants.fasta.masked unmasked_variants.fasta #bcftools filter -i ‘QUAL>30 && INFO/SVLEN>100’ variants.vcf -o filtered.vcf # #bcftools view -i ‘SVTYPE=”INS”‘ variants.vcf | bcftools query -f ‘%CHROM\t%POS\t%REF\t%ALT\t%INFO\n’ > insertions.txt #mamba install -c bioconda vcf2fasta #vcf2fasta variants.vcf -o insertions.fasta #grep “SEQS” variants.vcf | awk ‘{ print $1, $2, $4, $5, $8 }’ > insertions.txt #python3 filtering_low_complexity.py # #vcftools –vcf input.vcf –recode –out filtered_output –minSVLEN 100 #bcftools filter -e ‘INFO/SEQS ~ “^(G+|C+|T+|A+){4,}”‘ variants.vcf -o filtered.vcf # — calculate the percentage of reads To calculate the percentage of reads that contain the insertion from the VCF entry, use the INFO and FORMAT fields provided in the VCF record. Step 1: Extract Relevant Information In the provided VCF entry: RE (Reads Evidence): 733 – the total number of reads supporting the insertion. GT (Genotype): 1/1 – this indicates a homozygous insertion, meaning all reads covering this region are expected to have the insertion. AF (Allele Frequency): 1 – a 100% allele frequency, indicating that every read in this sample supports the insertion. DR (Depth Reference): 0 – the number of reads supporting the reference allele. DV (Depth Variant): 733 – the number of reads supporting the variant allele (insertion). Step 2: Calculate Percentage of Reads Supporting the Insertion Using the formula: Percentage of reads with insertion=(DVDR+DV)×100 Percentage of reads with insertion=(DR+DVDV​)×100 Substitute the values: Percentage=(7330+733)×100=100% Percentage=(0+733733​)×100=100% Conclusion Based on the VCF record, 100% of the reads support the insertion, indicating that the insertion is fully present in the sample (homozygous insertion). This is consistent with the AF=1 and GT=1/1 fields. * In your VCF file generated by Sniffles, the REF=N in the results has a specific meaning: * In a standard VCF, the REF field usually contains the reference base(s) at the variant position. * For structural variants (SVs), especially insertions, there is no reference sequence replaced; the insertion occurs between reference bases. * Therefore, Sniffles uses N as a placeholder in the REF field to indicate “no reference base replaced”. * The actual inserted sequence is then stored in the ALT field.
  8. Why some records have UNRESOLVED in the FILTER field in the Excel output.

     1. Understanding the format
    
         The data appears to be structural variant (SV) calls from Sniffles, probably in a VCF-like tabular format exported to Excel:
    
             * gi|1176884116|gb|CP020463.1| → reference sequence
             * Positions: 1855752 and 2422820
             * N → insertion event
             * SVLEN=999 → size of the insertion
             * AF → allele frequency
             * GT:DR:DV → genotype, depth reference, depth variant (1/1:0:678, example values for a PASS variant)
             * FILTER → whether the variant passed filters (UNRESOLVED means it didn’t pass)
    
     2. What UNRESOLVED usually means
    
         In Sniffles:
    
         * UNRESOLVED is assigned to SVs when the tool cannot confidently resolve the exact sequence or breakpoint.
         * Reasons include:
             - Low read support (RE, DV) relative to the expected coverage
             - Ambiguous alignment at repetitive regions
             - Conflicting strand or orientation signals
             - Allele frequency inconsistent with expectations
    
     3. Examine your two records
    
         First record
    
             POS: 1855752
             SVTYPE: INS
             SVLEN: 999
             RE: 68
             AF: 1
             GT: 1/1
             FILTER: UNRESOLVED
    
         Observations:
    
         * AF = 1 → allele frequency 100%, homozygous insertion
         * RE = 68 → 68 reads support the variant, decent coverage
         * Still UNRESOLVED → likely because Sniffles could not resolve the inserted sequence precisely; sometimes long insertions in repetitive regions are hard to reconstruct fully even with good read support.
    
         Second record
    
             POS: 2422820
             SVTYPE: INS
             SVLEN: 999
             RE: 22
             AF: 0.025522
             GT: 0/0
             FILTER: UNRESOLVED
    
         Observations:
    
         * AF = 0.0255 → very low allele frequency (~2.5%)
         * RE = 22, DR = 840 → very low variant reads vs reference
         * GT = 0/0 → homozygous reference
         * Sniffles marks it UNRESOLVED because the variant is essentially noise, not confidently detected.
    
     4. Key difference between the two
         Feature First record    Second record
         Allele frequency (AF)   1 (high)    0.0255 (very low)
         Variant reads (RE)  68  22
         Genotype (GT)   1/1 0/0
         Reason for UNRESOLVED   Unresolvable inserted sequence
    
     ✅ 5. Conclusion
    
         * Sniffles marks a variant as UNRESOLVED when the SV cannot be confidently characterized.
         * Even if there is good read support (first record), complex insertions can’t always be reconstructed fully.
         * Very low allele frequency (second record) also triggers UNRESOLVED because the signal is too weak compared to background noise.
         * Essentially: “UNRESOLVED” ≠ bad data, it’s just unresolved uncertainty.
  9. (NOT_SURE_HOW_TO_USE) Polishing of assembly: Use tools like Medaka to refine variant calls by leveraging consensus sequences derived from nanopore data.

       mamba install -c bioconda medaka
       medaka-consensus -i aligned_reads.bam -r reference.fasta -o polished_output -t 4
  10. Compare Insertions Across Samples

     Merge Variants Across Samples: Use SURVIVOR to merge and compare the detected insertions in all samples against the WT:
    
     SURVIVOR merge input_vcfs.txt 1000 1 1 1 0 30 merged.vcf
    
         Input: List of VCF files from Sniffles2.
         Output: A consolidated VCF file with shared and unique variants.
    
     Filter WT Insertions:
    
         Identify transposons present only in samples 1–9 by subtracting WT variants using bcftools:
    
             bcftools isec WT.vcf merged.vcf -p comparison_results
  11. Validate and Visualize

     Visualize with IGV: Use IGV to inspect insertion sites in the alignment and confirm quality.
    
     igv.sh
    
     Validate Findings:
         Perform PCR or additional sequencing for key transposon insertion sites to confirm results.
  12. Alternatives to TEPID for Long-Read Data

     If you’re looking for transposon-specific tools for long reads:
    
         REPET: A robust transposon annotation tool compatible with assembled genomes.
         EDTA (Extensive de novo TE Annotator):
             A pipeline to identify, classify, and annotate transposons.
             Works directly on your assembled genomes.
    
             perl EDTA.pl --genome WT.fasta --type all
  13. The WT.vcf file in the pipeline is generated by detecting structural variants (SVs) in the wild-type (WT) genome aligned against itself or using it as a baseline reference. Here’s how you can generate the WT.vcf:

     Steps to Generate WT.vcf
     1. Align WT Reads to the WT Reference Genome
    
     The goal here is to create an alignment of the WT sequencing data to the WT reference genome to detect any self-contained structural variations, such as native insertions, deletions, or duplications.
    
     Command using Minimap2:
    
     minimap2 -ax map-ont WT.fasta WT_reads.fastq | samtools sort -o WT.sorted.bam
    
     Index the BAM file:
    
     samtools index WT.sorted.bam
    
     2. Detect Structural Variants with Sniffles2
    
     Run Sniffles2 on the WT alignment to call structural variants:
    
     sniffles --input WT.sorted.bam --vcf WT.vcf
    
     This step identifies:
    
         Native transposons and insertions present in the WT genome.
         Other structural variants that are part of the reference genome or sequencing artifacts.
    
     Key parameters to consider:
    
         --min_support: Adjust based on your WT sequencing coverage.
         --max_distance: Define proximity for merging variants.
         --min_length: Set a minimum SV size (e.g., >50 bp for transposons).
  14. Clean and Filter the WT.vcf, Variant Filtering: Remove low-confidence variants based on read depth, quality scores, or allele frequency.

     To ensure the WT.vcf only includes relevant transposons or SVs:
    
         Use bcftools or similar tools to filter out low-confidence variants:
    
         bcftools filter -e "QUAL < 20 || INFO/SVTYPE != 'INS'" WT.vcf > WT_filtered.vcf
         bcftools filter -e "QUAL < 1 || INFO/SVTYPE != 'INS'" 1_.vcf > 1_filtered_.vcf
  15. NOTE that in this pipeline, the WT.fasta (reference genome) is typically a high-quality genome sequence from a database or a well-annotated version of your species’ genome. It is not assembled from the WT.fastq sequencing reads in this context. Here’s why:

     Why Use a Reference Genome (WT.fasta) from a Database?
    
         Higher Quality and Completeness:
             Database references (e.g., NCBI, Ensembl) are typically well-assembled, highly polished, and annotated. They serve as a reliable baseline for variant detection.
    
         Consistency:
             Using a standard reference ensures consistent comparisons across your WT and samples (1–9). Variants detected will be relative to this reference, not influenced by possible assembly errors.
    
         Saves Time:
             Assembling a reference genome from WT reads requires significant computational effort. Using an existing reference streamlines the analysis.
    
     Alternative: Assembling WT from FASTQ
    
     If you don’t have a high-quality reference genome (WT.fasta) and must rely on your WT FASTQ reads:
    
         Assemble the genome from your WT.fastq:
             Use long-read assemblers like Flye, Canu, or Shasta to create a draft genome.
    
         flye --nano-raw WT.fastq --out-dir WT_assembly --genome-size 
    Polish the assembly using tools like Racon (with the same reads) or Medaka for higher accuracy. Use the assembled and polished genome as your WT.fasta reference for further steps. Key Takeaways: If you have access to a reliable, high-quality reference genome, use it as the WT.fasta. Only assemble WT.fasta from raw reads (WT.fastq) if no database reference is available for your organism.
  16. Annotate Transposable Elements: Tools like ANNOVAR or SnpEff provide functional insights into the detected variants.

     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
     #Using snpEff to annotate the insertion!
     conda activate /home/jhuang/miniconda3/envs/spandx
# --> BUG:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT
    02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION
VERSION

# --> DEBUG: adapt the genbank-file header as follows:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT 02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION   HD46_Ctrl
VERSION     HD46_Ctrl.1
DBLINK      BioProject: PRJNA1337321
            BioSample: SAMN52215988
KEYWORDS    .
SOURCE      Staphylococcus epidermidis
  ORGANISM  Staphylococcus epidermidis
            Bacteria; Firmicutes; Bacilli; Bacillales; Staphylococcaceae;
            Staphylococcus.
COMMENT     Annotated genome for HD46_Ctrl.
...
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl
    cp HD46_Ctrl_chr.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl/genes.gbk

    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config  #HD46_Ctrl.genome : HD46_Ctrl
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build -genbank HD46_Ctrl      -d

    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-8_ngmlr+sniffles_filtered.vcf
    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-13_ngmlr+sniffles_filtered.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-8_ngmlr+sniffles_filtered.vcf > HD46-8_ngmlr+sniffles_filtered.annotated.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-13_ngmlr+sniffles_filtered.vcf > HD46-13_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-8
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-8_snpeff_stats.html \
    HD46_Ctrl \
    HD46-8_ngmlr+sniffles_filtered.vcf \
    > HD46-8_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-13
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-13_snpeff_stats.html \
    HD46_Ctrl \
    HD46-13_ngmlr+sniffles_filtered.vcf \
    > HD46-13_ngmlr+sniffles_filtered.annotated.vcf
  1. Summarize the results as a Excel-file

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    conda activate plot-numpy1
    #python generate_common_vcf.py
    #mv common_variants.xlsx putative_transposons.xlsx
    
    # * Reads each of your VCFs.
    # * Filters variants → only keep those with FILTER == PASS.
    # * Compares the two aligner methods (minimap2+sniffles2 vs ngmlr+sniffles2) per sample.
    # * Keeps only variants that appear in both methods for the same sample.
    # * Outputs: An Excel file with the common variants and a log text file listing which variants were filtered out, and why (not_PASS or not_COMMON_in_two_VCF).
    
    #python generate_fuzzy_common_vcf_v1.py
    #Sample PASS_minimap2   PASS_ngmlr  COMMON
    #  HD46-Ctrl_Ctrl   39  29  28
    #  HD46-1   39  32  29
    #  HD46-2   40  32  28
    #  HD46-3   38  30  27
    #  HD46-4   46  35  32
    #  HD46-5   40  35  31
    #  HD46-6   43  35  30
    #  HD46-7   40  33  28
    #  HD46-8   37  20  11
    #  HD46-13  39  38  27
    
    #Sample PASS_minimap2   PASS_ngmlr  COMMON_FINAL
    #HD46-Ctrl_Ctrl 39  29  6
    #HD46-1 39  32  8
    #HD46-2 40  32  8
    #HD46-3 38  30  6
    #HD46-4 46  35  8
    #HD46-5 40  35  9
    #HD46-6 43  35  10
    #HD46-7 40  33  8
    #HD46-8 37  20  4
    #HD46-13    39  38  5
    
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    #!!!! Summarize the results of ngmlr+sniffles !!!!
    python merge_ngmlr+sniffles_filtered_results_and_summarize.py
    
    #!!!! Post-Processing !!!!
    #DELETE "2186168    N   

    . PASS” in Sheet HD46-13 and Summary #DELETE “2427785 N CGTCAGAATCGCTGTCTGCGTCCGAGTCACTGTCTGAGTCTGAATCACTATCTGCGTCTGAGTCACTGTCTG . PASS” due to “0/1:169:117” in HD46-13 and Summary #DELETE “2441640 N GCTCATTAAGAATCATTAAATTAC . PASS” due to 0/1:170:152 in HD46-13 and Summary

  2. Source code of merge_ngmlr+sniffles_filtered_results_and_summarize.py

    python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx
    
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Add SnpEff ANN columns (for SVTYPE=INS) from annotated VCFs into an Excel workbook,
    with detailed debug about why CHROM+POS may not match.
    
    Key improvements:
    - Stronger CHROM/POS normalization (strip 'chr', unify MT naming, coerce numbers).
    - Explicit detection and logging of sheet key columns used.
    - Debug block prints:
    * Unique key counts in sheet vs VCF (before/after normalization)
    * Example non-matching keys from the sheet and from the VCF (top N)
    * Chromosome naming diagnostics (e.g., 'chr' presence, 'MT'/'M' harmonization)
    * Off-by-N diagnostics via --pos_tolerance (counts for would-match @ ±N)
    * Optional preview of the sheet's SV type column, if present
    - Safer ANN parsing and aggregation.
    - Command-line options: --debug_examples, --pos_tolerance
    
    ANN filling is done only for **exact** (CHROM, POS) equality (as before).
    Tolerance is used only for *diagnostics*, not for filling, to avoid incorrect merges.
    """
    
    import argparse
    import gzip
    import io
    import re
    from pathlib import Path
    from typing import List, Tuple, Dict, Iterable, Set
    
    import pandas as pd
    
    FALLBACK_ANN_FIELDS: List[str] = [
        'Allele','Annotation','Annotation_Impact','Gene_Name','Gene_ID',
        'Feature_Type','Feature_ID','Transcript_BioType','Rank','HGVS.c',
        'HGVS.p','cDNA.pos/cDNA.length','CDS.pos/CDS.length','AA.pos/AA.length',
        'Distance','Errors_Warnings_Info'
    ]
    
    def open_text_maybe_gzip(path: Path):
        if str(path).endswith('.gz'):
            return io.TextIOWrapper(gzip.open(path, 'rb'), encoding='utf-8', errors='ignore')
        return open(path, 'r', encoding='utf-8', errors='ignore')
    
    def normalize_chrom(col: pd.Series) -> pd.Series:
        s = col.astype(str).str.strip()
        s = s.str.replace(r'^(chr|CHR)', '', regex=True)
        # Standardize mitochondrial names to "MT"
        s = s.str.replace(r'^(M|MtDNA|MTDNA|Mito|Mitochondrion)$', 'MT', regex=True, case=False)
        return s.str.upper()
    
    def normalize_pos(col: pd.Series) -> pd.Series:
        # Excel can make ints look like floats; coerce then Int64
        # (We keep Int64 nullable for robustness; we never compare NaNs.)
        s = pd.to_numeric(col, errors='coerce')
        # If people had 0-based starts in the sheet (rare for INS), this won't fix it,
        # but the tolerance debug will reveal a +1 shift if present.
        return s.astype('Int64')
    
    def parse_vcf_ann(vcf_path: Path) -> Tuple[pd.DataFrame, List[str]]:
        ann_fields = None
        header_cols = None
        records = []
    
        with open_text_maybe_gzip(vcf_path) as f:
            for line in f:
                if line.startswith('##INFO=<ID=ANN'):
                    m = re.search(r'Format:\s*([^">]+)', line)
                    if m:
                        ann_fields = [s.strip() for s in m.group(1).split('|')]
                if line.startswith('#CHROM'):
                    header_cols = line.strip().lstrip('#').split('\t')
                    break
    
            if not header_cols:
                raise RuntimeError(f"Could not find VCF header line (#CHROM ...) in {vcf_path}")
    
            if not ann_fields:
                ann_fields = FALLBACK_ANN_FIELDS
    
            ann_cols = [f'ANN_{x}' for x in ann_fields]
    
            for line in f:
                if not line or line[0] == '#':
                    continue
                parts = line.rstrip('\n').split('\t')
                if len(parts) < len(header_cols):
                    continue
                row = dict(zip(header_cols, parts))
                info = row.get('INFO', '')
    
                # Only INS
                if not re.search(r'(?:^|;)SVTYPE=INS(?:;|$)', info):
                    continue
    
                chrom = row.get('#CHROM') or row.get('CHROM')
                pos_str = row.get('POS')
                try:
                    pos = int(pos_str)
                except Exception:
                    continue
    
                # Extract ANN entries
                ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
                ann_entries = ann_match.group(1).split(',') if ann_match else []
    
                field_values: Dict[str, List[str]] = {k: [] for k in ann_fields}
                for ann in ann_entries:
                    items = ann.split('|')
                    if len(items) < len(ann_fields):
                        items += [''] * (len(ann_fields) - len(items))
                    elif len(items) > len(ann_fields):
                        items = items[:len(ann_fields)]
                    for k, v in zip(ann_fields, items):
                        field_values[k].append(v)
    
                joined = {f'ANN_{k}': (';'.join(v) if v else '') for k, v in field_values.items()}
                records.append({'CHROM': chrom, 'POS': pos, **joined})
    
        df = pd.DataFrame.from_records(records)
        if not df.empty:
            df['POS'] = pd.to_numeric(df['POS'], errors='coerce').astype('Int64')
            df['CHROM'] = normalize_chrom(df['CHROM'])
        return df, ann_cols
    
    def detect_key_columns(df: pd.DataFrame) -> Dict[str, str]:
        chrom_candidates = ['CHROM', '#CHROM', 'Chrom', 'Chromosome', 'chrom', 'chr', 'Chr']
        pos_candidates   = ['POS', 'Position', 'position', 'pos', 'Start', 'start']
        mapping = {}
        for c in chrom_candidates:
            if c in df.columns:
                mapping['CHROM'] = c
                break
        for p in pos_candidates:
            if p in df.columns:
                mapping['POS'] = p
                break
        return mapping
    
    def normalize_chrom_pos_df(df: pd.DataFrame, keys: Dict[str, str]) -> pd.DataFrame:
        out = df.copy()
        out[keys['CHROM']] = normalize_chrom(out[keys['CHROM']])
        out[keys['POS']]   = normalize_pos(out[keys['POS']])
        return out
    
    def summarize_chr_formats(series: pd.Series, label: str):
        raw = series.astype(str)
        has_chr_prefix = raw.str.startswith(('chr','CHR')).sum()
        mt_like = raw.str.fullmatch(r'(M|MtDNA|MTDNA|Mito|Mitochondrion)', case=False).sum()
        print(f"[{label}] CHROM diagnostics:")
        print(f"  total rows: {len(raw)}")
        print(f"  with 'chr'/'CHR' prefix: {has_chr_prefix}")
        print(f"  mitochondrial names like M/MtDNA/etc: {mt_like}")
    
    def keys_set(df: pd.DataFrame, chrom_col: str, pos_col: str) -> Set[Tuple[str, int]]:
        # Drop NA POS, NA CHROM
        sub = df[[chrom_col, pos_col]].dropna()
        # Ensure ints (drop NA after coercion)
        sub = sub[(sub[pos_col].astype('Int64').notna())]
        return set(zip(sub[chrom_col].astype(str), sub[pos_col].astype('int64')))
    
    def tolerance_match_count(sheet_keys: Iterable[Tuple[str,int]],
                            vcf_keys: Set[Tuple[str,int]],
                            tol: int) -> int:
        if tol <= 0:
            return sum(1 for k in sheet_keys if k in vcf_keys)
        cnt = 0
        for chrom, pos in sheet_keys:
            if (chrom, pos) in vcf_keys:
                cnt += 1
            else:
                matched = False
                # check +/- 1..tol
                for d in range(1, tol+1):
                    if (chrom, pos - d) in vcf_keys or (chrom, pos + d) in vcf_keys:
                        matched = True
                        break
                if matched:
                    cnt += 1
        return cnt
    
    def debug_match_report(df_sheet: pd.DataFrame,
                        vcf_df: pd.DataFrame,
                        keys: Dict[str, str],
                        debug_examples: int = 15,
                        pos_tolerance: int = 1):
        print("\n=== DEBUG: Matching overview ===")
        # Raw diagnostics
        summarize_chr_formats(df_sheet[keys['CHROM']], label="SHEET (raw)")
        summarize_chr_formats(vcf_df['CHROM'], label="VCF (normalized)")
    
        # Normalize sheet
        df_norm = normalize_chrom_pos_df(df_sheet, keys)
        print(f"Detected key columns -> CHROM: '{keys['CHROM']}'  POS: '{keys['POS']}'")
        # Basic stats
        n_sheet_all = len(df_sheet)
        n_sheet_key_nonnull = df_norm[keys['CHROM']].notna().sum() - df_norm[keys['CHROM']].isna().sum()
        n_sheet_pos_nonnull = df_norm[keys['POS']].notna().sum()
        print(f"SHEET rows total: {n_sheet_all}")
        print(f"SHEET rows with non-null CHROM: {n_sheet_key_nonnull}, non-null POS: {n_sheet_pos_nonnull}")
    
        # Unique key counts
        sheet_norm_keys_df = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
        sheet_norm_keys_df = sheet_norm_keys_df.dropna(subset=['CHROM','POS'])
        sheet_norm_keys_df['POS'] = sheet_norm_keys_df['POS'].astype('Int64')
        sheet_keys_unique = keys_set(sheet_norm_keys_df, 'CHROM', 'POS')
        vcf_keys_unique   = keys_set(vcf_df, 'CHROM', 'POS')
    
        print(f"Unique (CHROM,POS) keys -> SHEET: {len(sheet_keys_unique)}  VCF(INS): {len(vcf_keys_unique)}")
    
        # Exact match count
        exact_matches = len(sheet_keys_unique & vcf_keys_unique)
        print(f"Exact key matches (SHEET∩VCF): {exact_matches}")
    
        # Tolerance diagnostics (diagnose off-by-one etc.)
        if pos_tolerance > 0:
            approx_matches = tolerance_match_count(sheet_keys_unique, vcf_keys_unique, pos_tolerance)
            print(f"Keys that would match within ±{pos_tolerance}: {approx_matches}")
    
        # Show some examples of non-matching keys from SHEET
        if debug_examples > 0:
            not_in_vcf = sorted(k for k in sheet_keys_unique if k not in vcf_keys_unique)
            not_in_sheet = sorted(k for k in vcf_keys_unique if k not in sheet_keys_unique)
            print(f"\nExamples of SHEET keys not found in VCF (showing up to {debug_examples}):")
            for k in not_in_vcf[:debug_examples]:
                print("  SHEET-only:", k)
            print(f"\nExamples of VCF keys not found in SHEET (showing up to {debug_examples}):")
            for k in not_in_sheet[:debug_examples]:
                print("  VCF-only:", k)
    
        # Try to detect a type column and report counts
        type_cols = [c for c in df_sheet.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
        if type_cols:
            tcol = type_cols[0]
            is_ins = df_sheet[tcol].astype(str).str.upper() == 'INS'
            print(f"\nType column detected: '{tcol}'. SHEET rows with INS: {int(is_ins.sum())} / {len(df_sheet)}")
            # Of the INS rows, how many have keys that match?
            ins_keys = keys_set(df_norm[is_ins], keys['CHROM'], keys['POS'])
            exact_ins_matches = len(ins_keys & vcf_keys_unique)
            print(f"  INS-only exact key matches: {exact_ins_matches} / {len(ins_keys)}")
            if pos_tolerance > 0:
                approx_ins_matches = tolerance_match_count(ins_keys, vcf_keys_unique, pos_tolerance)
                print(f"  INS-only matches within ±{pos_tolerance}: {approx_ins_matches} / {len(ins_keys)}")
        else:
            print("\nNo explicit type column found in SHEET.")
    
    def merge_ann_into_sheet(df_sheet: pd.DataFrame, vcf_df: pd.DataFrame, ann_cols: List[str],
                            pos_tolerance: int = 1, debug_examples: int = 15) -> pd.DataFrame:
        df = df_sheet.copy()
    
        keys = detect_key_columns(df)
        if 'CHROM' not in keys or 'POS' not in keys:
            print("WARNING: Could not detect CHROM/POS columns in sheet; ANN columns will be empty.")
            for c in ann_cols:
                if c not in df.columns:
                    df[c] = ''
            return df
    
        # DEBUG: run a comprehensive match report
        debug_match_report(df, vcf_df, keys, debug_examples=debug_examples, pos_tolerance=pos_tolerance)
    
        # Normalize sheet keys for merge
        df_norm = normalize_chrom_pos_df(df, keys)
    
        # Prepare VCF map (unique by CHROM,POS), aggregate ANN fields
        vcf_use = vcf_df.copy()
        if vcf_use.empty:
            print("NOTE: No INS records found in VCF; ANN columns will be created but empty.")
        else:
            agg = {c: lambda s: ';'.join([x for x in s.astype(str).tolist() if x]) for c in ann_cols}
            vcf_use = vcf_use.groupby(['CHROM', 'POS'], as_index=False).agg(agg)
    
        # Identify potential type column in sheet
        type_cols = [c for c in df.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
        has_type = bool(type_cols)
        if has_type:
            tcol = type_cols[0]
            is_ins = df[tcol].astype(str).str.upper() == 'INS'
            print(f"\nMERGE: using type column '{tcol}' -> rows marked INS: {int(is_ins.sum())} / {len(df)}")
        else:
            is_ins = pd.Series([False]*len(df), index=df.index)
            print("\nMERGE: no type column -> will fill ANN wherever exact (CHROM,POS) matches VCF INS.")
    
        # Left merge on exact keys only (do not use tolerance for filling, just for diagnostics)
        left = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
        merged = left.merge(vcf_use[['CHROM','POS'] + ann_cols], on=['CHROM','POS'], how='left', suffixes=('',''))
    
        # Initialize ANN columns on original df
        for c in ann_cols:
            if c not in df.columns:
                df[c] = ''
    
        # Fill values:
        for c in ann_cols:
            values = merged[c]
            if has_type:
                df.loc[is_ins, c] = values[is_ins].fillna('').astype(str).values
            else:
                df[c] = values.fillna('').astype(str).values
    
        # Report matching stats on the actual merge
        matched = merged[ann_cols].notna().any(axis=1).sum()
        print(f"\nMERGE RESULT: rows with any ANN filled (exact VCF match): {int(matched)} / {len(df)}")
    
        # Additional hint if tolerance suggests many near-misses
        if pos_tolerance > 0:
            sheet_keys = keys_set(left, 'CHROM', 'POS')
            vcf_keys_unique = keys_set(vcf_use, 'CHROM', 'POS')
            approx = tolerance_match_count(sheet_keys, vcf_keys_unique, pos_tolerance)
            if approx > matched:
                print(f"NOTE: There appear to be {approx - matched} additional rows that would match within ±{pos_tolerance}.")
                print("      This often indicates a 0-based vs 1-based position shift or use of END instead of POS in the sheet.")
    
        return df
    
    def main():
        ap = argparse.ArgumentParser()
        ap.add_argument('--excel', default='merged_ngmlr+sniffles_variants.xlsx', help='Input Excel workbook')
        ap.add_argument('--sheet8', default='HD46-8', help='Sheet name for HD46-8 sample')
        ap.add_argument('--sheet13', default='HD46-13', help='Sheet name for HD46-13 sample')
        ap.add_argument('--vcf8', default='HD46-8_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-8')
        ap.add_argument('--vcf13', default='HD46-13_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-13')
        ap.add_argument('--out', default='merged_ngmlr+sniffles_variants_with_ANN.xlsx', help='Output Excel path')
        ap.add_argument('--debug_examples', type=int, default=15, help='How many non-match examples to print from each side')
        ap.add_argument('--pos_tolerance', type=int, default=1, help='Diagnostic tolerance (±N bp) for off-by-N checks (used for debug only)')
        args = ap.parse_args()
    
        excel_path = Path(args.excel)
        vcf8_path = Path(args.vcf8)
        vcf13_path = Path(args.vcf13)
        out_path = Path(args.out)
    
        # Load sheets (resolve case-insensitive names)
        xls = pd.ExcelFile(excel_path)
        def resolve_sheet(name: str) -> str:
            if name in xls.sheet_names:
                return name
            lower_map = {s.lower(): s for s in xls.sheet_names}
            return lower_map.get(name.lower(), name)
    
        sheet8 = resolve_sheet(args.sheet8)
        sheet13 = resolve_sheet(args.sheet13)
    
        df8 = pd.read_excel(excel_path, sheet_name=sheet8)
        df13 = pd.read_excel(excel_path, sheet_name=sheet13)
    
        # Parse VCFs (INS only)
        vcf8_df, ann_cols = parse_vcf_ann(vcf8_path)
        vcf13_df, _ = parse_vcf_ann(vcf13_path)
    
        print(f"VCF8 INS variants: {len(vcf8_df)}; VCF13 INS variants: {len(vcf13_df)}")
        print(f"ANN subfields ({len(ann_cols)}): {', '.join(ann_cols)}")
    
        # Merge with diagnostics
        df8_out = merge_ann_into_sheet(df8, vcf8_df, ann_cols,
                                    pos_tolerance=args.pos_tolerance,
                                    debug_examples=args.debug_examples)
        df13_out = merge_ann_into_sheet(df13, vcf13_df, ann_cols,
                                        pos_tolerance=args.pos_tolerance,
                                        debug_examples=args.debug_examples)
    
        # Save
        with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
            df8_out.to_excel(writer, sheet_name=sheet8, index=False)
            df13_out.to_excel(writer, sheet_name=sheet13, index=False)
    
        print(f"\nDone. Wrote: {out_path.resolve()}")
    
    if __name__ == '__main__':
        main()
  3. Manually merge all contents of ANN=? to a seperate column ‘ANN’ in the isolate-specific sheets in the Excel-file.

    #Add CHROM and HD46_Ctrl.1 to first column of the input Excel-file
    (plot-numpy1) jhuang@WS-2290C:~/DATA/Data_Patricia_Transposon_2025$ python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx
    #DEL some columns (INFO, NN_Allele, ANN_Rank, ANN_Errors_Warnings_Info, from the table, and COPY the summary-sheet to the final table.
  4. Run nextflow bacass

    # -- samplesheet_bacass.tsv --
    #ID R1  R2  LongFastQ   Fast5   GenomeSize
    #HD46_Ctrl          HD46_Ctrl.fastq.gz  NA  NA
    #HD46_1         HD46_1.fastq.gz NA  NA
    #An6    /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_1.clean.rd.fq.gz   /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_2.clean.rd.fq.gz   /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/merged_An6_longreads.fastq.gz NA  2.7m
    #BG5    /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_1.clean.rd.fq.gz   /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_2.clean.rd.fq.gz   /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/merged_BG5_longreads.fastq.gz NA  6.5m
    
    conda deactivate
    # DEBUG: --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ not working, maybe due to the version, since 2.5.0 was working (see below)!
    #nextflow run nf-core/bacass -r 2.5.0 -profile docker \
    #--input samplesheet.tsv \
    #--outdir bacass_out \
    #--assembly_type long \
    #--kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \
    #--kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ \
    #-resume
    
    #For hybrid-assembly: --assembly_type hybrid --assembler unicycler,dragonflye [unicycler,autocycler,canu,dragonflye,flye,miniasm,raven,megahit] \
    nextflow run nf-core/bacass -r 2.6.0 -profile docker --help
    nextflow run nf-core/bacass -r 2.6.0 -profile docker \
      --input samplesheet_bacass.tsv \
      --outdir bacass_out \
      --assembly_type long \
      --assembler unicycler,dragonflye \
      --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \
      --skip_kmerfinder \
      -resume \
      -c unicycler.config \
      -work-dir bacass_out/work
    
    #SAVE bacass_out/Kmerfinder/kmerfinder_summary.csv to bacass_out/Kmerfinder/An6?/An6?_kmerfinder_results.xlsx
    
    #busco example results:
    Input_file      Dataset Complete        Single  Duplicated      Fragmented      Missing n_markers       Scaffold N50    Contigs N50     Percent gaps    Number of scaffolds
    wt_cef.scaffolds.fa     bacteria_odb10  98.4    98.4    0.0     1.6     0.0     124     285852  285852  0.000%  45
    wt_cipro.scaffolds.fa   bacteria_odb10  90.3    89.5    0.8     8.1     1.6     124     7434    7434    0.000%  1699
  5. Detecting the next closest genome

    mamba activate gtdbtk
    
    # 验证环境变量是否加载成功
    echo $GTDBTK_DATA_PATH
    # 应输出:/mnt/nvme4n1p1/gtdb_data/release232
    
    # 3. 运行分类(你提供的命令 + 实用参数)
    gtdbtk classify_wf \
      --genome_dir ./bacass_out/Medaka \
      --out_dir gtdb_out \
      --cpus 64 \
      --extension .fa \
      --prefix mygenome
    
    # 4. 查看结果
    cat gtdb_out/classify/mygenome.bac120.summary.tsv   # 细菌结果
  6. Structural variant calling

    conda activate sv_assembly
    
    # MLST calling
    for sample in HD46_Ctrl HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13 _WT _1 _2 _3 _4 _5 _7 _8 _9 _10; do
        mlst bacass_out/Medaka/${sample}-unicycler-medaka_polished_genome.fa >> mlst_res
    done
    
    # After running MLST and genome taxonomy checks, I found that CP020463 is only a suitable reference for the first dataset (_WT, _1–_10), because all of these samples share ST 86 — the same sequence type as CP020463. For the second dataset (HD46 series), CP020463 is not an appropriate reference. MLST and GTDB-Tk classification results (see mygenome.bac120.summary2.xlsx and mlst_res.xlsx) show that these genomes are genetically distinct.
    for sample in _WT _1 _2 _3 _4 _5 _7 _8 _9 _10; do
            nucmer --maxmatch -l 100 -c 500 CP020463.fasta bacass_out/Medaka/${sample}-dragonflye-medaka_polished_genome.fa -p ${sample};
            delta-filter -1 -q ${sample}.delta > ${sample}.filtered.delta;
            #Usage: Assemblytics delta output_prefix    unique_length_required    min_size    max_size
            Assemblytics ${sample}.filtered.delta ${sample}_assemblytics 1000 100 500000;
    done
    samtools faidx bacass_out/Medaka/HD46_Ctrl-dragonflye-medaka_polished_genome.fa contig00001 > bacass_out/Medaka/HD46_Ctrl_chrom.fa
    for sample in HD46_1 HD46_2 HD46_5 HD46_6 HD46_7; do
            nucmer --maxmatch -l 100 -c 500 bacass_out/Medaka/HD46_Ctrl_chrom.fa bacass_out/Medaka/${sample}-dragonflye-medaka_polished_genome.fa -p ${sample};
            delta-filter -1 -q ${sample}.delta > ${sample}.filtered.delta;
            #Usage: Assemblytics delta output_prefix    unique_length_required    min_size    max_size; Note that we use a large threshold 500,000 nt.
            Assemblytics ${sample}.filtered.delta ${sample}_assemblytics 1000 100 500000;
    done
    
    ./merge_variants.sh
            #!/bin/bash
            # Define the output file name
            OUTPUT="merged_assemblytics_variants.txt"
            # 1. Write the header with the new 'Sample' column
            # We read the header from the first file, strip the '#', and append 'Sample'
            head -n 1 *_assemblytics.variant_preview.txt | grep '^#' | sed 's/^#//' | awk '{print $0 "\tSample"}' > "$OUTPUT"
            # 2. Loop through all matching files
            for file in *_assemblytics.variant_preview.txt; do
                # Extract sample name (e.g., "HD46_Ctrl" from "HD46_Ctrl_assemblytics.variant_preview.txt")
                sample=$(echo "$file" | sed 's/_assemblytics\.variant_preview\.txt//')
    
                # Append data lines, skipping the header line (lines starting with '#')
                # and append the sample name as the last column
                grep -v '^#' "$file" | awk -v samp="$sample" '{print $0 "\t" samp}' >> "$OUTPUT"
            done
            echo "✅ Successfully merged $(ls *_assemblytics.variant_preview.txt | wc -l) files into $OUTPUT"
    
    #Manually sorted the generated file merged_assemblytics_variants.txt into two parts: one is HD46-series and one is _*series.
    
    unicycler -l HD46_Ctrl.fastq.gz --mode normal -t 40 -o HD46_Ctrl_unicycler_normal
    unicycler -l HD46_3.fastq.gz --mode normal -t 40 -o HD46_3_unicycler_normal
    unicycler -l HD46_4.fastq.gz --mode normal -t 40 -o HD46_4_unicycler_normal
    unicycler -l HD46_13.fastq.gz --mode normal -t 40 -o HD46_13_unicycler_normal

From bioBakery Processing to Paired Differential Analysis (Data_Tam_Metagenomics_2026_Wastewater)

Direct Answer

Yes and No.

  • Yes, you can use biobakery_workflows wmgx to process all your FASTQ files into taxonomic and functional profiles, and wmgx_vis to generate basic visualizations (like alpha/beta diversity).
  • No (Not Recommended), you should not use wmgx_vis for the actual differential analysis. Because your study is a paired/longitudinal design (Pre- vs. Post-treatment from the same subjects across multiple time points), you must account for the paired nature of the data (using random effects). The command-line wrapper for wmgx_vis does not support specifying random effects, which would lead to statistically flawed results.

Instead, the standard practice in the bioBakery ecosystem is to use wmgx for processing, and then run MaAsLin 2 directly for the differential analysis.

Here is the complete workflow on how to do this:


Step 1: Process the data with wmgx

Since your paired-end files follow the standard _1.fastq.gz and _2.fastq.gz naming convention, wmgx will automatically detect and pair them.

Assuming all your symlinks are in a folder called /data/fastq_files/:

biobakery_workflows wmgx \
  --input /data/fastq_files/ \
  --output /data/wmgx_results/ \
  --threads 20

(Note: If you don’t need strain profiling, you can speed this up by adding --bypass-strain-profiling)


Step 2: Basic Visualization with wmgx_vis (Optional)

You can use wmgx_vis to generate standard diversity plots and heatmaps. You will need a metadata file (see Step 3 for the format).

biobakery_workflows wmgx_vis \
  --input /data/wmgx_results/ \
  --project-name Metagenome_Study \
  --input-metadata /data/metadata.tsv \
  --metadata-categorical Group Timepoint \
  --output /data/visualization_results/

Step 3: Proper Differential Analysis (Using MaAsLin 2 directly)

To correctly compare Pre vs. Post while accounting for the fact that the samples are paired (and potentially accounting for the different time points), you should use MaAsLin 2 directly on the output tables generated by wmgx.

1. Prepare your Metadata File (metadata.tsv)

You need a tab-separated file where the SampleID exactly matches the prefix of your FASTQ files (e.g., 2025_Nov_Pre_A1).

SampleID Group Timepoint Subject_ID
2025_Nov_Pre_A1 Pre Nov Subject_1
2025_Nov_Post_B1 Post Nov Subject_1
2026_Jan_Pre_A Pre Jan Subject_2
2026_Jan_Post_B Post Jan Subject_2

(Note: You must correctly map which “Pre” sample belongs to which “Post” sample in the Subject_ID column. This is how the model knows they are paired).

2. Run MaAsLin 2 in R

You can run MaAsLin 2 (which is installed with bioBakery) using R. This allows you to define Group (Pre/Post) as a fixed effect and Subject_ID as a random effect, which is mathematically required for paired data.

# Install Maaslin2 if you haven't already
# BiocManager::install("Maaslin2")
library(Maaslin2)

# Run the differential abundance analysis
fit_data <- Maaslin2(
    input_data     = "/data/wmgx_results/wmgx/humann/genefamilies.tsv", # or pathabundance.tsv
    input_metadata = "/data/metadata.tsv",
    output         = "/data/maaslin2_results/",

    # Define your variables
    fixed_effects  = c("Group", "Timepoint"), # Comparing Pre vs Post, while controlling for Timepoint
    random_effects = c("Subject_ID"),         # CRUCIAL: Accounts for the paired Pre/Post design!

    # Data processing parameters
    normalization  = "NONE",                  # HUMAnN output is already normalized to relative abundance
    transform      = "LOG",
    min_abundance  = 0.01,
    min_prevalence = 0.10,
    correction     = "BH"                     # Benjamini-Hochberg for multiple testing (FDR)
)

Summary

  1. Use wmgx to process your raw FASTQs.
  2. Use wmgx_vis if you just want quick, automated PCA/PCoA plots and heatmaps.
  3. Do not use wmgx_vis for the differential statistics. Extract the genefamilies.tsv or metaphlan merged tables and run MaAsLin 2 directly so you can include Subject_ID as a random effect to properly handle your Pre/Post paired design.

世界杯竞猜游戏(Tippspiel)

由于2026年美加墨世界杯扩军到了48支球队,总共有104场比赛,预测所有小组赛的精确比分篇幅会非常庞大。如果你是在参加公司或朋友间的世界杯竞猜游戏(Tippspiel),我为你整理了一份最核心、最完整的赛事比分预测表

这份预测涵盖了小组赛焦点战完整的淘汰赛路径(从16强到决赛)以及附加竞猜问题,你可以直接参考填入你的竞猜表:

⚽ 一、 小组赛焦点战预测 (Gruppenphase)

在小组赛阶段,强队通常会稳扎稳打,比分不会过于夸张。

🇺🇸🇨🇦🇲🇽 东道主首战:

  • 🇲🇽 墨西哥 vs 南非 🇿🇦 ➔ 2:0
  • 🇨🇦 加拿大 vs 波黑 🇧🇦 ➔ 1:0
  • 🇺🇸 美国 vs 巴拉圭 🇵🇾 ➔ 2:1

🏆 夺冠热门及强队小组赛典型比分:

  • 🇩🇪 德国 vs 厄瓜多尔/澳大利亚 ➔ 3:1
  • 🇪🇸 西班牙 vs 克罗地亚/新西兰 ➔ 2:0
  • 🇫🇷 法国 vs 丹麦/加拿大 ➔ 2:1
  • 🇦🇷 阿根廷 vs 澳大利亚/秘鲁 ➔ 2:0
  • 🏴󠁧󠁢󠁥󠁮󠁧󠁿 英格兰 vs 塞尔维亚/斯洛文尼亚 ➔ 2:0
  • 🇧🇷 巴西 vs 苏格兰/摩洛哥 ➔ 2:1

🏆 二、 完整淘汰赛路径预测 (K.O.-Phase)

淘汰赛阶段通常防守更严密,1球或2球的优势是常态,部分比赛会拖入点球大战。

🥊 1/8 决赛 (Achtelfinale) – 核心场次

  • 🇪🇸 西班牙 2:0 日本 🇯🇵
  • 🇩🇪 德国 2:1 墨西哥 🇲🇽
  • 🇫🇷 法国 3:0 波兰 🇵🇱
  • 🇦🇷 阿根廷 2:1 澳大利亚 🇦🇺
  • 🏴󠁧󠁢󠁥󠁮󠁧󠁿 英格兰 1:0 哥伦比亚 🇨🇴
  • 🇧🇷 巴西 2:0 瑞士 🇨🇭
  • 🇵🇹 葡萄牙 1:0 乌拉圭 🇺🇾
  • 🇳🇱 荷兰 2:1 美国 🇺🇸 (东道主遗憾止步16强)

🔥 1/4 决赛 (Viertelfinale)

  • 🇪🇸 西班牙 2:1 德国 🇩🇪 (焦点大战)
  • 🇫🇷 法国 2:1 阿根廷 🇦🇷 (卫冕冠军出局)
  • 🏴󠁧󠁢󠁥󠁮󠁧󠁿 英格兰 1:1 (点球 4:3) 巴西 🇧🇷
  • 🇵🇹 葡萄牙 2:1 荷兰 🇳🇱

🚀 半决赛 (Halbfinale)

  • 🇪🇸 西班牙 2:1 法国 🇫🇷
  • 🏴󠁧󠁢󠁥󠁮󠁧󠁿 英格兰 2:1 葡萄牙 🇵🇹

🥉 季军战 (Spiel um Platz 3)

  • 🇫🇷 法国 2:0 葡萄牙 🇵🇹

🏆 决赛 (Finale) – 新泽西大都会人寿体育场

  • 🇪🇸 西班牙 2 : 1 英格兰 🏴󠁧󠁢󠁥󠁮󠁧󠁿 (预测:西班牙凭借更强大的中场控制力(如罗德里、佩德里)和边路爆点(亚马尔)在常规时间或加时赛中绝杀英格兰,队史第二次捧杯!)

🎯 三、 附加竞猜问题 (Rahmenwetten)

如果你的竞猜表有这些问题,可以直接填这些答案:

  1. 🏆 世界杯冠军 (Weltmeister): 西班牙 (Spanien)
  2. 🥇 最佳射手/金靴 (Torschützenkönig): 基利安·姆巴佩 (Kylian Mbappé)预测进球数:6球
  3. 🌟 最佳年轻球员 (Bester junger Spieler): 拉明·亚马尔 (Lamine Yamal / 西班牙)
  4. 🧤 最佳门将 (Bester Torwart): 埃米利亚诺·马丁内斯 (Emiliano Martínez / 阿根廷)乌奈·西蒙 (Unai Simón / 西班牙)
  5. ⚽ 决赛比分 (Ergebnis des Finales): 2:1
  6. 📊 赛事总进球数 (Anzahl der Turniertore): 268球 (48队104场比赛,场均约2.5-2.8球)

💡 提示 (Tipp): 在填写竞猜表时,淘汰赛阶段的比分尽量多写 1:0, 2:0, 2:1, 1:1 这种小比分,这在世界杯淘汰赛中出现的概率远高于大比分(如3:2或4:1)。祝你的竞猜表取得好成绩(Viel Glück beim Tippspiel)!

RNAseq processing (Data_Tam_RNAseq_2026_Dicl_Mero_Azith_Rifa_on_AYE)

complete_deg_pipeline.R

complete_deg_pipeline_custom_cutoff.R

1.R

  1. Preparing raw data

     mkdir raw_data; cd raw_data
    
     # control samples (8)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_1.fq.gz AYE-WT_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_2.fq.gz AYE-WT_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_1.fq.gz AYE-WT_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_2.fq.gz AYE-WT_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_1.fq.gz AYE-WT_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_2.fq.gz AYE-WT_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_1.fq.gz AYE-T_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_2.fq.gz AYE-T_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_1.fq.gz AYE-T_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_2.fq.gz AYE-T_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_1.fq.gz AYE-T_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_2.fq.gz AYE-T_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_1.fq.gz AYE-O_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_2.fq.gz AYE-O_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_1.fq.gz AYE-O_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_2.fq.gz AYE-O_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_1.fq.gz AYE-O_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_2.fq.gz AYE-O_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_1.fq.gz O-Trans_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_2.fq.gz O-Trans_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_1.fq.gz O-Trans_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_2.fq.gz O-Trans_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_1.fq.gz O-Trans_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_2.fq.gz O-Trans_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_1.fq.gz WT-Trans_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_2.fq.gz WT-Trans_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_1.fq.gz WT-Trans_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_2.fq.gz WT-Trans_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_1.fq.gz WT-Trans_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_2.fq.gz WT-Trans_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_1.fq.gz AYE-WT_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_2.fq.gz AYE-WT_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_1.fq.gz AYE-WT_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_2.fq.gz AYE-WT_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_1.fq.gz AYE-WT_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_2.fq.gz AYE-WT_ctr_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_1.fq.gz AYE-O_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_2.fq.gz AYE-O_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_1.fq.gz AYE-O_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_2.fq.gz AYE-O_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_1.fq.gz AYE-O_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_2.fq.gz AYE-O_ctr_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_1.fq.gz AYE-T_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_2.fq.gz AYE-T_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_1.fq.gz AYE-T_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_2.fq.gz AYE-T_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_1.fq.gz AYE-T_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_2.fq.gz AYE-T_ctr_solid_r3_R2.fastq.gz
    
     # Diclofenac(双氯芬酸)treatment (6)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_1.fq.gz AYE-WT_Diclo750_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_2.fq.gz AYE-WT_Diclo750_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_1.fq.gz AYE-WT_Diclo750_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_2.fq.gz AYE-WT_Diclo750_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_1.fq.gz AYE-WT_Diclo750_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_2.fq.gz AYE-WT_Diclo750_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_1.fq.gz AYE-T_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_2.fq.gz AYE-T_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_1.fq.gz AYE-T_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_2.fq.gz AYE-T_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_1.fq.gz AYE-T_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_2.fq.gz AYE-T_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_1.fq.gz AYE-O_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_2.fq.gz AYE-O_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_1.fq.gz AYE-O_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_2.fq.gz AYE-O_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_1.fq.gz AYE-O_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_2.fq.gz AYE-O_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_1.fq.gz O-Trans_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_2.fq.gz O-Trans_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_1.fq.gz O-Trans_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_2.fq.gz O-Trans_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_1.fq.gz O-Trans_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_2.fq.gz O-Trans_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_1.fq.gz WT-Trans_Diclo750_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_2.fq.gz WT-Trans_Diclo750_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_1.fq.gz WT-Trans_Diclo750_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_2.fq.gz WT-Trans_Diclo750_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_1.fq.gz WT-Trans_Diclo750_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_2.fq.gz WT-Trans_Diclo750_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_1.fq.gz AYE-WT_Diclo1250_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_2.fq.gz AYE-WT_Diclo1250_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_1.fq.gz AYE-WT_Diclo1250_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_2.fq.gz AYE-WT_Diclo1250_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_1.fq.gz AYE-WT_Diclo1250_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_2.fq.gz AYE-WT_Diclo1250_solid_r3_R2.fastq.gz
    
     # Rifampicin(利福平)treatment (4)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_1.fq.gz AYE-WT_Rifampicin1.5_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_2.fq.gz AYE-WT_Rifampicin1.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_1.fq.gz AYE-WT_Rifampicin1.5_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_2.fq.gz AYE-WT_Rifampicin1.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_1.fq.gz AYE-WT_Rifampicin1.5_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_2.fq.gz AYE-WT_Rifampicin1.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_1.fq.gz AYE-T_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_2.fq.gz AYE-T_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_1.fq.gz AYE-T_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_2.fq.gz AYE-T_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_1.fq.gz AYE-T_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_2.fq.gz AYE-T_Rifampicin2_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_1.fq.gz AYE-O_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_2.fq.gz AYE-O_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_1.fq.gz AYE-O_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_2.fq.gz AYE-O_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_1.fq.gz AYE-O_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_2.fq.gz AYE-O_Rifampicin2_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_1.fq.gz O-Trans_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_2.fq.gz O-Trans_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_1.fq.gz O-Trans_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_2.fq.gz O-Trans_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_1.fq.gz O-Trans_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_2.fq.gz O-Trans_Rifampicin2_r3_R2.fastq.gz
    
     # Meropenem(美罗培南)treatment (4)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_1.fq.gz AYE-WT_Mero0.35-0.5_r1_R1.fastq.gz  #AYE-WT_Mero0.5_r1
     ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_2.fq.gz AYE-WT_Mero0.35-0.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_1.fq.gz AYE-WT_Mero0.35-0.5_r2_R1.fastq.gz  #AYE-WT_YX_Mero0.35_r2
     ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_2.fq.gz AYE-WT_Mero0.35-0.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_1.fq.gz AYE-WT_Mero0.35-0.5_r3_R1.fastq.gz  #AYE-WT_public_Mero0.35_r3
     ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_2.fq.gz AYE-WT_Mero0.35-0.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_1.fq.gz AYE-T_Mero0.15_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_2.fq.gz AYE-T_Mero0.15_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_1.fq.gz AYE-T_Mero0.15_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_2.fq.gz AYE-T_Mero0.15_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_1.fq.gz AYE-T_Mero0.15_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_2.fq.gz AYE-T_Mero0.15_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_1.fq.gz AYE-O_Mero0.5_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_2.fq.gz AYE-O_Mero0.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_1.fq.gz AYE-O_Mero0.5_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_2.fq.gz AYE-O_Mero0.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_1.fq.gz AYE-O_Mero0.5_r3_R1.fastq.gz  #Mero0.45
     ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_2.fq.gz AYE-O_Mero0.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_1.fq.gz O-Trans_Mero0.25_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_2.fq.gz O-Trans_Mero0.25_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_1.fq.gz O-Trans_Mero0.25_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_2.fq.gz O-Trans_Mero0.25_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_1.fq.gz O-Trans_Mero0.25_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_2.fq.gz O-Trans_Mero0.25_r3_R2.fastq.gz
    
     # Azithromycin(阿奇霉素)treatment (5), among them, F_ctr_solid is clinical isolate.
     ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_1.fq.gz F_ctr_solid_r1_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_2.fq.gz F_ctr_solid_r1_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_1.fq.gz F_ctr_solid_r2_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_2.fq.gz F_ctr_solid_r2_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_1.fq.gz F_ctr_solid_r3_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_2.fq.gz F_ctr_solid_r3_R2.fastq.gz  #clinical
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_1.fq.gz AYE-WT_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_2.fq.gz AYE-WT_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_1.fq.gz AYE-WT_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_2.fq.gz AYE-WT_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_1.fq.gz AYE-WT_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_2.fq.gz AYE-WT_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_1.fq.gz AYE-T_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_2.fq.gz AYE-T_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_1.fq.gz AYE-T_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_2.fq.gz AYE-T_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_1.fq.gz AYE-T_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_2.fq.gz AYE-T_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_1.fq.gz AYE-O_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_2.fq.gz AYE-O_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_1.fq.gz AYE-O_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_2.fq.gz AYE-O_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_1.fq.gz AYE-O_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_2.fq.gz AYE-O_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_1.fq.gz F_Azi20_solid_r1_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_2.fq.gz F_Azi20_solid_r1_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_1.fq.gz F_Azi20_solid_r2_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_2.fq.gz F_Azi20_solid_r2_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_1.fq.gz F_Azi20_solid_r3_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_2.fq.gz F_Azi20_solid_r3_R2.fastq.gz  #clinical
  2. Preparing the directory trimmed

     mkdir trimmed trimmed_unpaired;
     for sample_id in AYE-O_Azi20_solid_r1 AYE-O_Azi20_solid_r2 AYE-O_Azi20_solid_r3 AYE-O_ctr_r1 AYE-O_ctr_r2 AYE-O_ctr_r3 AYE-O_ctr_solid_r1 AYE-O_ctr_solid_r2 AYE-O_ctr_solid_r3 AYE-O_Diclo375_r1 AYE-O_Diclo375_r2 AYE-O_Diclo375_r3 AYE-O_Mero0.5_r1 AYE-O_Mero0.5_r2 AYE-O_Mero0.5_r3 AYE-O_Rifampicin2_r1 AYE-O_Rifampicin2_r2 AYE-O_Rifampicin2_r3 AYE-T_Azi20_solid_r1 AYE-T_Azi20_solid_r2 AYE-T_Azi20_solid_r3 AYE-T_ctr_r1 AYE-T_ctr_r2 AYE-T_ctr_r3 AYE-T_ctr_solid_r1 AYE-T_ctr_solid_r2 AYE-T_ctr_solid_r3 AYE-T_Diclo375_r1 AYE-T_Diclo375_r2 AYE-T_Diclo375_r3 AYE-T_Mero0.15_r1 AYE-T_Mero0.15_r2 AYE-T_Mero0.15_r3 AYE-T_Rifampicin2_r1 AYE-T_Rifampicin2_r2 AYE-T_Rifampicin2_r3 AYE-WT_Azi20_solid_r1 AYE-WT_Azi20_solid_r2 AYE-WT_Azi20_solid_r3 AYE-WT_ctr_r1 AYE-WT_ctr_r2 AYE-WT_ctr_r3 AYE-WT_ctr_solid_r1 AYE-WT_ctr_solid_r2 AYE-WT_ctr_solid_r3 AYE-WT_Diclo1250_solid_r1 AYE-WT_Diclo1250_solid_r2 AYE-WT_Diclo1250_solid_r3 AYE-WT_Diclo750_r1 AYE-WT_Diclo750_r2 AYE-WT_Diclo750_r3 AYE-WT_Mero0.35-0.5_r1 AYE-WT_Mero0.35-0.5_r2 AYE-WT_Mero0.35-0.5_r3 AYE-WT_Rifampicin1.5_r1 AYE-WT_Rifampicin1.5_r2 AYE-WT_Rifampicin1.5_r3 F_Azi20_solid_r1 F_Azi20_solid_r2 F_Azi20_solid_r3 F_ctr_solid_r1 F_ctr_solid_r2 F_ctr_solid_r3 O-Trans_ctr_r1 O-Trans_ctr_r2 O-Trans_ctr_r3 O-Trans_Diclo375_r1 O-Trans_Diclo375_r2 O-Trans_Diclo375_r3 O-Trans_Mero0.25_r1 O-Trans_Mero0.25_r2 O-Trans_Mero0.25_r3 O-Trans_Rifampicin2_r1 O-Trans_Rifampicin2_r2 O-Trans_Rifampicin2_r3 WT-Trans_ctr_r1 WT-Trans_ctr_r2 WT-Trans_ctr_r3 WT-Trans_Diclo750_r1 WT-Trans_Diclo750_r2 WT-Trans_Diclo750_r3; do \
     for sample_id in AYE-T_Diclo375_r2; do \
             java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fastq.gz raw_data/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fastq.gz trimmed_unpaired/${sample_id}_R1.fastq.gz trimmed/${sample_id}_R2.fastq.gz trimmed_unpaired/${sample_id}_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
     done
  3. (Optional) using trinity to find the most closely reference

     #Trinity --seqType fq --max_memory 50G --left trimmed/wt_r1_R1.fastq.gz  --right trimmed/wt_r1_R2.fastq.gz --CPU 12
    
     #https://www.genome.jp/kegg/tables/br08606.html#prok
     acb     KGB     Acinetobacter baumannii ATCC 17978  2007    GenBank
     abm     KGB     Acinetobacter baumannii SDF     2008    GenBank
     aby     KGB     Acinetobacter baumannii AYE     2008    GenBank --> *
     abc     KGB     Acinetobacter baumannii ACICU   2008    GenBank
     abn     KGB     Acinetobacter baumannii AB0057  2008    GenBank
     abb     KGB     Acinetobacter baumannii AB307-0294  2008    GenBank
     abx     KGB     Acinetobacter baumannii 1656-2  2012    GenBank
     abz     KGB     Acinetobacter baumannii MDR-ZJ06    2012    GenBank
     abr     KGB     Acinetobacter baumannii MDR-TJ  2012    GenBank
     abd     KGB     Acinetobacter baumannii TCDC-AB0715     2012    GenBank
     abh     KGB     Acinetobacter baumannii TYTH-1  2012    GenBank
     abad    KGB     Acinetobacter baumannii D1279779    2013    GenBank
     abj     KGB     Acinetobacter baumannii BJAB07104   2013    GenBank
     abab    KGB     Acinetobacter baumannii BJAB0715    2013    GenBank
     abaj    KGB     Acinetobacter baumannii BJAB0868    2013    GenBank
     abaz    KGB     Acinetobacter baumannii ZW85-1  2013    GenBank
     abk     KGB     Acinetobacter baumannii AbH12O-A2   2014    GenBank
     abau    KGB     Acinetobacter baumannii AB030   2014    GenBank
     abaa    KGB     Acinetobacter baumannii AB031   2014    GenBank
     abw     KGB     Acinetobacter baumannii AC29    2014    GenBank
     abal    KGB     Acinetobacter baumannii LAC-4   2015    GenBank
     #Note that the Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome (GenBank: CU459141.1) was choosen as reference!
  4. Preparing samplesheet.csv

     sample,fastq_1,fastq_2,strandedness
     Urine_r1,Urine_r1_R1.fq.gz,Urine_r1_R2.fq.gz,auto
     ...
  5. Downloading CU459141.fasta and CU459141.gff from GenBank and preparing CU459141_m.gff

     #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/
     #Default NOT_WORKING: --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon'
     #(host_env) !NOT_WORKING! jhuang@WS-2290C:~/DATA/Data_Tam_RNAseq_2024$ /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
    
     # -- DEBUG_1 (CDS --> exon in CP059040.gff) --
     #Checking the record (see below) in results/genome/CP059040.gtf
     #In ./results/genome/CP059040.gtf e.g. "CP059040.1      Genbank transcript      1       1398    .       +       .       transcript_id "gene-H0N29_00005"; gene_id "gene-H0N29_00005"; gene_name "dnaA"; Name "dnaA"; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "H0N29_00005";"
     #--featurecounts_feature_type 'transcript' returns only the tRNA results
     #Since the tRNA records have "transcript and exon". In gene records, we have "transcript and CDS". replace the CDS with exon
    
     grep -P "\texon\t" CP059040.gff | sort | wc -l    #96
     grep -P "cmsearch\texon\t" CP059040.gff | wc -l    #=10  ignal recognition particle sRNA small typ, transfer-messenger RNA, 5S ribosomal RNA
     grep -P "Genbank\texon\t" CP059040.gff | wc -l    #=12  16S and 23S ribosomal RNA
     grep -P "tRNAscan-SE\texon\t" CP059040.gff | wc -l    #tRNA 74
     wc -l star_salmon/AUM_r3/quant.genes.sf  #--featurecounts_feature_type 'transcript' results in 96 records!
    
     grep -P "\tCDS\t" CU459141.gff3 | wc -l  #3659
     sed 's/\tCDS\t/\texon\t/g' CU459141.gff3 > CU459141_m.gff
     grep -P "\texon\t" CU459141_m.gff | sort | wc -l  #3760
    
     # -- DEBUG_2: combination of 'CU459141_m.gff' and 'exon' results in ERROR, using 'transcript' instead!
     --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff" --featurecounts_feature_type 'transcript'
    
     # -- DEBUG_3: make sure the header of fasta is the same to the *_m.gff file
  6. nextflow run

     # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
     (host_env) mv trimmed/*.fastq.gz .
     (host_env) nextflow run nf-core/rnaseq -r 3.14.0 -profile docker \

    –input samplesheet.csv –outdir results –fasta “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141.fasta” –gff “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff” -resume –max_cpus 90 –max_memory 900.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –gtf_group_features ‘gene_id’ –gtf_extra_attributes ‘gene_name’ –featurecounts_group_type ‘gene_biotype’ –featurecounts_feature_type ‘transcript’

  7. Import data and pca-plot

     #mamba activate r_env
    
     #install.packages("ggfun")
     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     #library("org.Hs.eg.db")
     library(dplyr)
     library(tidyverse)
     #install.packages("devtools")
     #devtools::install_version("gtable", version = "0.3.0")
     library(gplots)
     library("RColorBrewer")
     #install.packages("ggrepel")
     library("ggrepel")
     # install.packages("openxlsx")
     library(openxlsx)
     library(EnhancedVolcano)
     library(DESeq2)
     library(edgeR)
    
     setwd("~/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon")
     # Define paths to your Salmon output quantification files
    
     # Store sample names in a character vector
     samples <- c(
         "AYE-O_Azi20_solid_r1", "AYE-O_Azi20_solid_r2", "AYE-O_Azi20_solid_r3", "AYE-O_ctr_r1", "AYE-O_ctr_r2",
         "AYE-O_ctr_r3", "AYE-O_ctr_solid_r1", "AYE-O_ctr_solid_r2", "AYE-O_ctr_solid_r3",
         "AYE-O_Diclo375_r1", "AYE-O_Diclo375_r2", "AYE-O_Diclo375_r3", "AYE-O_Mero0.5_r1",
         "AYE-O_Mero0.5_r2", "AYE-O_Mero0.5_r3", "AYE-O_Rifampicin2_r1", "AYE-O_Rifampicin2_r2",
         "AYE-O_Rifampicin2_r3", "AYE-T_Azi20_solid_r1", "AYE-T_Azi20_solid_r2", "AYE-T_Azi20_solid_r3",
         "AYE-T_ctr_r1", "AYE-T_ctr_r2", "AYE-T_ctr_r3", "AYE-T_ctr_solid_r1", "AYE-T_ctr_solid_r2",
         "AYE-T_ctr_solid_r3", "AYE-T_Diclo375_r1", "AYE-T_Diclo375_r2", "AYE-T_Diclo375_r3",
         "AYE-T_Mero0.15_r1", "AYE-T_Mero0.15_r2", "AYE-T_Mero0.15_r3", "AYE-T_Rifampicin2_r1",
         "AYE-T_Rifampicin2_r2", "AYE-T_Rifampicin2_r3", "AYE-WT_Azi20_solid_r1", "AYE-WT_Azi20_solid_r2",
         "AYE-WT_Azi20_solid_r3", "AYE-WT_ctr_r1", "AYE-WT_ctr_r2", "AYE-WT_ctr_r3", "AYE-WT_ctr_solid_r1",
         "AYE-WT_ctr_solid_r2", "AYE-WT_ctr_solid_r3", "AYE-WT_Diclo1250_solid_r1", "AYE-WT_Diclo1250_solid_r2",
         "AYE-WT_Diclo1250_solid_r3", "AYE-WT_Diclo750_r1", "AYE-WT_Diclo750_r2", "AYE-WT_Diclo750_r3",
         "AYE-WT_Mero0.35-0.5_r1", "AYE-WT_Mero0.35-0.5_r2", "AYE-WT_Mero0.35-0.5_r3", "AYE-WT_Rifampicin1.5_r1",
         "AYE-WT_Rifampicin1.5_r2", "AYE-WT_Rifampicin1.5_r3", "F_Azi20_solid_r1", "F_Azi20_solid_r2",
         "F_Azi20_solid_r3", "F_ctr_solid_r1", "F_ctr_solid_r2", "F_ctr_solid_r3", "O-Trans_ctr_r1",
         "O-Trans_ctr_r2", "O-Trans_ctr_r3", "O-Trans_Diclo375_r1", "O-Trans_Diclo375_r2", "O-Trans_Diclo375_r3",
         "O-Trans_Mero0.25_r1", "O-Trans_Mero0.25_r2", "O-Trans_Mero0.25_r3", "O-Trans_Rifampicin2_r1",
         "O-Trans_Rifampicin2_r2", "O-Trans_Rifampicin2_r3", "WT-Trans_ctr_r1", "WT-Trans_ctr_r2",
         "WT-Trans_ctr_r3", "WT-Trans_Diclo750_r1", "WT-Trans_Diclo750_r2", "WT-Trans_Diclo750_r3"
     )
    
     ## Automatically generate the named vector
     files <- setNames(paste0("./", samples, "/quant.sf"), samples)
    
     # -----------------------------------------------------------------
     # ---- Step 1: Create Detailed Metadata from Your Sample Names ----
    
     # Extract metadata from sample names
     samples <- names(files)
    
     # Parse the complex sample names
     metadata <- data.frame(
     sample = samples,
     stringsAsFactors = FALSE
     )
    
     # Extract strain (everything before first underscore or hyphen treatment)
     metadata$strain <- sapply(strsplit(samples, "[-_]"), function(x) {
     if(x[1] %in% c("AYE", "O", "WT", "F")) {
         if(x[1] == "AYE" && length(x) > 1 && x[2] %in% c("WT", "T", "O")) {
         paste(x[1:2], collapse = "-")
         } else if(x[1] %in% c("O", "WT") && x[2] == "Trans") {
         paste(x[1:2], collapse = "-")
         } else {
         x[1]
         }
     } else {
         x[1]
     }
     })
    
     # Extract treatment type
     metadata$treatment <- sapply(samples, function(x) {
     if(grepl("_ctr", x)) return("ctrl")
     if(grepl("Diclo", x)) return("Diclo")
     if(grepl("Mero", x)) return("Mero")
     if(grepl("Azi", x)) return("Azi")
     if(grepl("Rifampicin", x)) return("Rifampicin")
     return("ctrl")
     })
    
     # Extract concentration
     metadata$concentration <- sapply(samples, function(x) {
     if(grepl("Diclo1250", x)) return("1250")
     if(grepl("Diclo750", x)) return("750")
     if(grepl("Diclo375", x)) return("375")
     if(grepl("Mero0.5", x)) return("0.5")
     if(grepl("Mero0.35", x)) return("0.35")
     if(grepl("Mero0.25", x)) return("0.25")
     if(grepl("Mero0.15", x)) return("0.15")
     if(grepl("Azi20", x)) return("20")
     if(grepl("Rifampicin2", x)) return("2")
     if(grepl("Rifampicin1.5", x)) return("1.5")
     return("0")
     })
    
     # Extract condition (solid vs liquid)
     metadata$condition <- ifelse(grepl("_solid", samples), "solid", "liquid")
    
     # Extract replicate
     metadata$replicate <- sapply(strsplit(samples, "_"), function(x) {
     rep_part <- x[length(x)]
     gsub("r", "", rep_part)
     })
    
     # Create combined group for easy comparisons
     metadata$group <- paste(metadata$strain, metadata$treatment, metadata$concentration, sep = "_")
    
     # Set row names
     rownames(metadata) <- metadata$sample
    
     # Reorder to match txi columns
     metadata <- metadata[colnames(txi$counts), ]
    
     # ---------------------------------------------
     # ---- Step 2: Choose Your Design Strategy ----
    
     # Strategy A: Full Factorial Design (if balanced)
     dds <- DESeqDataSetFromTximport(txi, metadata,
                              design = ~ strain + treatment + condition)
    
     # --> Strategy B: Combined Group Factor ⭐ RECOMMENDED
     metadata$group <- factor(paste(metadata$strain,
                                     metadata$treatment,
                                     metadata$concentration,
                                     metadata$condition,
                                     sep = "_"))
    
     dds <- DESeqDataSetFromTximport(txi, metadata,
                                     design = ~ group)
     dds <- DESeq(dds)
    
     # See all available comparisons
     resultsNames(dds)
    
     # -------------------------------------------------------------
     # ---- Step 3: Set Up Specific Comparisons from Your Notes ----
     # ==========================================
     # 1. Define Exact Comparisons from Your Notes
     # ==========================================
     planned_comparisons <- list(
     # --- Baseline / Strain Controls ---
     AYE_T_ctr_vs_AYE_WT_ctr            = list(treat = "AYE-T_ctrl_0_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_vs_AYE_WT_ctr            = list(treat = "AYE-O_ctrl_0_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_ctr_vs_AYE_WT_ctr          = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     WT_Trans_ctr_vs_AYE_WT_ctr         = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_vs_AYE_T                 = list(treat = "AYE-O_ctrl_0_liquid",   ctrl = "AYE-T_ctrl_0_liquid"),
     O_Trans_ctr_vs_AYE_T               = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
     WT_Trans_ctr_vs_AYE_T              = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-T_ctrl_0_liquid"),
    
     # --- Condition Effects (Solid vs Liquid) ---
     AYE_WT_ctr_solid_vs_AYE_WT_ctr     = list(treat = "AYE-WT_ctrl_0_solid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_solid_vs_AYE_O_ctr       = list(treat = "AYE-O_ctrl_0_solid",    ctrl = "AYE-O_ctrl_0_liquid"),
     AYE_T_ctr_solid_vs_AYE_T_ctr       = list(treat = "AYE-T_ctrl_0_solid",    ctrl = "AYE-T_ctrl_0_liquid"),
     AYE_O_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-O_ctrl_0_solid",    ctrl = "AYE-WT_ctrl_0_solid"),
     AYE_T_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-T_ctrl_0_solid",    ctrl = "AYE-WT_ctrl_0_solid"),
    
     # --- Diclofenac ---
     AYE_WT_Diclo750_vs_AYE_WT_ctr      = list(treat = "AYE-WT_Diclo_750_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Diclo375_vs_AYE_WT_ctr       = list(treat = "AYE-T_Diclo_375_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_Diclo375_vs_AYE_WT_ctr       = list(treat = "AYE-O_Diclo_375_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_Diclo375_vs_AYE_WT_ctr     = list(treat = "O-Trans_Diclo_375_liquid",  ctrl = "AYE-WT_ctrl_0_liquid"),
     WT_Trans_Diclo750_vs_AYE_WT_ctr    = list(treat = "WT-Trans_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     Diclo_AYE_WT_1250_solid_vs_solid_ctr = list(treat = "AYE-WT_Diclo_1250_solid", ctrl = "AYE-WT_ctrl_0_solid"),
    
     # --- Meropenem ---
     AYE_WT_Mero_vs_AYE_WT_ctr          = list(treat = "AYE-WT_Mero_0.35_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Mero_vs_AYE_WT_ctr           = list(treat = "AYE-T_Mero_0.15_liquid",      ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_Mero_vs_AYE_WT_ctr           = list(treat = "AYE-O_Mero_0.5_liquid",       ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_Mero_vs_AYE_WT_ctr         = list(treat = "O-Trans_Mero_0.25_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Mero_vs_AYE_T_ctr            = list(treat = "AYE-T_Mero_0.15_liquid",      ctrl = "AYE-T_ctrl_0_liquid"),
    
     # --- Azithromycin (Solid) ---
     AYE_WT_Azi_vs_solid_ctr            = list(treat = "AYE-WT_Azi_20_solid", ctrl = "AYE-WT_ctrl_0_solid"),
     AYE_T_Azi_vs_solid_ctr             = list(treat = "AYE-T_Azi_20_solid",  ctrl = "AYE-T_ctrl_0_solid"),
     AYE_O_Azi_vs_solid_ctr             = list(treat = "AYE-O_Azi_20_solid",  ctrl = "AYE-O_ctrl_0_solid"),
     F_Azi_vs_F_solid_ctr               = list(treat = "F_Azi_20_solid",      ctrl = "F_ctrl_0_solid"),
    
     # --- Rifampicin ---
     AYE_WT_Rif_vs_AYE_WT_ctr           = list(treat = "AYE-WT_Rifampicin_1.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Rif_vs_AYE_T_ctr             = list(treat = "AYE-T_Rifampicin_2_liquid",    ctrl = "AYE-T_ctrl_0_liquid"),
     AYE_O_Rif_vs_AYE_O_ctr             = list(treat = "AYE-O_Rifampicin_2_liquid",    ctrl = "AYE-O_ctrl_0_liquid"),
     O_Trans_Rif_vs_O_Trans_ctr         = list(treat = "O-Trans_Rifampicin_2_liquid",  ctrl = "O-Trans_ctrl_0_liquid")
     )
    
     # ==========================================
     # 2. Verification & Validation Script
     # ==========================================
     # Identify which column in colData holds your group names
     group_col <- if("group" %in% colnames(colData(dds))) "group" else
                 if("treatment" %in% colnames(colData(dds))) "treatment" else
                 stop("❌ Please specify the correct colData column containing group names.")
    
     actual_groups <- unique(colData(dds)[[group_col]])
    
     cat("\n", paste(rep("=", 85), collapse=""), "\n")
     cat("📋 VERIFICATION OF NOTE-DERIVED COMPARISONS\n")
     cat(paste(rep("=", 85), collapse=""), "\n\n")
    
     validation_results <- data.frame(
     Comparison_Name = character(),
     Treatment_String = character(),
     Control_String = character(),
     Status = character(),
     Suggested_Contrast = character(),
     stringsAsFactors = FALSE
     )
    
     for(name in names(planned_comparisons)) {
     trt <- planned_comparisons[[name]]$treat
     ctl <- planned_comparisons[[name]]$ctrl
    
     # Find closest matches in actual data
     trt_match <- actual_groups[grepl(trt, actual_groups, fixed = TRUE)]
     ctl_match <- actual_groups[grepl(ctl, actual_groups, fixed = TRUE)]
    
     status <- if(length(trt_match) > 0 && length(ctl_match) > 0) "✅ VALID" else "⚠️  CHECK"
     contrast_str <- if(status == "✅ VALID")
         paste0('c("', group_col, '", "', trt_match[1], '", "', ctl_match[1], '")') else "N/A"
    
     validation_results <- rbind(validation_results, data.frame(
         Comparison_Name = name,
         Treatment_String = trt,
         Control_String = ctl,
         Status = status,
         Suggested_Contrast = contrast_str,
         stringsAsFactors = FALSE
     ))
    
     cat(sprintf("%-45s | T:%-25s C:%-20s | %s\n", name, trt, ctl, status))
     if(status == "⚠️  CHECK") {
         if(length(trt_match) == 0) cat("   🔍 Treat not found. Closest: ", paste(head(actual_groups[grepl(strsplit(trt, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
         if(length(ctl_match) == 0) cat("   🔍 Ctrl not found.  Closest: ", paste(head(actual_groups[grepl(strsplit(ctl, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
     }
     }
    
     # ==========================================
     # 3. Auto-Generate DESeq2 results() Calls (Optional)
     # ==========================================
     valid_comparisons <- validation_results[validation_results$Status == "✅ VALID", ]
     if(nrow(valid_comparisons) > 0) {
     cat("\n📜 READY-TO-RUN DESeq2 CONTRASTS:\n")
     cat(paste(rep("-", 60), collapse=""), "\n")
     for(i in seq_len(nrow(valid_comparisons))) {
         cat(sprintf('res_%s <- results(dds, contrast = %s)\n',
                     gsub("[^A-Za-z0-9]", "_", valid_comparisons$Comparison_Name[i]),
                     valid_comparisons$Suggested_Contrast[i]))
     }
     } else {
     cat("\n⚠️  No exact matches found. Check your colData group naming convention.\n")
     }
    
     # -----------------------------
     # ---- Step 4: PCA figures ----
    
     # 🔍 What each figure shows:
     #
     #    01_PCA_by_Strain.png → Tests if genetic background (AYE-WT, AYE-T, AYE-O, Trans, F) is the dominant source of variation.
     #    02_PCA_by_Treatment.png → Shows clustering by antibiotic/drug exposure (ctrl, Diclo, Mero, Azi, Rifampicin).
     #    03_PCA_by_Condition.png → Reveals batch/growth media effects (solid vs liquid).
     #    04_PCA_CombinedGroups.png → Full experimental grouping with labeled sample names for quick outlier detection.
     #    05_PCA_Ellipses.png → Adds 95% confidence boundaries per strain to visualize group spread and overlap.
     #
     # ⚠️ Quick Checklist Before Running:
     #
     #    Ensure metadata columns (strain, treatment, condition, group) are attached to colData(dds).
     #    If ggrepel is missing, run install.packages("ggrepel").
     #    All PNGs will save to your current working directory (getwd()).
    
     # Install if missing: install.packages(c("ggplot2", "ggrepel"))
     library(DESeq2)
     library(ggplot2)
     library(ggrepel)
    
     # 1. Variance Stabilizing Transformation & Extract PCA Data
     vsd <- vst(dds, blind = FALSE)
     pca_data <- plotPCA(vsd, intgroup = c("strain", "treatment", "condition", "group"), returnData = TRUE)
     percent_var <- round(100 * attr(pca_data, "percentVar"))
    
     # Consistent theme for all plots
     base_theme <- theme_bw(base_size = 12) +
     theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
             legend.position = "right",
             legend.title = element_text(face = "bold"),
             panel.grid.major = element_line(color = "grey90"),
             panel.grid.minor = element_blank())
    
     # --- Plot 1: Colored by Strain ---
     p1 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, shape = condition)) +
     geom_point(size = 3, alpha = 0.8) +
     geom_text_repel(aes(label = name), size = 2.5, max.overlaps = 20, show.legend = FALSE) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Strain",
         color = "Strain", shape = "Condition") +
     base_theme
     ggsave("01_PCA_by_Strain.png", p1, width = 8, height = 6, dpi = 300)
    
     # --- Plot 2: Colored by Treatment ---
     p2 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = treatment, shape = condition)) +
     geom_point(size = 3, alpha = 0.8) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Treatment",
         color = "Treatment", shape = "Condition") +
     base_theme
     ggsave("02_PCA_by_Treatment.png", p2, width = 8, height = 6, dpi = 300)
    
     # --- Plot 3: Colored by Condition (Solid vs Liquid) ---
     p3 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = condition, shape = strain)) +
     geom_point(size = 3, alpha = 0.8) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Growth Condition",
         color = "Condition", shape = "Strain") +
     base_theme
     ggsave("03_PCA_by_Condition.png", p3, width = 8, height = 6, dpi = 300)
    
     # --- Plot 4: Combined Groups with Sample Labels ---
     p4 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = group)) +
     geom_point(size = 3, alpha = 0.8) +
     geom_text_repel(aes(label = name), size = 2, max.overlaps = 30, box.padding = 0.3) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Combined Experimental Groups",
         color = "Group") +
     base_theme + theme(legend.position = "none")
     ggsave("04_PCA_CombinedGroups.png", p4, width = 9, height = 7, dpi = 300)
    
     # --- Plot 5: 95% Confidence Ellipses (by Strain) ---
     p5 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, fill = strain)) +
     geom_point(size = 3, alpha = 0.7) +
     stat_ellipse(level = 0.95, alpha = 0.2, geom = "polygon", show.legend = FALSE) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: 95% Confidence Ellipses by Strain",
         color = "Strain", fill = "Strain") +
     base_theme
     ggsave("05_PCA_Ellipses.png", p5, width = 8, height = 6, dpi = 300)
    
     message("✅ All 5 PCA plots saved to working directory!")
  8. Run Differential Expression & PCA Analysis Complete

     (r_env) cd ~/DATA/Data_Tam_RNAseq_2026_Dicl_Mero_Azith_Rifa_on_AYE/results/star_salmon/
     #(r_env) Rscript complete_deg_pipeline.R  #For standard cutoff in the project, we use complete_deg_pipeline_custom_cutoff.R
    
     # Adapted the script to the following requests:
     # (a) Rifampicin: use genes with a cutoff of log2 fold change > 1.2 and < -1.2 for the KEGG and GO analyses.
     # (b) Baseline / Strain Controls: use genes with a cutoff of log2 fold change > 1.4 and < -1.4 for the KEGG and GO analyses.
     # (c) All other comparisons: please retain the same selection criteria as in the previous analysis you sent to me.
    
     # How it works:
     #   * Rifampicin: The script looks for "Rif" in the comparison name (e.g., 28_AYE_WT_Rif_vs_Ctrl) and applies |log2FC| >= 1.2.
     #   * Baseline/Strain Controls: The script looks for "_ctr_vs_" in the comparison name (e.g., 01_AYE_T_ctr_vs_AYE_WT_ctr) and applies |log2FC| >= 1.4.
     #   * All Others: Falls back to the original 2.0 cutoff.
     #   * The console output will now explicitly print which cutoff is being used for each specific comparison.
    
     (r_env) Rscript complete_deg_pipeline_custom_cutoff.R
  9. KEGG and GO annotations in non-model organisms

    (a) Rifampicin: use genes with a cutoff of log2 fold change > 1.2 and 1.4 and < -1.4 for the KEGG and GO analyses. (c) All other comparisons: please retain the same selection criteria as in the previous analysis you sent to me.

https://www.biobam.com/functional-analysis/

10.1. Assign KEGG and GO Terms (see diagram above)

Since your organism is non-model, standard R databases (org.Hs.eg.db, etc.) won’t work. You’ll need to manually retrieve KEGG and GO annotations.

* Preparing file 1 eggnog_out.emapper.annotations.txt for the R-code below: (KEGG Terms): EggNog based on orthology and phylogenies

    EggNOG-mapper assigns both KEGG Orthology (KO) IDs and GO terms.

    Install EggNOG-mapper:

        mamba create -n eggnog_env python=3.8 eggnog-mapper -c conda-forge -c bioconda  #eggnog-mapper_2.1.12
        mamba activate eggnog_env

    Run annotation:

        #diamond makedb --in eggnog6.prots.faa -d eggnog_proteins.dmnd
        mkdir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
        download_eggnog_data.py --dbname eggnog.db -y --data_dir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
        #NOT_WORKING: emapper.py -i CP059040_gene.fasta -o eggnog_dmnd_out --cpu 60 -m diamond[hmmer,mmseqs] --dmnd_db /home/jhuang/REFs/eggnog_data/data/eggnog_proteins.dmnd
        #Download CU459141_protein_.fasta from NCBI
        python ~/Scripts/update_fasta_header.py CU459141_protein_.fasta CU459141_protein.fasta
        emapper.py -i CU459141_protein.fasta -o eggnog_out --cpu 60 --resume
        #----> result annotations.tsv: Contains KEGG, GO, and other functional annotations.
        #---->  470.IX87_14445:
            * 470 likely refers to the organism or strain (e.g., Acinetobacter baumannii ATCC 19606 or another related strain).
            * IX87_14445 would refer to a specific gene or protein within that genome.

    Extract KEGG KO IDs from annotations.emapper.annotations.

* Preparing file 2 blast2go_annot.annot2_ for the R-code below:

  - Basic (GO Terms from 'Blast2GO 5 Basic', saved in blast2go_annot.annot): Using Blast/Diamond + Blast2GO_GUI based on sequence alignment + GO mapping

    * 'Load protein sequences' (Tags: NONE, generated columns: Nr, SeqName) -->
    * Buttons 'blast' (Tags: BLASTED, generated columns: Description, Length, #Hits, e-Value, sim mean),
    * Button 'mapping' (Tags: MAPPED, generated columns: #GO, GO IDs, GO Names), "Mapping finished - Please proceed now to annotation."
    * Button 'annot' (Tags: ANNOTATED, generated columns: Enzyme Codes, Enzyme Names), "Annotation finished."
            * Used parameter 'Annotation CutOff': The Blast2GO Annotation Rule seeks to find the most specific GO annotations with a certain level of reliability. An annotation score is calculated for each candidate GO which is composed by the sequence similarity of the Blast Hit, the evidence code of the source GO and the position of the particular GO in the Gene Ontology hierarchy. This annotation score cutoff select the most specific GO term for a given GO branch which lies above this value.
            * Used parameter 'GO Weight' is a value which is added to Annotation Score of a more general/abstract Gene Ontology term for each of its more specific, original source GO terms. In this case, more general GO terms which summarise many original source terms (those ones directly associated to the Blast Hits) will have a higher Annotation Score.

  - Advanced (GO Terms from 'Blast2GO 5 Basic'): Interpro based protein families / domains --> Button interpro

    * Button 'interpro' (Tags: INTERPRO, generated columns: InterPro IDs, InterPro GO IDs, InterPro GO Names) --> "InterProScan Finished - You can now merge the obtained GO Annotations."

  - MERGE the results of InterPro GO IDs (advanced) to GO IDs (basic) and generate final GO IDs, saved in blast2go_annot.annot2

    * Button 'interpro'/'Merge InterProScan GOs to Annotation' --> "Merge (add and validate) all GO terms retrieved via InterProScan to the already existing GO annotation." --> "Finished merging GO terms from InterPro with annotations. Maybe you want to run ANNEX (Annotation Augmentation)."
    * (NOT_USED) Button 'annot'/'ANNEX' --> "ANNEX finished. Maybe you want to do the next step: Enzyme Code Mapping."

  - PREPARING go_terms and ec_terms: annot_* file (NOTE that blast2go_annot.annot2 is after merging InterPro_GO_IDs and GO_IDs):

    cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_

10.2. Perform KEGG and GO Enrichment in R

      (r_env) cd /mnt/md1/DATA/Data_Tam_RNAseq_2026_Dicl_Mero_Azith_Rifa_on_AYE/results/star_salmon/DEG_Results_Complete

      #For |deg_cutoff_log_foldchange| >=1.4
      sed "s/01_AYE_T_ctr_vs_AYE_WT_ctr/02_AYE_O_ctr_vs_AYE_WT_ctr/g" 1.R > 2.R
      ...

      #For |deg_cutoff_log_foldchange| >=2.0
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/09_AYE_O_ctr_solid_vs_liquid/g" 8.R > 9.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/10_AYE_T_ctr_solid_vs_liquid/g" 8.R > 10.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/11_AYE_O_ctr_solid_vs_AYE_WT_solid/g" 8.R > 11.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/12_AYE_T_ctr_solid_vs_AYE_WT_solid/g" 8.R > 12.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/13_AYE_WT_Diclo750_vs_Ctrl/g" 8.R > 13.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/14_AYE_T_Diclo375_vs_Ctrl/g" 8.R > 14.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/15_AYE_O_Diclo375_vs_Ctrl/g" 8.R > 15.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/16_O_Trans_Diclo375_vs_Ctrl/g" 8.R > 16.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/17_WT_Trans_Diclo750_vs_Ctrl/g" 8.R > 17.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/18_AYE_WT_Diclo1250_solid_vs_Ctrl_solid/g" 8.R > 18.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/19_AYE_WT_Mero_vs_Ctrl/g" 8.R > 19.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/20_AYE_T_Mero_vs_Ctrl/g" 8.R > 20.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/21_AYE_O_Mero_vs_Ctrl/g" 8.R > 21.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/22_O_Trans_Mero_vs_Ctrl/g" 8.R > 22.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/23_AYE_T_Mero_vs_AYE_T_Ctrl/g" 8.R > 23.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/24_AYE_WT_Azi_solid_vs_Ctrl_solid/g" 8.R > 24.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/25_AYE_T_Azi_solid_vs_Ctrl_solid/g" 8.R > 25.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/26_AYE_O_Azi_solid_vs_Ctrl_solid/g" 8.R > 26.R
      sed "s/08_AYE_WT_ctr_solid_vs_liquid/27_F_Azi_solid_vs_Ctrl_solid/g" 8.R > 27.R

      #For |deg_cutoff_log_foldchange| >=1.2
      sed "s/28_AYE_WT_Rif_vs_Ctrl/29_AYE_T_Rif_vs_Ctrl/g" 28.R > 29.R
      sed "s/28_AYE_WT_Rif_vs_Ctrl/30_AYE_O_Rif_vs_Ctrl/g" 28.R > 30.R
      sed "s/28_AYE_WT_Rif_vs_Ctrl/31_O_Trans_Rif_vs_Ctrl/g" 28.R > 31.R

      (r_env) jhuang@WS-2290C:/mnt/md1/DATA/Data_Tam_RNAseq_2026_Dicl_Mero_Azith_Rifa_on_AYE/results/star_salmon/DEG_Results_Complete$ Rscript 1.R
      #=== SUMMARY ===
      #Up-regulated genes: 16
      #  Valid KEGG IDs: 4
      #  Enriched pathways: 0
      #Down-regulated genes: 151
      #  Valid KEGG IDs: 50
      #  Enriched pathways: 4
      #'select()' returned 1:1 mapping between keys and columns
      #'select()' returned 1:1 mapping between keys and columns
      #'select()' returned 1:1 mapping between keys and columns
      #=== SUMMARY ===
      #Up-regulated genes: 16
      #  Valid GO IDs: 16
      #  Enriched GO-terms: 0
      #Down-regulated genes: 151
      #  Valid KEGG IDs: 151
      #  Enriched GO-terms: 3
      #...

  10.3. Finalizing the KEGG and GO Enrichment table

        1. NOTE (Already realized in the code): geneIDs in KEGG_Enrichment have been already translated from ko to geneID in H0N29_*-format; If not, nachmachen using eggnog-res, 因为 eggnog里有1-1-mspping Info between ko-Name and GeneID.
        2. NEED_MANUAL_DELETION (Already setting the cutoff in the code): p.adjust values have been calculated, we have to filter all records in GO_Enrichment-results by |p.adjust|<=0.05. DON'T_NEED_ANY_MORE, since pvalueCutoff = 0.05 settings in enricher. Alternative using pvalueCutoff=1.0, marked the color as yellow if the p.adjusted <= 0.05 in GO_enrichment.
        3. NOTE (Not occuring in the new dataset): In rare case, the description is missing for some IDs, e.g. GO term: GO:0006807: replace GO:0006807  obsolete nitrogen compound metabolic process;  ko00975: Metabolism, Biosynthesis of other secondary metabolites

Explanation for the “near-1” frequencies and why these weren’t reported before

Explanation for the “near-1” frequencies and why these weren’t reported before

Based on the data, here’s how to explain this:

Key Points:

  1. These are NOT newly acquired mutations during passaging

    • All three positions (15458, 22636, 24781) show frequency = 1.0 in the starting virus (hCoV229E_Rluc)
    • They remain fixed at 1.0 throughout all passages (p10, p16, p26)
    • These represent pre-existing differences between your experimental virus strain and the reference genome PP810610
  2. Why they weren’t reported in earlier analyses:

    • VPhaser2 is designed to detect intra-host variants (positions with heterogeneity, frequency < 1.0)
    • When a mutation is fixed at 100% (frequency = 1.0), it becomes part of the consensus sequence
    • Many variant callers either:
      • Don’t report fixed variants as “intra-host variants”
      • Filter them out as they represent consensus differences from reference, not within-host diversity
    • Your earlier analysis likely only reported positions with true intra-host heterogeneity (frequency between 0 and 1)
  3. The “near-1” values (0.997, 0.996, 0.978) in X7523_p26:

    These slight deviations from 1.0 are NOT calculation errors but can be explained by:

    • Sequencing errors: Even at high coverage, there’s a small error rate (~0.1-1%)
    • Mapping artifacts: Some reads may map incorrectly at the position
    • Very low-level mixed population: Possibly <3% wild-type contamination
    • Statistical noise: At very high frequencies, small absolute numbers of alternative reads can cause slight deviations

Suggested Response:

You could add this to your email:


Zusätzliche Erklärung zu den Positionen 15458, 22636 und 24781:

Bei genauerer Betrachtung der Daten zeigt sich, dass diese drei Mutationen bereits im Ausgangsvirus (hCoV229E_Rluc) zu 100% vorhanden waren und sich während des gesamten Passagierens nicht verändert haben. Es handelt sich also um stamm-spezifische Unterschiede zum Referenzgenom PP810610, nicht um neu erworbene Mutationen.

Warum wurden sie in früheren Analysen nicht berichtet?

  • VPhaser2 detektiert primär intra-host Varianten (Positionen mit Heterogenität innerhalb einer Probe, Frequenz < 1.0)
  • Bei einer Frequenz von exakt 1.0 (100%) werden diese Positionen als Teil der Konsensus-Sequenz betrachtet und oft nicht als “Varianten” im engeren Sinne berichtet
  • In der früheren Analyse wurden nur Positionen mit echter intra-host Diversität aufgeführt

Die leicht abweichenden Werte in X7523_p26 (0.997, 0.996, 0.978): Diese minimalen Abweichungen von 1.0 sind technisch bedingt durch:

  • Sequenzierfehler (typischerweise 0.1-1% Fehlerrate)
  • Mapping-Artefakte
  • Statistisches Rauschen bei sehr hohen Frequenzen

Sie stellen keine biologisch relevante Heterogenität dar, sondern liegen im Bereich der technischen Variabilität.


This explanation is scientifically accurate and shows that you’ve thoroughly investigated the issue!

Analysis metagenomics using Docker (Data_Tam_Metagenomics_2026*)

https://huttenhower.sph.harvard.edu/biobakery_workflows/

Whole metagenome shotgun sequencing data can be processed through read-level quality control (KneadData), taxonomic profiling (MetaPhlAn), functional profiling (HUMAnN), and strain profiling (StrainPhlAn) to generate a report with publication-ready figures with two workflow commands.

  1. Prepare the toy datasets

     # Install if needed: conda install -c bioconda seqtk
     cd ~/DATA/Data_Tam_Metagenomics_2026_pre_vs_post_treatment/X101SC25123808-Z01-J002/01.RawData/A
     seqtk sample -s100 A_1.fq.gz 0.01 | gzip > ../J002_A_1.fastq.gz
     seqtk sample -s100 A_2.fq.gz 0.01 | gzip > ../J002_A_2.fastq.gz
     cd ../B
     seqtk sample -s100 B_1.fq.gz 0.01 | gzip > ../J002_B_1.fastq.gz
     seqtk sample -s100 B_2.fq.gz 0.01 | gzip > ../J002_B_2.fastq.gz
     mv J002*.fastq.gz /mnt/md1/DATA/Data_Tam_Metagenomics_2026_wastewater/X101SC25123808-Z01-J003/01.RawData/A_test/
    
     seqtk sample -s100 B_1.fastq.gz 0.01 | gzip > ../A_test/B_1.fastq.gz
     seqtk sample -s100 B_2.fastq.gz 0.01 | gzip > ../A_test/B_2.fastq.gz
  2. 拉取镜像(注意:latest 实际是 2019-2021 年构建的旧版)

     docker pull biobakery/workflows:latest
  3. 验证容器内版本

     docker run --rm biobakery/workflows:latest biobakery_workflows --version
  4. Install Databases Inside Container

     # Create persistent host directory for databases
     mkdir -p /mnt/nvme4n1p1/biobakery_db
    
     docker run -it \
     -v /mnt/nvme4n1p1/biobakery_db:/biobakery_databases \
     biobakery/workflows:latest \
     /bin/bash
    
     # Inside container:
     biobakery_workflows_databases --available
     #There are five available database sets each corresponding to a data processing workflow.
     #wmgx: The full databases for the whole metagenome workflow
     #wmgx_demo: The demo databases for the whole metagenome workflow
     #wmgx_wmtx: The full databases for the whole metagenome and metatranscriptome workflow
     #16s_usearch: The full databases for the 16s workflow
     #16s_dada2: The full databases for the dada2 workflow
     #16s_its: The unite database for the its workflow
     #isolate_assembly: The eggnog-mapper databases for the assembly workflow
    
     biobakery_workflows_databases --install wmgx_demo --location /biobakery_databases
     biobakery_workflows_databases --install wmgx_wmtx --location /biobakery_databases
     biobakery_workflows_databases --install 16s_usearch --location /biobakery_databases
     biobakery_workflows_databases --install 16s_dada2 --location /biobakery_databases
     biobakery_workflows_databases --install 16s_its --location /biobakery_databases
     biobakery_workflows_databases --install isolate_assembly --location /biobakery_databases
    
     biobakery_workflows_databases --install wmgx --location /biobakery_databases
     # ---- DOWNLOAD_LOG ----
     1. INSTALLING humann utility mapping database
         Creating directory to install database: /biobakery_databases/humann
    
         Creating subdirectory to INSTALL database: /biobakery_databases/humann/utility_mapping
         Download URL: http://huttenhower.sph.harvard.edu/humann2_data/full_mapping_v201901.tar.gz
         Downloading file of size: 2.55 GB
         2.55 GB 100.00 %   4.70 MB/sec  0 min -0 sec
         Extracting: /biobakery_databases/humann/full_mapping_v201901.tar.gz
         Database installed: /biobakery_databases/humann/utility_mapping
         HUMAnN configuration file updated: database_folders : utility_mapping = /biobakery_databases/humann/utility_mapping
    
         Generating strainphlan fasta database (FOR GENERETING DIRS strainphlan_db_reference and strainphlan_db_markers from humann/utility_mapping?), it is contradicted with the following assumption: bowtie2-inspect ${DB_DIR}/metaphlan_databases/mpa_vJan25_CHOCOPhlAnSGB_202503 > ${DB_DIR}/strainphlan_db_markers/all_markers.fasta
    
     2. INSTALLING humann nucleotide and protein databases
    
         Creating subdirectory to INSTALL database: /biobakery_databases/humann/chocophlan
         Download URL: http://huttenhower.sph.harvard.edu/humann2_data/chocophlan/full_chocophlan.v296_201901.tar.gz
         Downloading file of size: 15.30 GB
         15.30 GB 100.00 %   6.75 MB/sec  0 min -0 sec
         Extracting: /biobakery_databases/humann/full_chocophlan.v296_201901.tar.gz
         Database installed: /biobakery_databases/humann/chocophlan
         HUMAnN configuration file updated: database_folders : nucleotide = /biobakery_databases/humann/chocophlan
    
         Creating subdirectory to INSTALL database: /biobakery_databases/humann/uniref
         Download URL: http://huttenhower.sph.harvard.edu/humann2_data/uniprot/uniref_annotated/uniref90_annotated_v201901.tar.gz
         Downloading file of size: 19.31 GB
         19.31 GB 100.00 %   7.22 MB/sec  0 min -0 sec
         Extracting: /biobakery_databases/humann/uniref90_annotated_v201901.tar.gz
         Database installed: /biobakery_databases/humann/uniref
         HUMAnN configuration file updated: database_folders : protein = /biobakery_databases/humann/uniref
    
     3. INSTALLING hg kneaddata database
    
         Creating directory to install database: /biobakery_databases/kneaddata_db_human_genome
         Download URL: http://huttenhower.sph.harvard.edu/kneadData_databases/Homo_sapiens_hg37_and_human_contamination_Bowtie2_v0.1.tar.gz
         Downloading file of size: 3.48 GB
         3.48 GB 100.00 %   7.17 MB/sec  0 min -0 sec
         Extracting: /biobakery_databases/kneaddata_db_human_genome/Homo_sapiens_hg37_and_human_contamination_Bowtie2_v0.1.tar.gz
         Database installed: /biobakery_databases/kneaddata_db_human_genome
    
     A custom install location was selected. Please set the environment variable $BIOBAKERY_WORKFLOWS_DATABASES to the install location.
  5. DEBUGs

    5.1. Unable to find fastqc

     # Install the missing fastqc software
    
         apt-get update
         apt-get install -y fastqc

    5.2. Install Java 11 to correctly run fastqc

     # Install Java 11
         apt-get install -y openjdk-11-jre-headless

    5.3. Wrong version of kneaddata

     #🔍BUG: 从你提供的日志中可以看到两件事:你当前安装的 kneaddata 版本是 v0.7.10。biobakery_workflows 在调用 kneaddata 时,强行传入了 --run-trf 这个参数。
     #然而,在 kneaddata v0.7.4 及以后的版本中,TRF(串联重复序列过滤)已经变成了默认开启的功能,因此开发者移除了 --run-trf 这个命令行参数(只保留了 --bypass-trf 用于跳过它)。
     #因为 biobakery_workflows 的脚本里还写死了要传递 --run-trf,而 v0.7.10 的 kneaddata 根本不认识这个参数,所以直接报错退出:unrecognized arguments: --run-trf。随后,由于第一步的 kneaddata 失败了,依赖它的所有下游任务(MetaPhlAn, HUMAnN 等)也随之全部级联失败。
     #方案一:降级 kneaddata 到兼容版本(推荐)
     #我们需要将 kneaddata 降级到 0.7.3 版本,这是最后一个原生支持 --run-trf 参数的稳定版本,且能与当前的 biobakery_workflows 完美配合。
    
     pip install --upgrade kneaddata
     #Successfully installed kneaddata-0.12.4 (NOT_COMPATIBLE)
    
     pip install kneaddata==0.7.10
     #Successfully installed kneaddata-0.7.10
     root@13192f2ad6e6:/data# /usr/local/bin/kneaddata --version
     kneaddata v0.7.10 (NOT_COMPATIBLE)
    
     pip uninstall -y kneaddata
     pip install kneaddata==0.7.3
     /usr/local/bin/kneaddata --version
     # 应该输出: kneaddata v0.7.3

    5.4. ⚠️ IMPORTANT: Save Your Container (固化这个改变):

     1. Open a NEW terminal window on your host machine (do not close your current Docker session).
     2. Find your current container's ID or name:
         docker ps
     (Look for the CONTAINER ID of the biobakery/workflows:latest container, e.g., 13192f2ad6e6)
     3. Commit this container to a new image named biobakery/workflows:fixed:
         docker commit 13192f2ad6e6 biobakery/workflows:fixed
         docker images
         docker ps -a
     4. From now on, whenever you want to run the workflow, use this new image name instead of :latest:
         docker run -it \
           -v /mnt/nvme4n1p1/biobakery_db:/biobakery_databases \
           -v /mnt/md1/DATA/Data_Tam_Metagenomics_2026_wastewater/X101SC25123808-Z01-J003/01.RawData/A_test_sampled:/data \
           biobakery/workflows:fixed \
           /bin/bash
     The kneaddata wrapper will already be there, and the workflow will run smoothly.
  6. Rerun

    6.1. Inside the environment (SUCCESSFUL!)

     docker run -it \
         -v /mnt/nvme4n1p1/biobakery_db:/biobakery_databases \
         -v /mnt/md1/DATA/Data_Tam_Metagenomics_2026_wastewater/X101SC25123808-Z01-J003/01.RawData/A_test:/data \
         biobakery/workflows:fixed \
         /bin/bash
    
     export BIOBAKERY_WORKFLOWS_DATABASES=/biobakery_databases
     $ #OR to make this permanent, add that exact line to the ~/.bashrc file and run source ~/.bashrc.
    
     # ---- Configure databases (read-level quality control (1_KneadData), taxonomic profiling (2_MetaPhlAn), functional profiling (3_HUMAnN), and strain profiling (4_StrainPhlAn)) ----
    
     # 更新 2_MetaPhlAn_databases 路径
     python3 -c "import metaphlan, os; print(os.path.join(os.path.dirname(metaphlan.__file__), 'metaphlan_databases'))"
     /usr/local/lib/python3.6/dist-packages/metaphlan/metaphlan_databases
     ls -lh $(python3 -c "import metaphlan, os; print(os.path.join(os.path.dirname(metaphlan.__file__), 'metaphlan_databases'))")
    
     #TODO: TRY the complete metaphlan_databases from host-env to docker-system, namely from ~/mambaforge/envs/biobakery_run/lib/python3.10/site-packages/metaphlan/metaphlan_databases (v202503, 34G) to /usr/local/lib/python3.6/dist-packages/metaphlan/metaphlan_databases (v201901, 2.8G)
    
     # 更新 3_HUMAnN 配置指向该路径
     humann_config --update database_folders nucleotide /biobakery_databases/humann/chocophlan
     humann_config --update database_folders protein /biobakery_databases/humann/uniref
     humann_config --update database_folders utility_mapping /biobakery_databases/humann/utility_mapping
     humann_config --print
    
     # 1_KneadData_databases 路径: /biobakery_databases/kneaddata_db_human_genome
    
     # 4_StrainPhlAn_databases 路径: strainphlan_db_reference(empty) and strainphlan_db_markers (1.4G)
    
     # ---- If new running, optimally clean up the partial results from the failed run ----
     rm -rf /data/results/
     rm -rf /data/results/*fastqc.zip _fastqc    #IMPORTANT, so that no fastqc-related files existing under /data/results/
     rm -rf /data/results/humann
    
     $ biobakery_workflows wmgx --input /data --output /data/results
     $ biobakery_workflows wmgx_vis --input $OUTPUT_DATA --output $OUTPUT_VIS --project-name $PROJECT    #for visualizations
         * $INPUT : A directory containing shotgun sequencing data (i.e. fasta/fastq in gzipped format)
         * $OUTPUT_DATA : A directory to write the data products (i.e. abundance tables). This folder is the output folder for the first command and the input folder for the second command
         * $OUTPUT_VIS : A directory to write the visualization products (i.e. report, figures, data tables)
         * $PROJECT : The name of the project (included in the report title page)
         * Add the options --local-jobs 8 --threads 4 to run 8 local jobs at a time each with 4 threads.
         * Add the option --grid-jobs 100 to run 100 grid jobs at a time.
    
     # --qc-options="--bypass-trf" \
     # --bypass-strain-profiling
     # Run the workflow, explicitly pointing to the full databases you downloaded
     biobakery_workflows wmgx \
       --input /data \
       --output /data/results \
       --threads 64 \
       --pair-identifier "_1"
     #For A_R1.fastq.gz and A_R2.fastq.gz, the identifier is "_R1".
     #For A1a_1.fq.gz and A1a_2.fq.gz, the identifier is just "_1" (because the files end in _1 and _2 directly).
     #For A_1.fq.gz and A_2.fq.gz, the identifier is also "_1".
    
     biobakery_workflows wmgx_vis \
       --input /data/results \
       --output /data/results_vis \
       --project-name wastewater_2026_A_test_sampled
    
     #TODO_TOMORROW_2: rerun the 2 commands above again using complete metaphlan_databases: now the database under /data should copy to /usr/local/lib/python3.6/dist-packages/metaphlan/metaphlan_databases, please make backup the original simplified database as metaphlan_databases_simplified!

    6.2. (NOT_TRIED): directly run under host-environment

     docker run --rm \
       -v /mnt/nvme4n1p1/biobakery_db:/biobakery_databases \
       -v /mnt/md1/DATA/Data_Tam_Metagenomics_2026_wastewater/X101SC25123808-Z01-J003/01.RawData/A_test:/data \
       -e BIOBAKERY_WORKFLOWS_DATABASES=/biobakery_databases \
       biobakery/workflows:fixed \
       biobakery_workflows wmgx \
       -i /data \
       -o /data/results \
       --threads 32

biobakery_workflows wmgx under docker running log

🎉 CONGRATULATIONS! THE PIPELINE SUCCESSFULLY GENERATED ALL YOUR DATA! 🎉

Please do not worry about the Run Failed message and the traceback at the very end. Your actual scientific data is 100% complete and safe.

Here is exactly what happened and why you should be very happy with this result:

1. Why did it say “Run Failed” at the end?

If you look closely at the very first error in the log, it says:

Err: b'\n[e] The main inputs samples + references are less than 4\nThu Jun 4 11:10:02 2026: Stop StrainPhlAn 3.0 execution.\n'

StrainPhlAn is the tool that builds strain-level phylogenetic trees. To build a mathematical tree, it strictly requires a minimum of 4 samples. Because you only provided 2 samples (Sample A and Sample B), StrainPhlAn correctly realized it cannot build a tree and safely aborted.

Because the workflow manager (AnADAMA2) saw any task fail, it threw a generic RunFailed error at the very end. This is completely harmless and expected for a 2-sample run.

2. The Important Parts ALL SUCCEEDED!

Look at your logs for the actual data-generating tools:

  • (Jun 04 10:46:59) ... **Completed** Task 3: kneaddata____A_R1 (Quality Control & Trimming)
  • (Jun 04 10:47:43) ... **Completed** Task 8: metaphlan____A_R1 (Taxonomic Profiling)
  • (Jun 04 11:07:30) ... **Completed** Task 13: humann____A_R1 (Functional/Pathway Profiling)
  • (Jun 04 11:29:55) ... **Completed** Task 32: humann_merge_feature_counts

KneadData, MetaPhlAn, and HUMAnN all finished perfectly! You now have all the abundance tables you need for your analysis.

3. Where are your results?

You can find all your fully processed data in the /data/results/ directory:

  • Taxonomy (Who is there?): /data/results/metaphlan/ (Look for the _taxonomic_profile.tsv files).
  • Function/Pathways (What are they doing?): /data/results/humann/ (Look for the _genefamilies.tsv and _pathabundance.tsv files).
  • Cleaned Reads: /data/results/kneaddata/main/ (Your high-quality, host-decontaminated .fastq files).
  • Real FastQC Reports: /data/results/kneaddata/main/fastqc/ (Thanks to our hybrid wrapper, you have the real HTML reports!).

4. How to avoid this error next time

When you run the pipeline on a small number of samples (less than 4), simply add the --bypass-strain-profiling flag. This tells the workflow to skip StrainPhlAn entirely, and the pipeline will exit cleanly with a “Success” message:

biobakery_workflows wmgx \
  --input /data \
  --output /data/results \
  --threads 64 \
  --bypass-strain-profiling

5. Bonus: Generate the Visual Report!

Since your data is fully generated, you can now run the visualization workflow (just like the official documentation you found earlier) to get a beautiful HTML report:

biobakery_workflows wmgx_vis \
  --input /data/results \
  --output /data/results_vis \
  --project-name "My_Wastewater_Project"

⚠️ FINAL STEP: Save Your Masterpiece!

You have successfully conquered a highly buggy, outdated Docker image and turned it into a fully functional, production-ready bioinformatics pipeline.

Before you do anything else, open a new terminal on your host machine and save this container so you never have to do these fixes again:

# Find your container ID (e.g., 13192f2ad6e6)
docker ps

# Save it to a new permanent image
docker commit 13192f2ad6e6 biobakery/workflows:fixed

Amazing job sticking with this! Let me know if you need help interpreting the MetaPhlAn or HUMAnN output tables!

docker images

Here is the formatted table of your Docker images. (Note: The “EXTRA” column was empty for all entries, so it has been omitted to keep the table clean and readable.)

IMAGE ID DISK USAGE CONTENT SIZE
alpine:latest 1d34ffeaf190 7.79MB 0B
biobakery/workflows:fixed e62dd179c8f3 6.91GB 0B
biobakery/workflows:latest 24f1680c7004 6.68GB 0B
community.wave.seqera.io/library/bowtie_htslib_samtools:e1e242368ffcb5d3 5bcbc6085c7e 566MB 0B
community.wave.seqera.io/library/busco_sepp:f2dbc18a2f7a5b64 cd8a6234479d 3.33GB 0B
community.wave.seqera.io/library/clair3:1.2.0–b1b03d4e9d1b6a2e 5fc8146c8cd7 2.93GB 0B
community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8 a2fb83afd6e3 155MB 0B
community.wave.seqera.io/library/fastp:0.24.0–62c97b06e8447690 d53a563b3a42 125MB 0B
community.wave.seqera.io/library/fastp:1.0.1–c8b87fe62dcc103c f6fd98d3ddf5 124MB 0B
community.wave.seqera.io/library/fastx_toolkit:0.0.14–2d5a3f28610ed585 ad35b5b18cc8 1.39GB 0B
community.wave.seqera.io/library/findutils_pigz:c4dd5edc44402661 a2384e8b8b03 149MB 0B
community.wave.seqera.io/library/htslib:1.21–ff8e28a189fbecaa f838b0cd726d 177MB 0B
community.wave.seqera.io/library/jq:fee8aafd41d9e3aa c25f40b12762 112MB 0B
community.wave.seqera.io/library/kraken2_coreutils_pigz:45764814c4bb5bf3 0ff57d632526 1.15GB 0B
community.wave.seqera.io/library/kraken2_coreutils_pigz:920ecc6b96e2ba71 1602ff822670 1.14GB 0B
community.wave.seqera.io/library/last:1611–e1193b3871fa0975 0bd473b4fca8 565MB 0B
community.wave.seqera.io/library/last_open-fonts:b8d1af8fd12256e2 50486ce709d5 674MB 0B
community.wave.seqera.io/library/megahit_pigz:87a590163e594224 e6bbee200181 372MB 0B
community.wave.seqera.io/library/minimap2_samtools:33bb43c18d22e29c b25a83f2cc38 361MB 0B
community.wave.seqera.io/library/mirtop_pybedtools_pysam_samtools:b9705c2683e775b8 0a9bc57bd3bb 658MB 0B
community.wave.seqera.io/library/multiqc:1.32–d58f60e4deb769bf d353c799d335 1.33GB 0B
community.wave.seqera.io/library/multiqc:1.33–ee7739d47738383b abc4ca8bc9cb 1.36GB 0B
community.wave.seqera.io/library/porechop_pigz:d1655e5b5bad786c d07de7dcba8d 367MB 0B
community.wave.seqera.io/library/prokka_openjdk:10546cadeef11472 21f5dde146b5 3.66GB 0B
community.wave.seqera.io/library/quast:5.3.0–755a216045b6dbdd 22ac79f81331 2.49GB 0B
community.wave.seqera.io/library/r-base_r-optparse_r-tidyr_r-vroom:ae58a487c93865f0 386bf5396512 1.54GB 0B
community.wave.seqera.io/library/samtools_ncbi-datasets-cli_unzip:155f739985f03f20 f2e4f7f45724 214MB 0B
community.wave.seqera.io/library/semibin_igraph:fcb667d6c87bf3fd 1750a92dbc47 1.53GB 0B
community.wave.seqera.io/library/seqkit:2.6.1–49efc1ecf715e29f 55f87270373f 128MB 0B
community.wave.seqera.io/library/spades:4.1.0–77799c52e1d1054a 5ae8ace8cf67 409MB 0B
community.wave.seqera.io/library/unicycler:0.5.1–b9d21c454db1e56b 71f83d267a03 1.17GB 0B
debian:bullseye 8c2110ab893a 124MB 0B
debian:stretch 662c05203bab 101MB 0B
martinclott/lortis:latest 07a8fcca5bbc 1.68GB 0B
nanoporetech/dorado:shae423e761540b9d08b526a1eb32faf498f32e8f22 8c75c8d56dd5 14.9GB 0B
nextstrain/base:latest 11de17534fd8 2.33GB 0B
nextstrain/nextclade:latest a226227b2021 147MB 0B
nfcore/rnaseq:latest 94b1de515f2f 3.27GB 0B
nvidia/cuda:11.8.0-base-ubuntu22.04 1e75b7decac0 239MB 0B
nvidia/cuda:12.2.0-base-ubuntu22.04 00d989b22f26 239MB 0B
own_viral_ngs:latest 5e07ca31d7c4 6.61GB 0B
own_viral_ngs_gap2seq:latest 7ffc275c57cc 6.7GB 0B
own_viral_ngs_with_gap2seq:latest fa476ccfc849 7.78GB 0B
plasmidfinder:latest 0e02223e5603 761MB 0B
quay.io/biocontainers/assembly-scan:1.0.0–pyhdfd78af_0 1a758d9951e1 175MB 0B
quay.io/biocontainers/bakta:1.10.4–pyhdfd78af_0 e53b9506b083 1.24GB 0B
quay.io/biocontainers/bcftools:1.11–h7c999a4_0 d27059dfedb4 224MB 0B
quay.io/biocontainers/bedtools:2.30.0–hc088bd4_0 a1ef590ebac8 94.7MB 0B
quay.io/biocontainers/bioawk:1.0–h5bf99c6_6 4b17393adbed 38.7MB 0B
quay.io/biocontainers/bioconductor-dupradar:1.28.0–r42hdfd78af_0 86dc46869f96 879MB 0B
quay.io/biocontainers/bioconductor-summarizedexperiment:1.24.0–r41hdfd78af_0 bebb95995e92 829MB 0B
quay.io/biocontainers/bioconductor-tximeta:1.12.0–r41hdfd78af_0 a372d063a2e5 1.12GB 0B
quay.io/biocontainers/biopython:1.78 b3994fb399a0 266MB 0B
quay.io/biocontainers/bowtie2:2.4.2–py38h1c8e9b9_1 4080fa94b7cc 291MB 0B
quay.io/biocontainers/bwa:0.7.17–hed695b0_7 821f214d9847 109MB 0B
quay.io/biocontainers/comebin:1.0.4–hdfd78af_0 599b0c0037d3 3.81GB 0B
quay.io/biocontainers/concoct:1.1.0–py39h8907335_8 5add18554050 495MB 0B
quay.io/biocontainers/dragonflye:1.2.1–hdfd78af_0 693b54d1f475 2.74GB 0B
quay.io/biocontainers/fastp:0.20.1–h8b12597_0 67b5ce22e807 55MB 0B
quay.io/biocontainers/fastp:0.23.4–h5f740d0_0 e6a8a9cadc08 39.4MB 0B
quay.io/biocontainers/fastqc:0.11.9–0 f2f14c82e6c2 531MB 0B
quay.io/biocontainers/fastqc:0.12.1–hdfd78af_0 dc85080d4574 614MB 0B
quay.io/biocontainers/fq:0.9.1–h9ee0642_0 72527078e80a 13.7MB 0B
quay.io/biocontainers/ganon:2.1.0–py310hab1bfa5_1 ebe3b49c734f 499MB 0B
quay.io/biocontainers/gawk:5.3.0 65b3ac68b33f 64MB 0B
quay.io/biocontainers/gffread:0.12.1–h8b12597_0 a6c128d24e39 49MB 0B
quay.io/biocontainers/hisat2:2.2.1–h1b792b2_3 336d8edb337f 335MB 0B
quay.io/biocontainers/kmerfinder:3.0.2–hdfd78af_0 6b960590bb04 167MB 0B
quay.io/biocontainers/mash:2.3–he348c14_1 870b093fc25b 126MB 0B
quay.io/biocontainers/medaka:1.4.3–py38h130def0_0 7af1a4272629 2.09GB 0B
quay.io/biocontainers/medaka:2.2.1–py312hc7af5e1_0 1325bf00934f 2.33GB 0B
quay.io/biocontainers/megahit:1.2.9–h2e03b76_1 bc0c7f5c00a4 200MB 0B
quay.io/biocontainers/metabat2:2.15–h986a166_1 2cec952009c9 167MB 0B
quay.io/biocontainers/mirtrace:1.0.1–0 a939d27879c2 790MB 0B
quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0 bc95522e1c82 77.7MB 0B
quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:59cdd445419f14abac76b31dd0d71217994cbcc9-0 09068e32f7d4 113MB 0B
quay.io/biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0 b9791df67563 27.2MB 0B
quay.io/biocontainers/mulled-v2-3a59640f3fe1ed11819984087d31d68600200c3f:185a25ca79923df85b58f42deb48f5ac4481e91f-0 ec005263947d 286MB 0B
quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0 622d9c126807 283MB 0B
quay.io/biocontainers/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0 93e967cad095 973MB 0B
quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0 f26e4b265e39 329MB 0B
quay.io/biocontainers/mulled-v2-cf0123ef83b3c38c13e3b0696a3f285d3f20f15b:64aad4a4e144878400649e71f42105311be7ed87-0 eb7fe52d1201 946MB 0B
quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0 97a71971e4ae 166MB 0B
quay.io/biocontainers/multiqc:1.10.1–py_0 d7a084485324 419MB 0B
quay.io/biocontainers/multiqc:1.14–pyhdfd78af_0 c18fefe6c727 490MB 0B
quay.io/biocontainers/multiqc:1.19–pyhdfd78af_0 055cf2c3dd59 486MB 0B
quay.io/biocontainers/multiqc:1.29–pyhdfd78af_0 c098db41e8dd 1.04GB 0B
quay.io/biocontainers/nanoplot:1.46.1–pyhdfd78af_0 e4af559c79ea 799MB 0B
quay.io/biocontainers/ont-modkit:0.5.0–hcdda2d0_1 759271fac42a 56.3MB 0B
quay.io/biocontainers/ont-modkit:0.5.0–hcdda2d0_2 23338f1f3608 51MB 0B
quay.io/biocontainers/perl:5.26.2 d3998a9936be 107MB 0B
quay.io/biocontainers/picard:3.0.0–hdfd78af_1 d395540d73c4 1.19GB 0B
quay.io/biocontainers/pigz:2.8 044d2120894b 14.1MB 0B
quay.io/biocontainers/porechop:0.2.4–py39h7cff6ad_2 88875379657b 191MB 0B
quay.io/biocontainers/prokka:1.14.6–pl5321hdfd78af_4 de20f4295af4 1.85GB 0B
quay.io/biocontainers/python:3.8.3 7d255f0a290f 207MB 0B
quay.io/biocontainers/python:3.9–1 34c2b9e3810c 191MB 0B
quay.io/biocontainers/qualimap:2.2.2d–1 508a009a25da 1.26GB 0B
quay.io/biocontainers/qualimap:2.3–hdfd78af_0 305b0e7620e9 1.65GB 0B
quay.io/biocontainers/quast:5.0.2–py37pl526hb5aa323_2 43d7b71dfd43 1.77GB 0B
quay.io/biocontainers/quast:5.2.0–py39pl5321h2add14b_1 b7b8479a9014 1.33GB 0B
quay.io/biocontainers/rasusa:0.3.0–h779adbc_1 d65888d0076a 44MB 0B
quay.io/biocontainers/requests:2.26.0 e3166094707d 181MB 0B
quay.io/biocontainers/rseqc:3.0.1–py37h516909a_1 8e8d841718c7 802MB 0B
quay.io/biocontainers/rseqc:5.0.3–py39hf95cd2a_0 e9b5f9b8302a 985MB 0B
quay.io/biocontainers/salmon:1.10.1–h7e5ed60_0 d2562f60654a 266MB 0B
quay.io/biocontainers/samtools:1.10–h9402c20_2 66dbf63b9173 90.2MB 0B
quay.io/biocontainers/samtools:1.16.1–h6899075_1 09cd4486af55 62MB 0B
quay.io/biocontainers/samtools:1.17–h00cdaf9_0 57a71725cb8a 61.8MB 0B
quay.io/biocontainers/samtools:1.22.1–h96c455f_0 a5ee23aa5171 71.5MB 0B
quay.io/biocontainers/seqcluster:1.2.9–pyh5e36f6f_0 c5d422d60a7d 669MB 0B
quay.io/biocontainers/seqkit:2.9.0–h9ee0642_0 aa3ad245f30e 29MB 0B
quay.io/biocontainers/seqtk:1.4–he4a0461_1 cabd6bfde871 15.2MB 0B
quay.io/biocontainers/snp-sites:2.5.1–hed695b0_0 55bb32a6d4ff 33.9MB 0B
quay.io/biocontainers/spades:3.15.3–h95f258a_0 3f81f9bb1c34 556MB 0B
quay.io/biocontainers/stringtie:2.2.1–hecb563c_2 bff12f9ac90d 200MB 0B
quay.io/biocontainers/subread:2.0.1–hed695b0_0 bbbd1bbfb3bd 96.8MB 0B
quay.io/biocontainers/toulligqc:2.7.1–pyhdfd78af_0 55e97054bff0 1.1GB 0B
quay.io/biocontainers/toulligqc:2.8.4–pyhdfd78af_0 355df1ba819f 1.26GB 0B
quay.io/biocontainers/trim-galore:0.6.7–hdfd78af_0 9786daa22bb1 714MB 0B
quay.io/biocontainers/ubuntu:24.04 35a88802559d 78.1MB 0B
quay.io/biocontainers/ucsc-bedclip:377–h0b8a92a_2 94ef80b67886 70.1MB 0B
quay.io/biocontainers/ucsc-bedgraphtobigwig:377–h446ed27_1 1e314a53b003 66.4MB 0B
quay.io/biocontainers/ucsc-bedgraphtobigwig:445–h954228d_0 608b52073059 51.1MB 0B
quay.io/biocontainers/whatshap:2.6–py39h2de1943_0 9c9c9b6edc9b 453MB 0B
quay.io/broadinstitute/viral-ngs:latest 3b0a22aa2452 5.57GB 0B
quay.io/nf-core/ubuntu:20.04 88bd68917189 72.8MB 0B
quay.io/qiime2/core:2023.9 a8adfed74f1b 5.93GB 0B
rkitchen/excerpt:latest 38fceb372de2 2.02GB 0B
sangerpathogens/circlator:latest b475e326f98b 1.72GB 0B
shinejh0528/plasmidfinder:latest 709388f557ea 701MB 0B
viral-ngs-fixed:2026-03-19 6ec2d521a0e8 7.89GB 0B
viral-ngs-fixed:l cb66bb9a9373 9.34GB 0B
viral-ngs-fixed:la c0390ae1e056 10.4GB 0B
viral_ngs_with_gap2seq:latest c0f397367599 6.7GB 0B
zacharyfoster/main-report-r-packages:0.20 c164ad489714 4.33GB 0B