- cat longreads
-
Run unicycler
conda activate /home/jhuang/miniconda3/envs/trycycler unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/merged_An6_longreads.fastq.gz --mode normal -t 100 -o An6_unicycler_normal unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/merged_BG5_longreads.fastq.gz --mode normal -t 100 -o BG5_unicycler_normal unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/merged_An6_longreads.fastq.gz --mode conservative -t 100 -o An6_unicycler_conservative unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/merged_BG5_longreads.fastq.gz --mode conservative -t 100 -o BG5_unicycler_conservative unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/An6/An6_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/merged_An6_longreads.fastq.gz --mode bold -t 100 -o An6_unicycler_bold unicycler -1 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_1.clean.rd.fq.gz -2 /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J002/clean_data/BG5/BG5_L1_2.clean.rd.fq.gz -l /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/merged_BG5_longreads.fastq.gz --mode bold -t 100 -o BG5_unicycler_bold -
Run nextflow bacass
conda deactivate # Downlod k2_standard_08_GB_20251015.tar.gz from https://benlangmead.github.io/aws-indexes/k2#kraken2--bracken # Download 20190108_kmerfinder_stable_dirs.tar.gz from https://zenodo.org/records/13447056; 'tar xzf 20190108_kmerfinder_stable_dirs.tar.gz' #The database does not work! # Download the kmerfinder database: https://www.genomicepidemiology.org/services/ --> https://cge.food.dtu.dk/services/KmerFinder/ --> https://cge.food.dtu.dk/services/KmerFinder/etc/kmerfinder_db.tar.gz #The database works! # DEBUG: --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ not working! nextflow run nf-core/bacass -r 2.6.0 -profile docker --help nextflow run nf-core/bacass -r 2.6.0 -profile docker \ --input samplesheet_bacass.tsv \ --outdir bacass_out \ --assembly_type hybrid \ --assembler unicycler,dragonflye \ --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \ --skip_kmerfinder \ -resume \ -work-dir bacass_out/work #We can skip kmerfinder since kraken2 do similar function: 功能,KmerFinder,Kraken2,您的需求 物种鉴定,✅ 基于 k-mer 频率,✅ 基于 k-mer 分类,二者选一即可 污染筛查,✅ 可检测外源序列,✅ 可检测外源序列,✅ 有 Kraken2 足够 数据库大小,~32 GB,~8 GB,Kraken2 更轻量 运行速度,较慢,较快,Kraken2 更快 细菌特异性,✅ 针对细菌优化,✅ 支持细菌,二者均可 # Using Polished genomes for downstream analyses jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka$ grep ">" An6-unicycler-medaka_polished_genome.fa jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka$ grep ">" BG5-unicycler-medaka_polished_genome.fa -
Species Identification: 快速筛查用 Mash → 精确分类用 GTDB-Tk → 种级验证用 FastANI,三者结合可最大限度提高物种鉴定的准确性和可解释性。
🔬 主流物种鉴定工具对比
| 工具 | 适用对象 | 核心原理 | 优势 | 局限性 |
|---|---|---|---|---|
| GTDB-Tk(CHOSEN) [[12]] | 细菌/古菌 | 基于120+个单拷贝标记基因的系统发育分析 | 分类标准客观(基于GTDB数据库),分辨率高,适合新物种发现 | 仅支持原核生物,计算量较大 |
| Mash / Mash Screen [[35]] | 所有生物 | MinHash算法快速估算基因组距离(ANI近似) | 速度极快(秒级),可筛查污染,支持参考库自定义 | 分辨率依赖参考库完整性,对远缘物种区分有限 |
| Kraken2 [[25]] | 所有生物 | k-mer + LCA(最低共同祖先)分类 | 速度快,支持自定义数据库,可处理混合样本 | 内存需求高(标准库~30GB),假阳性需置信度过滤 |
| FastANI [[36]] | 细菌/古菌 | 全基因组平均核苷酸一致性(ANI)计算 | 金标准方法,95-96% ANI ≈ 同种,结果可解释性强 | 需两两比对,大规模筛查较慢 |
| NCBI BLAST+ 16S/全基因组 [[2]] | 所有生物 | 序列相似性比对 | 数据库最全,结果直观,适合初步筛查 | 16S分辨率有限(种内难区分),全基因组BLAST慢 |
# 1. 创建环境(推荐 mamba)
mamba create -n gtdbtk -c conda-forge -c bioconda gtdbtk
mamba activate gtdbtk
# 2. 下载数据库(仅需首次,约 60GB)
gtdbtk download --data_dir ./gtdb_data --release 220
wget https://data.gtdb.aau.ecogenomic.org/releases/release232/232.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r232_data.tar.g
mamba env config vars set GTDBTK_DATA_PATH="/mnt/nvme4n1p1/gtdb_data/release232"
# 先退出当前环境,再重新激活
mamba deactivate
mamba activate gtdbtk
# 验证环境变量是否加载成功
echo $GTDBTK_DATA_PATH
# 应输出:/mnt/nvme4n1p1/gtdb_data/release232
# 3. 运行分类(你提供的命令 + 实用参数)
gtdbtk classify_wf \
--genome_dir ./bacass_out/Medaka \
--out_dir gtdb_out \
--cpus 64 \
--extension .fa \
--prefix mygenome
# 4. 查看结果
cat gtdb_out/classify/mygenome.bac120.summary.tsv # 细菌结果
#For An6:
#closest_genome_reference closest_genome_reference_radius closest_genome_taxonomy closest_genome_ani closest_genome_af
#GCF_000816495.1 95 d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__Acinetobacter harbinensis 97.41 0.892
#For BG5:
#closest_genome_reference closest_genome_reference_radius closest_genome_taxonomy closest_genome_ani closest_genome_af
#GCF_040547305.1 95 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Sphingobacteriales;f__Sphingobacteriaceae;g__Pedobacter;s__Pedobacter sp040547305 98.45 0.894
#other_related_references(genome_id,species_name,radius,ANI,AF)
#GCF_040822065.1, s__Pedobacter sp040822065, 95.0, 89.17, 0.499;
#GCF_014200595.1, s__Pedobacter cryoconitis_C, 95.0, 89.48, 0.577;
#GCF_003259615.1, s__Pedobacter cryoconitis, 95.0, 89.22, 0.556;
#GCF_040026395.1, s__Pedobacter lusitanus, 95.0, 83.93, 0.167;
#GCA_014302835.1, s__Pedobacter sp014302835, 95.0, 84.24, 0.161;
#GCF_014200605.1, s__Pedobacter cryoconitis_B, 95.0, 84.47, 0.194;
#GCF_001590605.1, s__Pedobacter cryoconitis_A, 95.0, 91.97, 0.663
-
Using https://www.bv-brc.org/app/ComprehensiveGenomeAnalysis to annotate the genome using scaffolded results from bacass. ComprehensiveGenomeAnalysis provides comprehensive overview of the data.
-
Antimicrobial resistance gene profiling and Resistome and Virulence Profiling with Abricate and RGI (Reisistance Gene Identifier)
# Table 4. Specialty Genes # Source Genes # NDARO 1 # Antibiotic Resistance CARD 15 # Antibiotic Resistance PATRIC 55 # Drug Target TTD 38 # Metal Resistance BacMet 29 # Transporter TCDB 250 # Virulance factor VFDB 33 # https://www.genomicepidemiology.org/services/ # https://genepi.dk/ conda activate /home/jhuang/miniconda3/envs/bengal3_ac3 abricate --list #DATABASE SEQUENCES DBTYPE DATE #vfdb 2597 nucl 2025-Oct-22 #resfinder 3077 nucl 2025-Oct-22 #argannot 2223 nucl 2025-Oct-22 #ecoh 597 nucl 2025-Oct-22 #megares 6635 nucl 2025-Oct-22 #card 2631 nucl 2025-Oct-22 #ecoli_vf 2701 nucl 2025-Oct-22 #plasmidfinder 460 nucl 2025-Oct-22 #ncbi 5386 nucl 2025-Oct-22 abricate-get_db --list #Choices: argannot bacmet2 card ecoh ecoli_vf megares ncbi plasmidfinder resfinder vfdb victors (default ''). # CARD abricate-get_db --db card # MEGARes (automatically install, if error try MANUAL install as below) abricate-get_db --db megares # MANUAL install wget -O megares_database_v3.00.fasta \ "https://www.meglab.org/downloads/megares_v3.00/megares_database_v3.00.fasta" #wget -O megares_drugs_database_v3.00.fasta \ "https://www.meglab.org/downloads/megares_v3.00/megares_drugs_database_v3.00.fasta" # 1) Define dbdir (adjust to your env; from your logs it's inside the conda env) DBDIR=/home/jhuang/miniconda3/envs/bengal3_ac3/db # 2) Create a custom db folder for MEGARes v3.0 mkdir -p ${DBDIR}/megares_v3.0 # 3) Copy the downloaded MEGARes v3.0 nucleotide FASTA to 'sequences' cp megares_database_v3.00.fasta ${DBDIR}/megares_v3.0/sequences # 4) Build ABRicate indices abricate --setupdb #abricate-get_db --setupdb abricate --list | egrep 'card|megares' abricate --list | grep -i megares conda deactivate chmod +x run_abricate_resistome_virulome_one_per_gene.sh ENV_NAME=/home/jhuang/miniconda3/envs/bengal3_ac3 \ ASM=BG5-unicycler-medaka_polished_genome.fa \ SAMPLE=BG5 \ OUTDIR=resistome_virulence_BG5 \ MINID=70 MINCOV=50 \ THREADS=32 \ ./run_abricate_resistome_virulome_one_per_gene.sh #ABRicate thresholds: MINID=70 MINCOV=50 Database Hit_lines File MEGARes 5 resistome_virulence_An6/raw/An6.megares.tab CARD 5 resistome_virulence_An6/raw/An6.card.tab ResFinder 0 resistome_virulence_An6/raw/An6.resfinder.tab VFDB 4 resistome_virulence_An6/raw/An6.vfdb.tab Database Hit_lines File MEGARes 3 resistome_virulence_BG5/raw/BG5.megares.tab CARD 2 resistome_virulence_BG5/raw/BG5.card.tab ResFinder 1 resistome_virulence_BG5/raw/BG5.resfinder.tab VFDB 0 resistome_virulence_BG5/raw/BG5.vfdb.tab #ABRicate thresholds: MINID=80 MINCOV=60 Database Hit_lines File MEGARes 0 resistome_virulence_An6/raw/An6.megares.tab CARD 0 resistome_virulence_An6/raw/An6.card.tab ResFinder 0 resistome_virulence_An6/raw/An6.resfinder.tab VFDB 0 resistome_virulence_An6/raw/An6.vfdb.tab conda activate /home/jhuang/miniconda3/envs/bengal3_ac3 #Adapted the parameters in merge_amr_sources_by_gene.py OUTDIR = Path("resistome_virulence_An6") or "resistome_virulence_BG5" SAMPLE = "An6" or "BG5" python merge_amr_sources_by_gene.py python export_resistome_virulence_to_excel_py36.py \ --workdir resistome_virulence_BG5 \ --sample BG5 \ --out Resistome_Virulence_BG5.xlsx # Delete the column 'COVERAGE_MAP' in all 'Raw_*' sheets Methods sentence (AMR + virulence) AMR genes were identified by screening the genome assembly with ABRicate against the MEGARes and ResFinder databases, using minimum identity and coverage thresholds of X% and Y%, respectively. CARD-based AMR determinants were additionally predicted using RGI (Resistance Gene Identifier) to leverage curated resistance models. Virulence factors were screened using ABRicate against VFDB under the same thresholds. Replace X/Y with your actual values (e.g., 90/60) or state “default parameters” if you truly used defaults. Table 2 caption (AMR) Table 2. AMR gene profiling of the genome assembly. Hits were detected using ABRicate (MEGARes and ResFinder) and RGI (CARD). The presence of AMR-associated genes does not necessarily imply phenotypic resistance, which may depend on allele type, genomic context/expression, and/or SNP-mediated mechanisms; accordingly, phenotype predictions (e.g., ResFinder) should be interpreted cautiously. Table 3 caption (virulence) Table 3. Virulence factor profiling of the genome assembly based on ABRicate screening against VFDB, reporting loci with sequence identity and coverage above the specified thresholds.