Daily Archives: 2026年5月29日

dorado + nf-core/methylong + modkit pileup + interpretation using own scripts processing Data_Tam_DNAseq_2026_An6_BG5 methylome data

dna_methylation_analysis_4mc_5mc.sh dna_methylation_analysis_6mA.sh annotate_motifs_rebase.py generate_methylation_excel.py

  1. Run nextflow methylong after running nextflow bacass

     ✅ ---- 终极解决方案:使用本地 Dorado 预生成 modBAM + --skip_basecalling (绕过容器 CUDA 限制)----
    
     鉴于容器镜像的复杂性和 CUDA 版本限制,最可靠、最快速的路径是:
     使用您已安装的本地 Dorado v2.0.0 生成已比对的 modBAM
     运行 methylong 时添加 --skip_basecalling,跳过内部容器化 Dorado
     这样完全绕过容器、CUDA、GPU 传递等所有问题!
    
     步骤 1:如果您已安装本地 Dorado(非容器版),可预先生成 modBAM:
    
     # 安装本地 Dorado (CPU 或匹配您 CUDA 的版本)
     #    下载: https://github.com/nanoporetech/dorado/releases; https://software-docs.nanoporetech.com/dorado/latest/
     curl "https://cdn.oxfordnanoportal.com/software/analysis/dorado-2.0.0-linux-x64.tar.gz" -o dorado-2.0.0-linux-x64.tar.gz
     tar -xzf dorado-2.0.0-linux-x64.tar.gz
    
     # 添加到 PATH (永久生效)
     echo 'export PATH="/home/jhuang/Tools/dorado-2.0.0-linux-x64/bin:$PATH"' >> ~/.bashrc
     source ~/.bashrc
    
     # 验证安装: 2.0.0+20e87c8b
     dorado --version
    
     # 查看可用的碱基调用模型
     Positional arguments:
       model                             Model selection {fast,hac,sup}@v{version} for automatic model selection including modbases, or path to existing model directory.
       data                              The data directory or POD5 file path.
    
     # 查看可用的修饰检测模型
     dorado basecaller --help | grep -A5 "modified-bases"
     # ✅ 应看到: 6mA, 4mC_5mC, 5mC_5hmC 等
    
     步骤 2: 生成 modBAM
     ./generate_mapped_modbam.sh
    
         #!/bin/bash
         #===============================================================================
         # generate_mapped_modbam.sh - 本地 Dorado 生成已比对 modBAM
         #===============================================================================
         set -euo pipefail
    
         # === 配置 ===
         DORADO="/home/jhuang/Tools/dorado-2.0.0-linux-x64/bin/dorado"  # 您的本地 Dorado 路径
         REF_AN6="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa"
         REF_BG5="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa"
         POD5_AN6="./X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/An6_pod5_pass"
         POD5_BG5="./X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/BG5_pod5_pass"
         OUTDIR="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5"
    
         # 确保参考基因组已索引
         samtools faidx "${REF_AN6}"
         samtools faidx "${REF_BG5}"
    
         echo "🚀 生成 An6 6mA modBAM (已比对)..."
         "${DORADO}" basecaller \
           --modified-bases 6mA \
           --emit-moves \
           --device cuda:0 \
           --reference "${REF_AN6}" \
           dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \
           "${POD5_AN6}" | samtools view -b - > "${OUTDIR}/An6_6mA_mapped.mod.bam"
    
         echo "🚀 生成 BG5 6mA modBAM (已比对)..."
         "${DORADO}" basecaller \
           --modified-bases 6mA \
           --emit-moves \
           --device cuda:0 \
           --reference "${REF_BG5}" \
           dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \
           "${POD5_BG5}" | samtools view -b - > "${OUTDIR}/BG5_6mA_mapped.mod.bam"
    
         echo "🚀 生成 An6 6mA modBAM (已比对)..."
         "${DORADO}" basecaller \
           --modified-bases 4mC_5mC \
           --emit-moves \
           --device cuda:0 \
           --reference "${REF_AN6}" \
           dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \
           "${POD5_AN6}" | samtools view -b - > "${OUTDIR}/An6_4mC_5mC_mapped.mod.bam"
    
         echo "🚀 生成 BG5 6mA modBAM (已比对)..."
         "${DORADO}" basecaller \
           --modified-bases 4mC_5mC \
           --emit-moves \
           --device cuda:0 \
           --reference "${REF_BG5}" \
           dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \
           "${POD5_BG5}" | samtools view -b - > "${OUTDIR}/BG5_4mC_5mC_mapped.mod.bam"
    
         # 验证输出
         echo "🔍 验证 BAM 文件..."
         for BAM in "${OUTDIR}"/*_6mA_mapped.mod.bam; do
           if samtools quickcheck "${BAM}" 2>/dev/null; then
             READS=$(samtools view -c "${BAM}")
             TAGS=$(samtools view "${BAM}" | head -50 | grep -o "MM:Z:[^ ]*" | sort -u | tr '\n' ' ')
             echo "✅ $(basename "${BAM}"): ${READS} reads, tags: ${TAGS}"
           else
             echo "❌ $(basename "${BAM}"): 文件损坏或格式错误"
           fi
         done
    
         echo "🎉 modBAM 生成完成!"
    
     步骤 3: 创建 methylong samplesheet(使用预生成 modBAM)
     See samplesheet_6mA.csv and samplesheet_4mC_5mC.csv
    
     步骤 4: 运行 methylong with --skip_basecalling
    
         ## Check if BAMs are read-name sorted:
         #samtools view -H An6_6mA_mapped.mod.bam | grep "^@HD"
         ## Should show: @HD VN:1.6 SO:queryname
    
         ## If not sorted correctly:
         #samtools sort -n -o An6.sorted_by_name.bam An6_6mA_mapped.mod.bam
         #samtools sort -n -o BG5.sorted_by_name.bam BG5_6mA_mapped.mod.bam
    
         ## Check BAM integrity:
         #samtools quickcheck -v An6_6mA_mapped.mod.bam
         #samtools quickcheck -v An6_after_trim.bam
    
         ## Use modkit to check tags:
         #modkit modbam check-tags An6_6mA_mapped.mod.bam
         #modkit modbam check-tags An6_after_trim.bam
    
         #BUG: '--skip_basecalling' does not exist (see the complete option list below.)
         nextflow run nf-core/methylong \
             -r 2.0.0 \
             -profile docker \
             --input samplesheet_6mA.csv \
             --outdir methylome_out_6mA \
             --skip_basecalling \          # ⭐ 关键: 跳过内部 Dorado,使用预生成 modBAM
             -resume \
             -work-dir methylome_out_6mA/work
    
         # Ist die BAM-Datei unaligniert? (keine @SQ Header mit Alignment)
         samtools view -H An6_6mA_mapped.mod.bam | head -10
    
         # Sind MM/ML-Tags für 6mA vorhanden?
         samtools view An6_6mA_mapped.mod.bam | head -5 | grep -o "MM:Z:[^ ]*"
    
         #NEXT_WEEK FOLLOWING the command under the website
         #https://bioinformatics.cc/debug-methylomic-analysis-of-data_tam_dnaseq_2026_an6_bg5/
    
         # FIRSTLY RUNNING NEXTFLOW
         # 1️⃣ 处理 6mA BAM(仅含 6mA 信号)
         nextflow run nf-core/methylong -r 2.0.0 -profile docker \
             --input samplesheet_6mA.csv \
             --outdir methylome_out_6mA \
             --ont_aligner minimap2 \
             --m6a \
             --skip_snvs \
             -resume \
             -work-dir methylome_out_6mA/work
         # 2️⃣ 处理 4mC_5mC BAM(含 4mC + 5mC 信号)
         nextflow run nf-core/methylong -r 2.0.0 -profile docker \
             --input samplesheet_4mC_5mC.csv \
             --outdir methylome_out_4mC_5mC \
             --ont_aligner minimap2 \
             --all_contexts \
             --skip_snvs \
             -resume \
             -work-dir methylome_out_4mC_5mC/work
    
     步骤 5: Check output
     [ ] ✅ 本地 Dorado 生成已比对 modBAM: samtools quickcheck An6_6mA_mapped.mod.bam 通过
     [ ] ✅ BAM 含 @SQ 头: samtools view -H An6_6mA_mapped.mod.bam | grep "^@SQ"
     [ ] ✅ BAM 含修饰标签: samtools view An6_6mA_mapped.mod.bam | grep "MM:Z:6mA" | head -3 returns nothing, but samtools view An6_6mA_mapped.mod.bam | grep "MM:Z:" | head -3
     [ ] ✅ samplesheet 使用绝对路径 + 精确 sample 名称
     [ ] ✅ 运行命令添加 --skip_basecalling
     [ ] ✅ 输出目录存在: methylome_out_6mA/methylation_calls/*.bedMethyl.gz
  2. SECONDLY RERUN modkit pileup to correct the BUG from nf-core/methylong

     # ---- 6mA 对应过滤命令 ----
     # 方案 A:降低阈值测试 (NOT_USED)
     # docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam An6_6mA_test.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif A 0 --filter-threshold A:0.7 --threads 8
    
     方案 B:6mA: 无过滤 + 手动后处理 (CHOSEN)
     # ✅ 第 1 步:生成无过滤的原始结果(6mA)
     docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam An6_6mA_raw.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif A 0 --no-filtering --threads 8
     docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/BG5/alignment/BG5_aligned.bam BG5_6mA_raw.bed --ref bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa --motif A 0 --no-filtering --threads 8
    
     # # ✅ 第 2 步:检查原始概率最大值
     # awk '{print $11}' An6_6mA_raw.bed | sort -rn | head -5
     # # 如果最大值 < 70 → 降低分析阈值
    
     # # ✅ 第 3 步:手动过滤生成最终结果
     # # ✅ 过滤高置信度 6mA 位点:概率≥70% + 覆盖度≥10
     # awk '$4 ~ /^A,/ && $11 >= 70 && $5 >= 10' An6_6mA_raw.bed > An6_6mA_final.bed
    
     # # ✅ 第 4 步:验证并统计
     # echo "=== 6mA 最终结果 ==="
     # # 📊 统计结果
     # echo "=== 6mA 高置信度位点统计 ==="
     # echo "总位点数: $(wc -l < An6_6mA_final.bed)"
     # echo ""
     # echo "概率分布 (前10名):"
     # awk '{print $11}' An6_6mA_final.bed | sort -rn | uniq -c | head -10
     # echo ""
     # echo "按染色体分布:"
     # cut -f1 An6_6mA_final.bed | sort | uniq -c | sort -rn
    
     # 方案 C:检查 BAM 信号原始分布 (For DEBUG)
     # # 提取 ML 标签中的概率值 (0-255 缩放 → 转换为 0-1)
     # samtools view ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam | head -100 | \
     #   grep -oP 'ML:B:C,\K[0-9,]+' | tr ',' '\n' | \
     #   awk '{if($1>0) print $1/255}' | sort -n | uniq -c | head -10
    
     # ---- 4mC_5mC 对应过滤命令 ----
     docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_4mC_5mC/ont/An6/alignment/An6_aligned.bam An6_4mC_5mC_raw.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif CG 0 --motif C 0 --no-filtering --threads 8
     docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_4mC_5mC/ont/BG5/alignment/BG5_aligned.bam BG5_4mC_5mC_raw.bed --ref bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa --motif CG 0 --motif C 0 --no-filtering --threads 8
    
     #修饰类型   列4匹配正则  说明
     #5mC @ CG   ^m,CG,  5-甲基胞嘧啶,在CG motif上
     #5mC @ C    ^m,C,   5-甲基胞嘧啶,在单碱基C上
     #4mC @ CG   ^21839,CG,  4-甲基胞嘧啶,在CG motif上
     #4mC @ C    ^21839,C,   4-甲基胞嘧啶,在单碱基C上
    
     ┌─────────────────────────────────────────────────────┐
     │  Column 4 格式:[修饰代码],[motif],[offset]         │
     ├─────────────────────────────────────────────────────┤
     │  m,CG,0      → 5mC @ CG 位点                        │
     │  m,C,0       → 5mC @ 任意 C 位点(包含 CG+非CG)    │
     │  21839,CG,0  → 4mC @ CG 位点                        │
     │  21839,C,0   → 4mC @ 任意 C 位点(包含 CG+非CG)    │
     └─────────────────────────────────────────────────────┘
     ⚠️ 重要提示:m,C,0 包含 所有 C 位点(CG + CHG + CHH),与 m,CG,0 有重叠!
     优先级 1: 4mC @ CG (21839,CG,0)  ← 细菌最主要的修饰
     优先级 2: 5mC @ CG (m,CG,0)      ← 次要修饰
     优先级 3: 非 CG 修饰             ← 探索性分析
     重点关注:
     ✅ 4mC @ CG - 细菌限制修饰系统的核心
     ✅ 5mC @ CG - 部分细菌也有 5mC
     ⚠️ 非 CG 修饰 - 细菌中较少,可先忽略
    
     修饰类型    位点数量    占比  生物学解释
     5mC @ CG    437 33% 经典 CpG 甲基化
     5mC @ non-CG    870 67% ⭐ 非 CG 位点更多,正常!
     4mC @ CG    7   17% 4mC 很少在 CG 上
     4mC @ non-CG    34  83% ⭐ 4mC 偏好非 CG 位点
  3. DNA methylation analysis (Sample An6: Acinetobacter harbinensis and Sample BG5: Pedobacter sp. strain BG5, both are bacteria, not plants.)

     mamba create -n methylation -c conda-forge -c bioconda \
       r-base=4.3 \
       r-dss \
       r-methylkit \
       r-chipseeker \
       r-qvalue \
       metilene \
       bedtools \
       samtools \
       ucsc-bedgraphtobigwig \
       -y
     mamba activate methylation
    
     #chmod +x dna_methylation_analysis_4mC_5mC.sh
     (methylation) bash dna_methylation_analysis_4mC_5mC.sh \
         -i An6_4mC_5mC_raw.bed \
         -r bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa \
         -o An6_4mC_5mC_methylation_results \
         -c 10 -m 70 -n 50 -f 50 -t 100
    
     #Using homerMotifs.all.motifs rather than nonRedundant.motifs
     (methylation) python annotate_motifs_rebase.py \
         --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/5mC_CG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output An6_5mC_CG_results.tsv
     #  Total motifs: 35
     #  REBASE matches: 8 (22.9%)
     #  ⚠️ Methylase matches: 2
    
     python annotate_motifs_rebase.py \
         --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/4mC_CG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output An6_4mC_CG_results.tsv
     #  Total motifs: Total motifs: 35
     # REBASE matches: 6 (17.1%)
     #⚠️ Methylase matches: 0
    
     python annotate_motifs_rebase.py \
         --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/5mC_nonCG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output An6_5mC_nonCG_results.tsv
     #📊 Summary:
     #Total motifs: 37
     #REBASE matches: 4 (10.8%)
     #⚠️ Methylase matches: 0
    
     python annotate_motifs_rebase.py \
         --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/4mC_nonCG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output An6_4mC_nonCG_results.tsv
     #    Total motifs: 35
     #REBASE matches: 10 (28.6%)
     #⚠️ Methylase matches: 2
    
     bash dna_methylation_analysis_4mC_5mC.sh \
         -i BG5_4mC_5mC_raw.bed \
         -r bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa \
         -o BG5_4mC_5mC_methylation_results \
         -c 10 -m 70 -n 50 -f 50 -t 100
    
     python annotate_motifs_rebase.py \
         --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/5mC_CG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output BG5_5mC_CG_results.tsv
     #  📊 Summary:
     #Total motifs: 36
     #REBASE matches: 7 (19.4%)
     #⚠️ Methylase matches: 1
    
     python annotate_motifs_rebase.py \
         --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/4mC_CG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output BG5_4mC_CG_results.tsv
     # Total motifs: 35
     #REBASE matches: 8 (22.9%)
     #⚠️ Methylase matches: 3
    
     python annotate_motifs_rebase.py \
         --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/5mC_nonCG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output BG5_5mC_nonCG_results.tsv
     #  Total motifs: 39
     #REBASE matches: 7 (17.9%)
     #⚠️ Methylase matches: 0
    
     python annotate_motifs_rebase.py \
         --homer  BG5_4mC_5mC_methylation_results/motif_analysis/homer/4mC_nonCG/homerMotifs.all.motifs \
         --rebase withrefm.txt \
         --output  BG5_4mC_nonCG_results.tsv
     #Total motifs: 35
     #REBASE matches: 9 (25.7%)
     #⚠️ Methylase matches: 3
    
     # -- For 6mA --
     ./dna_methylation_analysis_6mA.sh \
         -i An6_6mA_raw.bed \
         -r bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa \
         -o An6_6mA_methylation_results \
         -c 10 -m 70 -n 50 -f 50 -t 100
    
     python annotate_motifs_rebase.py \
         --homer An6_6mA_methylation_results/nonRedundant.motifs.txt \
         --rebase withrefm.txt \
         --output An6_6mA_results.tsv
     #📊 Summary:
     #Total motifs: 25
     #REBASE matches: 4 (16.0%)
     #⚠️ Methylase matches: 1
    
     ./dna_methylation_analysis_6mA.sh \
         -i BG5_6mA_raw.bed \
         -r bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa \
         -o BG5_6mA_methylation_results \
         -c 10 -m 70 -n 50 -f 50 -t 100
    
     python annotate_motifs_rebase.py \
         --homer BG5_6mA_methylation_results/nonRedundant.motifs.txt \
         --rebase withrefm.txt \
         --output BG5_6mA_results.tsv
     #Total motifs: 25
     #REBASE matches: 7 (28.0%)
     #⚠️ Methylase matches: 2
    
     (table1_env) python3 generate_methylation_excel.py
     #Delete the column 'bestguess_tf'; Mark rebase- and methylase-annotated cells yellow; Move 6mA as the first sheet!