dna_methylation_analysis_4mc_5mc.sh dna_methylation_analysis_6mA.sh annotate_motifs_rebase.py generate_methylation_excel.py
-
Run nextflow methylong after running nextflow bacass
✅ ---- 终极解决方案:使用本地 Dorado 预生成 modBAM + --skip_basecalling (绕过容器 CUDA 限制)---- 鉴于容器镜像的复杂性和 CUDA 版本限制,最可靠、最快速的路径是: 使用您已安装的本地 Dorado v2.0.0 生成已比对的 modBAM 运行 methylong 时添加 --skip_basecalling,跳过内部容器化 Dorado 这样完全绕过容器、CUDA、GPU 传递等所有问题! 步骤 1:如果您已安装本地 Dorado(非容器版),可预先生成 modBAM: # 安装本地 Dorado (CPU 或匹配您 CUDA 的版本) # 下载: https://github.com/nanoporetech/dorado/releases; https://software-docs.nanoporetech.com/dorado/latest/ curl "https://cdn.oxfordnanoportal.com/software/analysis/dorado-2.0.0-linux-x64.tar.gz" -o dorado-2.0.0-linux-x64.tar.gz tar -xzf dorado-2.0.0-linux-x64.tar.gz # 添加到 PATH (永久生效) echo 'export PATH="/home/jhuang/Tools/dorado-2.0.0-linux-x64/bin:$PATH"' >> ~/.bashrc source ~/.bashrc # 验证安装: 2.0.0+20e87c8b dorado --version # 查看可用的碱基调用模型 Positional arguments: model Model selection {fast,hac,sup}@v{version} for automatic model selection including modbases, or path to existing model directory. data The data directory or POD5 file path. # 查看可用的修饰检测模型 dorado basecaller --help | grep -A5 "modified-bases" # ✅ 应看到: 6mA, 4mC_5mC, 5mC_5hmC 等 步骤 2: 生成 modBAM ./generate_mapped_modbam.sh #!/bin/bash #=============================================================================== # generate_mapped_modbam.sh - 本地 Dorado 生成已比对 modBAM #=============================================================================== set -euo pipefail # === 配置 === DORADO="/home/jhuang/Tools/dorado-2.0.0-linux-x64/bin/dorado" # 您的本地 Dorado 路径 REF_AN6="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa" REF_BG5="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5/bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa" POD5_AN6="./X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/An6/2157_4C_PBK79106_7ec05c46/An6_pod5_pass" POD5_BG5="./X101SC26036392-Z01-J003/Release-X101SC26036392-Z01-J003-20260513_01/Data-X101SC26036392-Z01-J003/BG5/2157_4C_PBK79106_7ec05c46/BG5_pod5_pass" OUTDIR="/mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5" # 确保参考基因组已索引 samtools faidx "${REF_AN6}" samtools faidx "${REF_BG5}" echo "🚀 生成 An6 6mA modBAM (已比对)..." "${DORADO}" basecaller \ --modified-bases 6mA \ --emit-moves \ --device cuda:0 \ --reference "${REF_AN6}" \ dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \ "${POD5_AN6}" | samtools view -b - > "${OUTDIR}/An6_6mA_mapped.mod.bam" echo "🚀 生成 BG5 6mA modBAM (已比对)..." "${DORADO}" basecaller \ --modified-bases 6mA \ --emit-moves \ --device cuda:0 \ --reference "${REF_BG5}" \ dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \ "${POD5_BG5}" | samtools view -b - > "${OUTDIR}/BG5_6mA_mapped.mod.bam" echo "🚀 生成 An6 6mA modBAM (已比对)..." "${DORADO}" basecaller \ --modified-bases 4mC_5mC \ --emit-moves \ --device cuda:0 \ --reference "${REF_AN6}" \ dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \ "${POD5_AN6}" | samtools view -b - > "${OUTDIR}/An6_4mC_5mC_mapped.mod.bam" echo "🚀 生成 BG5 6mA modBAM (已比对)..." "${DORADO}" basecaller \ --modified-bases 4mC_5mC \ --emit-moves \ --device cuda:0 \ --reference "${REF_BG5}" \ dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \ "${POD5_BG5}" | samtools view -b - > "${OUTDIR}/BG5_4mC_5mC_mapped.mod.bam" # 验证输出 echo "🔍 验证 BAM 文件..." for BAM in "${OUTDIR}"/*_6mA_mapped.mod.bam; do if samtools quickcheck "${BAM}" 2>/dev/null; then READS=$(samtools view -c "${BAM}") TAGS=$(samtools view "${BAM}" | head -50 | grep -o "MM:Z:[^ ]*" | sort -u | tr '\n' ' ') echo "✅ $(basename "${BAM}"): ${READS} reads, tags: ${TAGS}" else echo "❌ $(basename "${BAM}"): 文件损坏或格式错误" fi done echo "🎉 modBAM 生成完成!" 步骤 3: 创建 methylong samplesheet(使用预生成 modBAM) See samplesheet_6mA.csv and samplesheet_4mC_5mC.csv 步骤 4: 运行 methylong with --skip_basecalling ## Check if BAMs are read-name sorted: #samtools view -H An6_6mA_mapped.mod.bam | grep "^@HD" ## Should show: @HD VN:1.6 SO:queryname ## If not sorted correctly: #samtools sort -n -o An6.sorted_by_name.bam An6_6mA_mapped.mod.bam #samtools sort -n -o BG5.sorted_by_name.bam BG5_6mA_mapped.mod.bam ## Check BAM integrity: #samtools quickcheck -v An6_6mA_mapped.mod.bam #samtools quickcheck -v An6_after_trim.bam ## Use modkit to check tags: #modkit modbam check-tags An6_6mA_mapped.mod.bam #modkit modbam check-tags An6_after_trim.bam #BUG: '--skip_basecalling' does not exist (see the complete option list below.) nextflow run nf-core/methylong \ -r 2.0.0 \ -profile docker \ --input samplesheet_6mA.csv \ --outdir methylome_out_6mA \ --skip_basecalling \ # ⭐ 关键: 跳过内部 Dorado,使用预生成 modBAM -resume \ -work-dir methylome_out_6mA/work # Ist die BAM-Datei unaligniert? (keine @SQ Header mit Alignment) samtools view -H An6_6mA_mapped.mod.bam | head -10 # Sind MM/ML-Tags für 6mA vorhanden? samtools view An6_6mA_mapped.mod.bam | head -5 | grep -o "MM:Z:[^ ]*" #NEXT_WEEK FOLLOWING the command under the website #https://bioinformatics.cc/debug-methylomic-analysis-of-data_tam_dnaseq_2026_an6_bg5/ # FIRSTLY RUNNING NEXTFLOW # 1️⃣ 处理 6mA BAM(仅含 6mA 信号) nextflow run nf-core/methylong -r 2.0.0 -profile docker \ --input samplesheet_6mA.csv \ --outdir methylome_out_6mA \ --ont_aligner minimap2 \ --m6a \ --skip_snvs \ -resume \ -work-dir methylome_out_6mA/work # 2️⃣ 处理 4mC_5mC BAM(含 4mC + 5mC 信号) nextflow run nf-core/methylong -r 2.0.0 -profile docker \ --input samplesheet_4mC_5mC.csv \ --outdir methylome_out_4mC_5mC \ --ont_aligner minimap2 \ --all_contexts \ --skip_snvs \ -resume \ -work-dir methylome_out_4mC_5mC/work 步骤 5: Check output [ ] ✅ 本地 Dorado 生成已比对 modBAM: samtools quickcheck An6_6mA_mapped.mod.bam 通过 [ ] ✅ BAM 含 @SQ 头: samtools view -H An6_6mA_mapped.mod.bam | grep "^@SQ" [ ] ✅ BAM 含修饰标签: samtools view An6_6mA_mapped.mod.bam | grep "MM:Z:6mA" | head -3 returns nothing, but samtools view An6_6mA_mapped.mod.bam | grep "MM:Z:" | head -3 [ ] ✅ samplesheet 使用绝对路径 + 精确 sample 名称 [ ] ✅ 运行命令添加 --skip_basecalling [ ] ✅ 输出目录存在: methylome_out_6mA/methylation_calls/*.bedMethyl.gz -
SECONDLY RERUN modkit pileup to correct the BUG from nf-core/methylong
# ---- 6mA 对应过滤命令 ---- # 方案 A:降低阈值测试 (NOT_USED) # docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam An6_6mA_test.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif A 0 --filter-threshold A:0.7 --threads 8 方案 B:6mA: 无过滤 + 手动后处理 (CHOSEN) # ✅ 第 1 步:生成无过滤的原始结果(6mA) docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam An6_6mA_raw.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif A 0 --no-filtering --threads 8 docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_6mA/ont/BG5/alignment/BG5_aligned.bam BG5_6mA_raw.bed --ref bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa --motif A 0 --no-filtering --threads 8 # # ✅ 第 2 步:检查原始概率最大值 # awk '{print $11}' An6_6mA_raw.bed | sort -rn | head -5 # # 如果最大值 < 70 → 降低分析阈值 # # ✅ 第 3 步:手动过滤生成最终结果 # # ✅ 过滤高置信度 6mA 位点:概率≥70% + 覆盖度≥10 # awk '$4 ~ /^A,/ && $11 >= 70 && $5 >= 10' An6_6mA_raw.bed > An6_6mA_final.bed # # ✅ 第 4 步:验证并统计 # echo "=== 6mA 最终结果 ===" # # 📊 统计结果 # echo "=== 6mA 高置信度位点统计 ===" # echo "总位点数: $(wc -l < An6_6mA_final.bed)" # echo "" # echo "概率分布 (前10名):" # awk '{print $11}' An6_6mA_final.bed | sort -rn | uniq -c | head -10 # echo "" # echo "按染色体分布:" # cut -f1 An6_6mA_final.bed | sort | uniq -c | sort -rn # 方案 C:检查 BAM 信号原始分布 (For DEBUG) # # 提取 ML 标签中的概率值 (0-255 缩放 → 转换为 0-1) # samtools view ./methylome_out_6mA/ont/An6/alignment/An6_aligned.bam | head -100 | \ # grep -oP 'ML:B:C,\K[0-9,]+' | tr ',' '\n' | \ # awk '{if($1>0) print $1/255}' | sort -n | uniq -c | head -10 # ---- 4mC_5mC 对应过滤命令 ---- docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_4mC_5mC/ont/An6/alignment/An6_aligned.bam An6_4mC_5mC_raw.bed --ref bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa --motif CG 0 --motif C 0 --no-filtering --threads 8 docker run --rm -v /mnt/md1/DATA/Data_Tam_DNAseq_2026_An6_BG5:/data -w /data quay.io/biocontainers/ont-modkit:0.5.0--hcdda2d0_2 modkit pileup ./methylome_out_4mC_5mC/ont/BG5/alignment/BG5_aligned.bam BG5_4mC_5mC_raw.bed --ref bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa --motif CG 0 --motif C 0 --no-filtering --threads 8 #修饰类型 列4匹配正则 说明 #5mC @ CG ^m,CG, 5-甲基胞嘧啶,在CG motif上 #5mC @ C ^m,C, 5-甲基胞嘧啶,在单碱基C上 #4mC @ CG ^21839,CG, 4-甲基胞嘧啶,在CG motif上 #4mC @ C ^21839,C, 4-甲基胞嘧啶,在单碱基C上 ┌─────────────────────────────────────────────────────┐ │ Column 4 格式:[修饰代码],[motif],[offset] │ ├─────────────────────────────────────────────────────┤ │ m,CG,0 → 5mC @ CG 位点 │ │ m,C,0 → 5mC @ 任意 C 位点(包含 CG+非CG) │ │ 21839,CG,0 → 4mC @ CG 位点 │ │ 21839,C,0 → 4mC @ 任意 C 位点(包含 CG+非CG) │ └─────────────────────────────────────────────────────┘ ⚠️ 重要提示:m,C,0 包含 所有 C 位点(CG + CHG + CHH),与 m,CG,0 有重叠! 优先级 1: 4mC @ CG (21839,CG,0) ← 细菌最主要的修饰 优先级 2: 5mC @ CG (m,CG,0) ← 次要修饰 优先级 3: 非 CG 修饰 ← 探索性分析 重点关注: ✅ 4mC @ CG - 细菌限制修饰系统的核心 ✅ 5mC @ CG - 部分细菌也有 5mC ⚠️ 非 CG 修饰 - 细菌中较少,可先忽略 修饰类型 位点数量 占比 生物学解释 5mC @ CG 437 33% 经典 CpG 甲基化 5mC @ non-CG 870 67% ⭐ 非 CG 位点更多,正常! 4mC @ CG 7 17% 4mC 很少在 CG 上 4mC @ non-CG 34 83% ⭐ 4mC 偏好非 CG 位点 -
DNA methylation analysis (Sample An6: Acinetobacter harbinensis and Sample BG5: Pedobacter sp. strain BG5, both are bacteria, not plants.)
mamba create -n methylation -c conda-forge -c bioconda \ r-base=4.3 \ r-dss \ r-methylkit \ r-chipseeker \ r-qvalue \ metilene \ bedtools \ samtools \ ucsc-bedgraphtobigwig \ -y mamba activate methylation #chmod +x dna_methylation_analysis_4mC_5mC.sh (methylation) bash dna_methylation_analysis_4mC_5mC.sh \ -i An6_4mC_5mC_raw.bed \ -r bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa \ -o An6_4mC_5mC_methylation_results \ -c 10 -m 70 -n 50 -f 50 -t 100 #Using homerMotifs.all.motifs rather than nonRedundant.motifs (methylation) python annotate_motifs_rebase.py \ --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/5mC_CG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output An6_5mC_CG_results.tsv # Total motifs: 35 # REBASE matches: 8 (22.9%) # ⚠️ Methylase matches: 2 python annotate_motifs_rebase.py \ --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/4mC_CG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output An6_4mC_CG_results.tsv # Total motifs: Total motifs: 35 # REBASE matches: 6 (17.1%) #⚠️ Methylase matches: 0 python annotate_motifs_rebase.py \ --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/5mC_nonCG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output An6_5mC_nonCG_results.tsv #📊 Summary: #Total motifs: 37 #REBASE matches: 4 (10.8%) #⚠️ Methylase matches: 0 python annotate_motifs_rebase.py \ --homer An6_4mC_5mC_methylation_results/motif_analysis/homer/4mC_nonCG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output An6_4mC_nonCG_results.tsv # Total motifs: 35 #REBASE matches: 10 (28.6%) #⚠️ Methylase matches: 2 bash dna_methylation_analysis_4mC_5mC.sh \ -i BG5_4mC_5mC_raw.bed \ -r bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa \ -o BG5_4mC_5mC_methylation_results \ -c 10 -m 70 -n 50 -f 50 -t 100 python annotate_motifs_rebase.py \ --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/5mC_CG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output BG5_5mC_CG_results.tsv # 📊 Summary: #Total motifs: 36 #REBASE matches: 7 (19.4%) #⚠️ Methylase matches: 1 python annotate_motifs_rebase.py \ --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/4mC_CG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output BG5_4mC_CG_results.tsv # Total motifs: 35 #REBASE matches: 8 (22.9%) #⚠️ Methylase matches: 3 python annotate_motifs_rebase.py \ --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/5mC_nonCG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output BG5_5mC_nonCG_results.tsv # Total motifs: 39 #REBASE matches: 7 (17.9%) #⚠️ Methylase matches: 0 python annotate_motifs_rebase.py \ --homer BG5_4mC_5mC_methylation_results/motif_analysis/homer/4mC_nonCG/homerMotifs.all.motifs \ --rebase withrefm.txt \ --output BG5_4mC_nonCG_results.tsv #Total motifs: 35 #REBASE matches: 9 (25.7%) #⚠️ Methylase matches: 3 # -- For 6mA -- ./dna_methylation_analysis_6mA.sh \ -i An6_6mA_raw.bed \ -r bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa \ -o An6_6mA_methylation_results \ -c 10 -m 70 -n 50 -f 50 -t 100 python annotate_motifs_rebase.py \ --homer An6_6mA_methylation_results/nonRedundant.motifs.txt \ --rebase withrefm.txt \ --output An6_6mA_results.tsv #📊 Summary: #Total motifs: 25 #REBASE matches: 4 (16.0%) #⚠️ Methylase matches: 1 ./dna_methylation_analysis_6mA.sh \ -i BG5_6mA_raw.bed \ -r bacass_out/Medaka/BG5-unicycler-medaka_polished_genome.fa \ -o BG5_6mA_methylation_results \ -c 10 -m 70 -n 50 -f 50 -t 100 python annotate_motifs_rebase.py \ --homer BG5_6mA_methylation_results/nonRedundant.motifs.txt \ --rebase withrefm.txt \ --output BG5_6mA_results.tsv #Total motifs: 25 #REBASE matches: 7 (28.0%) #⚠️ Methylase matches: 2 (table1_env) python3 generate_methylation_excel.py #Delete the column 'bestguess_tf'; Mark rebase- and methylase-annotated cells yellow; Move 6mA as the first sheet!