🚀 DNA甲基化分析一体化脚本
这是一个完整的”过滤 + 分离 + 分析”自动化脚本:
#!/bin/bash
#===============================================================================
# 脚本名称: dna_methylation_analysis.sh
# 功能描述: DNA甲基化数据过滤、分离、motif分析和可视化一体化流程
# 作者: Bioinformatics Pipeline
# 版本: 1.0
# 日期: 2026
# 用法: bash dna_methylation_analysis.sh -i input.bed -r reference.fa -o output_dir
#===============================================================================
set -euo pipefail # 严格模式:遇到错误立即退出
#-------------------------------------------------------------------------------
# 配置参数
#-------------------------------------------------------------------------------
# 默认参数
INPUT_BED=""
REF_GENOME=""
OUTPUT_DIR="methylation_analysis"
MIN_COVERAGE=10
MIN_MOD_5MC=70
MIN_MOD_4MC=50
FLANK_SIZE=50
THREADS=8
RUN_HOMER=true
RUN_MEME=false
RUN_VISUALIZATION=true
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数
log_info() {
echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
exit 1
}
log_step() {
echo -e "${BLUE}[STEP]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
}
#-------------------------------------------------------------------------------
# 参数解析
#-------------------------------------------------------------------------------
usage() {
cat << EOF
用法: $0 -i <input.bed> -r <reference.fa> [选项]
必需参数:
-i, --input FILE 输入的bed文件 (modkit pileup输出)
-r, --reference FILE 参考基因组fasta文件
可选参数:
-o, --outdir DIR 输出目录 (默认: methylation_analysis)
-c, --coverage INT 最小覆盖度 (默认: 10)
-m, --mod5mc INT 5mC最小修饰率% (默认: 70)
-n, --mod4mc INT 4mC最小修饰率% (默认: 50)
-f, --flank INT 提取序列的侧翼大小 (默认: 50)
-t, --threads INT 并行线程数 (默认: 8)
--no-homer 不运行HOMER motif分析
--meme 运行MEME motif分析 (默认关闭)
--no-viz 不生成可视化图表
-h, --help 显示帮助信息
示例:
$0 -i An6_4mC_5mC_raw.bed -r genome.fa -o results -c 10 -m 70
$0 -i input.bed -r genome.fa --mod5mc 80 --mod4mc 60 --threads 16
EOF
exit 0
}
# 解析命令行参数
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input)
INPUT_BED="$2"
shift 2
;;
-r|--reference)
REF_GENOME="$2"
shift 2
;;
-o|--outdir)
OUTPUT_DIR="$2"
shift 2
;;
-c|--coverage)
MIN_COVERAGE="$2"
shift 2
;;
-m|--mod5mc)
MIN_MOD_5MC="$2"
shift 2
;;
-n|--mod4mc)
MIN_MOD_4MC="$2"
shift 2
;;
-f|--flank)
FLANK_SIZE="$2"
shift 2
;;
-t|--threads)
THREADS="$2"
shift 2
;;
--no-homer)
RUN_HOMER=false
shift
;;
--meme)
RUN_MEME=true
shift
;;
--no-viz)
RUN_VISUALIZATION=false
shift
;;
-h|--help)
usage
;;
*)
log_error "未知参数: $1"
usage
;;
esac
done
#-------------------------------------------------------------------------------
# 参数验证
#-------------------------------------------------------------------------------
log_step "验证输入参数"
[[ -z "$INPUT_BED" ]] && log_error "未指定输入bed文件 (-i)"
[[ -z "$REF_GENOME" ]] && log_error "未指定参考基因组 (-r)"
[[ ! -f "$INPUT_BED" ]] && log_error "输入文件不存在: $INPUT_BED"
[[ ! -f "$REF_GENOME" ]] && log_error "参考基因组不存在: $REF_GENOME"
# 检查必需工具
check_tool() {
if ! command -v "$1" &> /dev/null; then
log_warn "未找到工具: $1 (某些功能可能不可用)"
return 1
fi
return 0
}
log_info "检查必需工具..."
check_tool samtools || SAMTOOLS_FOUND=false
check_tool bedtools || BEDTOOLS_FOUND=false
check_tool awk || AWK_FOUND=false
if [[ "$RUN_HOMER" == "true" ]]; then
check_tool findMotifsGenome.pl || HOMER_FOUND=false
fi
#-------------------------------------------------------------------------------
# 创建输出目录结构
#-------------------------------------------------------------------------------
log_step "创建输出目录"
mkdir -p "$OUTPUT_DIR"/{filtered,sequences,motif_analysis/homer,motif_analysis/meme,figures,reports,logs}
# 记录配置
cat > "$OUTPUT_DIR/reports/analysis_config.txt" << EOF
=====================================
DNA甲基化分析配置
=====================================
分析时间: $(date)
输入文件: $INPUT_BED
参考基因组: $REF_GENOME
输出目录: $OUTPUT_DIR
过滤参数:
最小覆盖度: $MIN_COVERAGE
5mC最小修饰率: ${MIN_MOD_5MC}%
4mC最小修饰率: ${MIN_MOD_4MC}%
侧翼序列大小: ${FLANK_SIZE}bp
分析选项:
运行HOMER: $RUN_HOMER
运行MEME: $RUN_MEME
生成可视化: $RUN_VISUALIZATION
线程数: $THREADS
=====================================
EOF
log_info "配置已保存到: $OUTPUT_DIR/reports/analysis_config.txt"
#-------------------------------------------------------------------------------
# 步骤1: 数据质量评估
#-------------------------------------------------------------------------------
log_step "步骤1/7: 数据质量评估"
RAW_COUNT=$(wc -l < "$INPUT_BED")
log_info "原始数据总行数: $RAW_COUNT"
# 统计各类型位点数量
log_info "统计各修饰类型..."
COUNT_5MC_CG=$(awk -F'\t' '$4 ~ /^m,CG,/' "$INPUT_BED" | wc -l)
COUNT_5MC_NONCG=$(awk -F'\t' '$4 ~ /^m,C,/ && $4 !~ /^m,CG,/' "$INPUT_BED" | wc -l)
COUNT_4MC_CG=$(awk -F'\t' '$4 ~ /^21839,CG,/' "$INPUT_BED" | wc -l)
COUNT_4MC_NONCG=$(awk -F'\t' '$4 ~ /^21839,C,/ && $4 !~ /^21839,CG,/' "$INPUT_BED" | wc -l)
cat > "$OUTPUT_DIR/reports/raw_statistics.txt" << EOF
原始数据统计 (无过滤)
=====================================
5mC @ CG: $COUNT_5MC_CG
5mC @ non-CG: $COUNT_5MC_NONCG
4mC @ CG: $COUNT_4MC_CG
4mC @ non-CG: $COUNT_4MC_NONCG
-------------------------------------
总计: $RAW_COUNT
=====================================
EOF
# 修饰率分布
log_info "计算修饰率分布..."
awk -F'\t' '{print $11}' "$INPUT_BED" | \
awk '{
if($1>=90) h["90-100"]++;
else if($1>=70) h["70-90"]++;
else if($1>=50) h["50-70"]++;
else if($1>=30) h["30-50"]++;
else h["0-30"]++
} END {
print "修饰率分布:";
for(i in h) print i"%: "h[i]
}' > "$OUTPUT_DIR/reports/mod_rate_distribution.txt"
log_info "质量评估完成!"
#-------------------------------------------------------------------------------
# 步骤2: 数据过滤和分离
#-------------------------------------------------------------------------------
log_step "步骤2/7: 数据过滤和分离"
# 定义过滤函数
filter_and_separate() {
local type_name=$1
local pattern=$2
local min_mod=$3
local output_file="$OUTPUT_DIR/filtered/${type_name}_filtered.bed"
log_info "过滤 $type_name (修饰率≥${min_mod}%, 覆盖度≥$MIN_COVERAGE)..."
awk -F'\t' -v pat="$pattern" -v min_mod="$min_mod" -v min_cov="$MIN_COVERAGE" '
$4 ~ pat && $11 >= min_mod && $5 >= min_cov {
print $0
}' "$INPUT_BED" > "$output_file"
local count=$(wc -l < "$output_file")
log_info " → 保留 $count 个位点"
echo "$count"
}
# 过滤和分离4种类型
COUNT_FILT_5MC_CG=$(filter_and_separate "5mC_CG" "^m,CG," "$MIN_MOD_5MC")
COUNT_FILT_5MC_NONCG=$(filter_and_separate "5mC_nonCG" "^m,C," "$MIN_MOD_5MC" | \
awk -v pat="^m,CG," '$4 !~ pat' > "$OUTPUT_DIR/filtered/5mC_nonCG_filtered.bed" && \
wc -l < "$OUTPUT_DIR/filtered/5mC_nonCG_filtered.bed")
# 重新正确过滤5mC non-CG(排除CG)
awk -F'\t' -v min_mod="$MIN_MOD_5MC" -v min_cov="$MIN_COVERAGE" '
$4 ~ /^m,C,/ && $4 !~ /^m,CG,/ && $11 >= min_mod && $5 >= min_cov {
print $0
}' "$INPUT_BED" > "$OUTPUT_DIR/filtered/5mC_nonCG_filtered.bed"
COUNT_FILT_5MC_NONCG=$(wc -l < "$OUTPUT_DIR/filtered/5mC_nonCG_filtered.bed")
COUNT_FILT_4MC_CG=$(filter_and_separate "4mC_CG" "^21839,CG," "$MIN_MOD_4MC")
COUNT_FILT_4MC_NONCG=$(filter_and_separate "4mC_nonCG" "^21839,C," "$MIN_MOD_4MC" | \
awk -v pat="^21839,CG," '$4 !~ pat' > "$OUTPUT_DIR/filtered/4mC_nonCG_filtered.bed" && \
wc -l < "$OUTPUT_DIR/filtered/4mC_nonCG_filtered.bed")
# 重新正确过滤4mC non-CG
awk -F'\t' -v min_mod="$MIN_MOD_4MC" -v min_cov="$MIN_COVERAGE" '
$4 ~ /^21839,C,/ && $4 !~ /^21839,CG,/ && $11 >= min_mod && $5 >= min_cov {
print $0
}' "$INPUT_BED" > "$OUTPUT_DIR/filtered/4mC_nonCG_filtered.bed"
COUNT_FILT_4MC_NONCG=$(wc -l < "$OUTPUT_DIR/filtered/4mC_nonCG_filtered.bed")
# 生成过滤统计报告
cat > "$OUTPUT_DIR/reports/filtering_summary.txt" << EOF
过滤结果统计
=====================================
过滤条件:
最小覆盖度: $MIN_COVERAGE
5mC修饰率阈值: ${MIN_MOD_5MC}%
4mC修饰率阈值: ${MIN_MOD_4MC}%
过滤后位点数量:
5mC @ CG: $COUNT_FILT_5MC_CG (原始: $COUNT_5MC_CG, 保留: $(awk "BEGIN {printf \"%.1f\", $COUNT_FILT_5MC_CG/$COUNT_5MC_CG*100}")%)
5mC @ non-CG: $COUNT_FILT_5MC_NONCG (原始: $COUNT_5MC_NONCG, 保留: $(awk "BEGIN {printf \"%.1f\", $COUNT_FILT_5MC_NONCG/$COUNT_5MC_NONCG*100}")%)
4mC @ CG: $COUNT_FILT_4MC_CG (原始: $COUNT_4MC_CG, 保留: $(awk "BEGIN {printf \"%.1f\", $COUNT_FILT_4MC_CG/$COUNT_4MC_CG*100}")%)
4mC @ non-CG: $COUNT_FILT_4MC_NONCG (原始: $COUNT_4MC_NONCG, 保留: $(awk "BEGIN {printf \"%.1f\", $COUNT_FILT_4MC_NONCG/$COUNT_4MC_NONCG*100}")%)
=====================================
EOF
log_info "过滤统计已保存到: $OUTPUT_DIR/reports/filtering_summary.txt"
cat "$OUTPUT_DIR/reports/filtering_summary.txt"
#-------------------------------------------------------------------------------
# 步骤3: 提取序列上下文
#-------------------------------------------------------------------------------
log_step "步骤3/7: 提取序列上下文"
# 检查bedtools
if [[ "${BEDTOOLS_FOUND:-true}" != "true" ]]; then
log_warn "bedtools未安装,跳过序列提取"
SEQUENCES_EXTRACTED=false
else
SEQUENCES_EXTRACTED=true
# 索引参考基因组
if [[ ! -f "${REF_GENOME}.fai" ]]; then
log_info "索引参考基因组..."
samtools faidx "$REF_GENOME"
fi
# 创建染色体大小文件
cut -f1,2 "${REF_GENOME}.fai" > "$OUTPUT_DIR/reports/chrom.sizes"
# 提取序列函数
extract_sequences() {
local bed_file=$1
local type_name=$2
local output_fa="$OUTPUT_DIR/sequences/${type_name}_sequences.fa"
if [[ ! -s "$bed_file" ]]; then
log_warn "$type_name 位点数为0,跳过序列提取"
return
fi
log_info "提取 $type_name 序列 (±${FLANK_SIZE}bp)..."
# 生成BED文件(带侧翼序列)
awk -F'\t' -v flank="$FLANK_SIZE" '{
start = ($2 - flank > 0) ? $2 - flank : 0;
end = $3 + flank;
print $1"\t"start"\t"end"\t"$4"_"NR
}' "$bed_file" > "$OUTPUT_DIR/sequences/${type_name}_regions.bed"
# 提取fasta
bedtools getfasta -fi "$REF_GENOME" \
-bed "$OUTPUT_DIR/sequences/${type_name}_regions.bed" \
-fo "$output_fa" \
-nameOnly
local seq_count=$(grep -c "^>" "$output_fa" || echo 0)
log_info " → 提取 $seq_count 条序列"
}
# 为每个类型提取序列
extract_sequences "$OUTPUT_DIR/filtered/5mC_CG_filtered.bed" "5mC_CG"
extract_sequences "$OUTPUT_DIR/filtered/5mC_nonCG_filtered.bed" "5mC_nonCG"
extract_sequences "$OUTPUT_DIR/filtered/4mC_CG_filtered.bed" "4mC_CG"
extract_sequences "$OUTPUT_DIR/filtered/4mC_nonCG_filtered.bed" "4mC_nonCG"
fi
#-------------------------------------------------------------------------------
# 步骤4: 生成背景序列
#-------------------------------------------------------------------------------
log_step "步骤4/7: 生成背景序列"
if [[ "$SEQUENCES_EXTRACTED" == "true" ]]; then
# 合并所有target区域用于生成背景
cat "$OUTPUT_DIR/sequences/"*_regions.bed 2>/dev/null | \
bedtools shuffle -g "$OUTPUT_DIR/reports/chrom.sizes" | \
bedtools getfasta -fi "$REF_GENOME" -bed stdin \
-fo "$OUTPUT_DIR/sequences/background_sequences.fa" \
-nameOnly
BG_COUNT=$(grep -c "^>" "$OUTPUT_DIR/sequences/background_sequences.fa" || echo 0)
log_info "生成 $BG_COUNT 条背景序列"
else
log_warn "跳过背景序列生成"
fi
#-------------------------------------------------------------------------------
# 步骤5: Motif富集分析 (HOMER)
#-------------------------------------------------------------------------------
log_step "步骤5/7: Motif富集分析 (HOMER)"
if [[ "$RUN_HOMER" == "true" ]]; then
if [[ "${HOMER_FOUND:-true}" != "true" ]]; then
log_warn "HOMER未安装,跳过motif分析"
RUN_HOMER=false
else
run_homer_motif() {
local type_name=$1
local bed_file="$OUTPUT_DIR/sequences/${type_name}_regions.bed"
local output_dir="$OUTPUT_DIR/motif_analysis/homer/${type_name}"
if [[ ! -s "$bed_file" ]]; then
log_warn "$type_name 无位点,跳过HOMER分析"
return
fi
local region_count=$(wc -l < "$bed_file")
if [[ $region_count -lt 10 ]]; then
log_warn "$type_name 位点数过少 ($region_count),跳过HOMER分析"
return
fi
log_info "运行HOMER: $type_name..."
mkdir -p "$output_dir"
findMotifsGenome.pl "$bed_file" \
"$REF_GENOME" \
"$output_dir" \
-bg "$OUTPUT_DIR/sequences/background_sequences.fa" \
-len 6,8,10,12 \
-p "$THREADS" \
> "$output_dir/homer.log" 2>&1
log_info " → 结果保存至: $output_dir"
}
run_homer_motif "5mC_CG"
run_homer_motif "5mC_nonCG"
run_homer_motif "4mC_CG"
run_homer_motif "4mC_nonCG"
# 汇总HOMER结果
log_info "汇总HOMER结果..."
echo "HOMER Motif富集结果摘要" > "$OUTPUT_DIR/reports/homer_summary.txt"
echo "=====================================" >> "$OUTPUT_DIR/reports/homer_summary.txt"
for type in 5mC_CG 5mC_nonCG 4mC_CG 4mC_nonCG; do
results_file="$OUTPUT_DIR/motif_analysis/homer/${type}/knownResults.txt"
if [[ -f "$results_file" ]]; then
echo -e "\n$type 前5个富集motif:" >> "$OUTPUT_DIR/reports/homer_summary.txt"
head -6 "$results_file" >> "$OUTPUT_DIR/reports/homer_summary.txt"
fi
done
fi
else
log_info "HOMER分析已禁用"
fi
#-------------------------------------------------------------------------------
# 步骤6: Motif富集分析 (MEME - 可选)
#-------------------------------------------------------------------------------
log_step "步骤6/7: Motif富集分析 (MEME)"
if [[ "$RUN_MEME" == "true" ]]; then
if ! command -v dreme &> /dev/null; then
log_warn "MEME Suite未安装,跳过MEME分析"
RUN_MEME=false
else
run_meme_dreme() {
local type_name=$1
local seq_file="$OUTPUT_DIR/sequences/${type_name}_sequences.fa"
local output_dir="$OUTPUT_DIR/motif_analysis/meme/${type_name}"
if [[ ! -s "$seq_file" ]]; then
return
fi
local seq_count=$(grep -c "^>" "$seq_file" || echo 0)
if [[ $seq_count -lt 10 ]]; then
return
fi
log_info "运行DREME: $type_name..."
mkdir -p "$output_dir"
dreme -p "$seq_file" \
-n "$OUTPUT_DIR/sequences/background_sequences.fa" \
-oc "$output_dir" \
-e 0.05 \
-p "$THREADS" \
> "$output_dir/dreme.log" 2>&1
log_info " → 结果保存至: $output_dir"
}
run_meme_dreme "5mC_CG"
run_meme_dreme "5mC_nonCG"
run_meme_dreme "4mC_CG"
run_meme_dreme "4mC_nonCG"
fi
else
log_info "MEME分析已禁用"
fi
#-------------------------------------------------------------------------------
# 步骤7: 生成可视化图表
#-------------------------------------------------------------------------------
log_step "步骤7/7: 生成可视化图表"
if [[ "$RUN_VISUALIZATION" == "true" ]]; then
# 创建Python可视化脚本
cat > "$OUTPUT_DIR/generate_plots.py" << 'PYTHON_SCRIPT'
#!/usr/bin/env python3
"""DNA甲基化数据可视化脚本"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
# 设置风格
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)
plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
def load_data(bed_file):
"""加载bed文件"""
columns = ['chrom', 'start', 'end', 'name', 'score', 'strand',
'thickStart', 'thickEnd', 'color', 'Nvalid_cov',
'percent_modified', 'Nmod', 'Ncanonical', 'Nother_mod',
'Ndelete', 'Nfail', 'Ndiff', 'Nnocall']
return pd.read_csv(bed_file, sep='\t', header=None, names=columns)
def classify_modifications(df):
"""分类修饰类型"""
df = df.copy()
df['mod_type'] = df['name'].apply(lambda x: '5mC' if x.startswith('m,') else '4mC')
df['motif_type'] = df['name'].apply(lambda x: 'CG' if ',CG,' in x else 'non-CG')
df['category'] = df['mod_type'] + '_' + df['motif_type']
return df
def plot_counts_comparison(df_raw, df_filt, output_file):
"""图1: 位点数量对比"""
fig, ax = plt.subplots(figsize=(10, 6))
categories = ['5mC_CG', '5mC_nonCG', '4mC_CG', '4mC_nonCG']
counts_raw = [len(df_raw[df_raw['category']==c]) for c in categories]
counts_filt = [len(df_filt[df_filt['category']==c]) for c in categories]
x = np.arange(len(categories))
width = 0.35
bars1 = ax.bar(x - width/2, counts_raw, width, label='原始数据',
color='lightgray', edgecolor='black')
bars2 = ax.bar(x + width/2, counts_filt, width, label=f'过滤后',
color='skyblue', edgecolor='black')
ax.set_ylabel('位点数量', fontsize=12)
ax.set_title('A: 甲基化位点数量统计', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(['5mC@CG', '5mC@non-CG', '4mC@CG', '4mC@non-CG'])
ax.legend()
ax.grid(axis='y', alpha=0.3)
# 添加数值标签
for bars in [bars1, bars2]:
for bar in bars:
height = bar.get_height()
if height > 0:
ax.annotate(f'{int(height)}',
xy=(bar.get_x() + bar.get_width() / 2, height),
xytext=(0, 3),
textcoords="offset points",
ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
print(f" ✓ 保存: {output_file}")
def plot_modification_distribution(df, output_file):
"""图2: 修饰率分布"""
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()
categories = ['5mC_CG', '5mC_nonCG', '4mC_CG', '4mC_nonCG']
colors = {'5mC_CG': '#1f77b4', '5mC_nonCG': '#ff7f0e',
'4mC_CG': '#2ca02c', '4mC_nonCG': '#d62728'}
titles = {'5mC_CG': '5mC @ CG', '5mC_nonCG': '5mC @ non-CG',
'4mC_CG': '4mC @ CG', '4mC_nonCG': '4mC @ non-CG'}
for idx, cat in enumerate(categories):
data = df[df['category']==cat]['percent_modified']
if len(data) > 0:
sns.histplot(data=data, ax=axes[idx], color=colors[cat],
bins=50, kde=True, edgecolor='black')
axes[idx].axvline(x=50, color='red', linestyle='--', alpha=0.7, label='阈值 50%')
axes[idx].axvline(x=70, color='darkred', linestyle='--', alpha=0.7, label='阈值 70%')
axes[idx].set_xlabel('修饰率 (%)')
axes[idx].set_ylabel('位点数量')
axes[idx].set_title(f'{titles[cat]} (n={len(data)})')
axes[idx].legend(fontsize=8)
axes[idx].grid(axis='y', alpha=0.3)
else:
axes[idx].text(0.5, 0.5, '无数据', ha='center', va='center',
transform=axes[idx].transAxes, fontsize=14)
plt.suptitle('B: 修饰率分布', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
print(f" ✓ 保存: {output_file}")
def plot_coverage_distribution(df, output_file):
"""图3: 覆盖度分布"""
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 5mC
data_5mC_CG = df[df['category']=='5mC_CG']['Nvalid_cov']
data_5mC_nonCG = df[df['category']=='5mC_nonCG']['Nvalid_cov']
if len(data_5mC_CG) > 0:
sns.kdeplot(data=data_5mC_CG, ax=axes[0], label='5mC@CG',
color='#1f77b4', linewidth=2)
if len(data_5mC_nonCG) > 0:
sns.kdeplot(data=data_5mC_nonCG, ax=axes[0], label='5mC@non-CG',
color='#ff7f0e', linewidth=2)
axes[0].set_xlabel('覆盖度 (Reads)')
axes[0].set_ylabel('密度')
axes[0].set_title('C: 5mC 覆盖度分布')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)
# 4mC
data_4mC_CG = df[df['category']=='4mC_CG']['Nvalid_cov']
data_4mC_nonCG = df[df['category']=='4mC_nonCG']['Nvalid_cov']
if len(data_4mC_CG) > 0:
sns.kdeplot(data=data_4mC_CG, ax=axes[1], label='4mC@CG',
color='#2ca02c', linewidth=2)
if len(data_4mC_nonCG) > 0:
sns.kdeplot(data=data_4mC_nonCG, ax=axes[1], label='4mC@non-CG',
color='#d62728', linewidth=2)
axes[1].set_xlabel('覆盖度 (Reads)')
axes[1].set_ylabel('密度')
axes[1].set_title('D: 4mC 覆盖度分布')
axes[1].legend()
axes[1].grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
print(f" ✓ 保存: {output_file}")
def generate_statistics_table(df_raw, df_filt, output_file):
"""图4: 统计汇总表"""
fig, ax = plt.subplots(figsize=(10, 4))
ax.axis('off')
categories = ['5mC_CG', '5mC_nonCG', '4mC_CG', '4mC_nonCG']
stats_data = []
for cat in categories:
cat_raw = df_raw[df_raw['category']==cat]
cat_filt = df_filt[df_filt['category']==cat]
if len(cat_raw) > 0:
retention = len(cat_filt) / len(cat_raw) * 100
else:
retention = 0
stats_data.append({
'类型': cat.replace('_', ' @ '),
'原始位点数': len(cat_raw),
'过滤后位点数': len(cat_filt),
'保留率(%)': f"{retention:.1f}",
'中位修饰率(%)': f"{cat_filt['percent_modified'].median():.2f}" if len(cat_filt)>0 else "N/A",
'平均修饰率(%)': f"{cat_filt['percent_modified'].mean():.2f}" if len(cat_filt)>0 else "N/A",
'中位覆盖度': f"{cat_filt['Nvalid_cov'].median():.0f}" if len(cat_filt)>0 else "N/A",
})
df_stats = pd.DataFrame(stats_data)
table = ax.table(cellText=df_stats.values, colLabels=df_stats.columns,
cellLoc='center', loc='center', bbox=[0, 0, 1, 1])
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.2, 1.8)
# 设置表头样式
for i in range(len(df_stats.columns)):
table[(0, i)].set_facecolor('#4472C4')
table[(0, i)].set_text_props(color='white', fontweight='bold')
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
print(f" ✓ 保存: {output_file}")
def main():
input_bed = sys.argv[1]
output_dir = sys.argv[2]
print("📊 生成可视化图表...")
# 加载数据
df_raw = load_data(input_bed)
df_raw = classify_modifications(df_raw)
# 加载过滤后数据
filtered_files = {
'5mC_CG': f"{output_dir}/filtered/5mC_CG_filtered.bed",
'5mC_nonCG': f"{output_dir}/filtered/5mC_nonCG_filtered.bed",
'4mC_CG': f"{output_dir}/filtered/4mC_CG_filtered.bed",
'4mC_nonCG': f"{output_dir}/filtered/4mC_nonCG_filtered.bed"
}
# 合并过滤后的数据
df_list = []
for cat, fpath in filtered_files.items():
if os.path.exists(fpath) and os.path.getsize(fpath) > 0:
df_tmp = load_data(fpath)
df_tmp['mod_type'] = '5mC' if '5mC' in cat else '4mC'
df_tmp['motif_type'] = 'CG' if 'CG' in cat else 'non-CG'
df_tmp['category'] = cat
df_list.append(df_tmp)
if df_list:
df_filt = pd.concat(df_list, ignore_index=True)
else:
df_filt = df_raw.iloc[0:0] # 空DataFrame
# 生成图表
os.makedirs(f"{output_dir}/figures", exist_ok=True)
plot_counts_comparison(df_raw, df_filt, f"{output_dir}/figures/Fig1_counts.png")
plot_modification_distribution(df_filt, f"{output_dir}/figures/Fig2_mod_distribution.png")
plot_coverage_distribution(df_filt, f"{output_dir}/figures/Fig3_coverage.png")
generate_statistics_table(df_raw, df_filt, f"{output_dir}/figures/Fig4_statistics.png")
print("✅ 所有图表生成完成!")
if __name__ == "__main__":
if len(sys.argv) != 3:
print(f"用法: {sys.argv[0]} <input.bed> <output_dir>")
sys.exit(1)
main()
PYTHON_SCRIPT
chmod +x "$OUTPUT_DIR/generate_plots.py"
# 运行Python脚本
if command -v python3 &> /dev/null; then
log_info "运行Python可视化脚本..."
python3 "$OUTPUT_DIR/generate_plots.py" "$INPUT_BED" "$OUTPUT_DIR" 2>&1 | tee "$OUTPUT_DIR/logs/visualization.log"
else
log_warn "Python3未安装,跳过可视化生成"
fi
else
log_info "可视化已禁用"
fi
#-------------------------------------------------------------------------------
# 生成最终报告
#-------------------------------------------------------------------------------
log_step "生成分析总结报告"
cat > "$OUTPUT_DIR/reports/FINAL_REPORT.txt" << EOF
================================================================================
DNA甲基化分析总结报告
================================================================================
生成时间: $(date)
分析脚本: $0
📁 输入文件:
- BED文件: $INPUT_BED
- 参考基因组: $REF_GENOME
⚙️ 分析参数:
- 最小覆盖度: $MIN_COVERAGE
- 5mC修饰率阈值: ${MIN_MOD_5MC}%
- 4mC修饰率阈值: ${MIN_MOD_4MC}%
- 侧翼序列: ±${FLANK_SIZE}bp
- 线程数: $THREADS
📊 数据统计:
$(cat "$OUTPUT_DIR/reports/filtering_summary.txt")
📂 输出文件结构:
$OUTPUT_DIR/
├── filtered/ # 过滤后的bed文件
│ ├── 5mC_CG_filtered.bed
│ ├── 5mC_nonCG_filtered.bed
│ ├── 4mC_CG_filtered.bed
│ └── 4mC_nonCG_filtered.bed
├── sequences/ # 提取的fasta序列
├── motif_analysis/ # Motif富集分析结果
│ └── homer/
├── figures/ # 可视化图表
└── reports/ # 统计报告
🔬 主要发现:
- 5mC主要富集在: $(if [[ -f "$OUTPUT_DIR/motif_analysis/homer/5mC_nonCG/knownResults.txt" ]]; then head -2 "$OUTPUT_DIR/motif_analysis/homer/5mC_nonCG/knownResults.txt" | tail -1 | cut -f1; else "待分析"; fi)
- 4mC主要富集在: $(if [[ -f "$OUTPUT_DIR/motif_analysis/homer/4mC_nonCG/knownResults.txt" ]]; then head -2 "$OUTPUT_DIR/motif_analysis/homer/4mC_nonCG/knownResults.txt" | tail -1 | cut -f1; else "待分析"; fi)
================================================================================
分析完成!
================================================================================
EOF
log_info "最终报告: $OUTPUT_DIR/reports/FINAL_REPORT.txt"
cat "$OUTPUT_DIR/reports/FINAL_REPORT.txt"
#-------------------------------------------------------------------------------
# 清理临时文件
#-------------------------------------------------------------------------------
log_step "清理临时文件"
# 可选:删除大型中间文件
# rm -rf "$OUTPUT_DIR/sequences/"*_regions.bed
log_info "分析流程完成!✅"
log_info "查看最终报告: cat $OUTPUT_DIR/reports/FINAL_REPORT.txt"
exit 0
🚀 使用方法
1️⃣ 保存脚本
# 保存为 dna_methylation_analysis.sh
nano dna_methylation_analysis.sh
# 粘贴上面的代码,保存退出
# 赋予执行权限
chmod +x dna_methylation_analysis.sh
2️⃣ 运行分析
# 基本用法
bash dna_methylation_analysis.sh \
-i An6_4mC_5mC_raw.bed \
-r bacass_out/Medaka/An6-unicycler-medaka_polished_genome.fa \
-o An6_methylation_results
# 自定义参数
bash dna_methylation_analysis.sh \
-i An6_4mC_5mC_raw.bed \
-r genome.fa \
-o results \
-c 10 \ # 最小覆盖度
-m 70 \ # 5mC阈值
-n 50 \ # 4mC阈值
-f 50 \ # 侧翼序列
-t 16 # 线程数
# 禁用某些功能
bash dna_methylation_analysis.sh \
-i input.bed -r genome.fa \
--no-homer # 不运行HOMER
--no-viz # 不生成图表
3️⃣ 查看结果
# 查看最终报告
cat An6_methylation_results/reports/FINAL_REPORT.txt
# 查看图表
ls -lh An6_methylation_results/figures/
# 查看motif结果
ls -lh An6_methylation_results/motif_analysis/homer/
📦 依赖安装
# Conda环境(推荐)
conda create -n methylation python=3.9
conda activate methylation
# 必需工具
conda install -c bioconda bedtools samtools
# 可选:HOMER
conda install -c bioconda homer
# 可选:MEME Suite
conda install -c bioconda meme
# Python库
pip install pandas numpy matplotlib seaborn
📋 输出文件说明
An6_methylation_results/
├── filtered/ # 过滤后的bed文件
│ ├── 5mC_CG_filtered.bed # 5mC @ CG位点
│ ├── 5mC_nonCG_filtered.bed # 5mC @ non-CG位点
│ ├── 4mC_CG_filtered.bed # 4mC @ CG位点
│ └── 4mC_nonCG_filtered.bed # 4mC @ non-CG位点
├── sequences/ # 提取的序列
│ ├── *_sequences.fa # 目标序列
│ └── background_sequences.fa # 背景序列
├── motif_analysis/
│ └── homer/
│ ├── 5mC_CG/ # HOMER结果
│ ├── 5mC_nonCG/
│ ├── 4mC_CG/
│ └── 4mC_nonCG/
├── figures/ # 可视化图表
│ ├── Fig1_counts.png # 位点数量对比
│ ├── Fig2_mod_distribution.png # 修饰率分布
│ ├── Fig3_coverage.png # 覆盖度分布
│ └── Fig4_statistics.png # 统计表
└── reports/ # 统计报告
├── analysis_config.txt # 分析配置
├── filtering_summary.txt # 过滤统计
└── FINAL_REPORT.txt # 最终报告
需要我帮您调整任何参数或添加其他功能吗?🔧✨