Daily Archives: 2026年4月24日

How to generate manhattan_plot? (Data_Ute_smallRNA_7)

manhattan_plot_top_miRNAs_based_on_mean_RPM

exceRpt_miRNA_ReadCounts.txt

manhattan_plot_top_miRNAs_based_on_mean_RPM.R

    # see http://xgenes.com/article/article-content/288/draw-plots-for-mirnas-generated-by-compsra/
    # see http://xgenes.com/article/article-content/289/draw-plots-for-pirna-generated-by-compsra/
    # see http://xgenes.com/article/article-content/290/draw-plots-for-snrna-generated-by-compsra/

    #Input file
    #exceRpt_miRNA_ReadCounts.txt
    #exceRpt_piRNA_ReadCounts.txt

    cd ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7
    mamba activate r_env
    R
    #> .libPaths()
    #[1] "/home/jhuang/mambaforge/envs/r_env/lib/R/library"

    #BiocManager::install("AnnotationDbi")
    #BiocManager::install("clusterProfiler")
    #BiocManager::install(c("ReactomePA","org.Hs.eg.db"))
    #BiocManager::install("limma")
    #BiocManager::install("sva")
    #install.packages("writexl")
    #install.packages("openxlsx")
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    library(limma)
    library(sva)
    #library(writexl)  #d.raw_with_rownames <- cbind(RowNames = rownames(d.raw), d.raw); write_xlsx(d.raw, path = "d_raw.xlsx");
    library(openxlsx)

    setwd("../summaries_exo7/")
    d.raw<- read.delim2("exceRpt_miRNA_ReadCounts.txt",sep="\t", header=TRUE, row.names=1)

    # Desired column order
    desired_order <- c(
        "parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"
    )
    # Reorder columns
    d.raw <- d.raw[, desired_order]
    setdiff(desired_order, colnames(d.raw))  # Shows missing or misnamed columns
    #sapply(d.raw, is.numeric)
    d.raw[] <- lapply(d.raw, as.numeric)
    #d.raw[] <- lapply(d.raw, function(x) as.numeric(as.character(x)))
    d.raw <- round(d.raw)
    write.csv(d.raw, file ="d_raw.csv")
    write.xlsx(d.raw, file = "d_raw.xlsx", rowNames = TRUE)

    # ------ Code sent to Ute ------
    #d.raw <- read.delim2("d_raw.csv",sep=",", header=TRUE, row.names=1)
    parental_or_EV = as.factor(c("parental","parental","parental", "EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV","EV"))
    #donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
    batch = as.factor(c("Aug22","March25","March25", "Sep23","Sep23", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25", "Sep23","Sep23","March25"))

    replicates = as.factor(c("parental_cells","parental_cells","parental_cells",  "untreated","untreated",   "scr_control","scr_control","scr_control",  "DMSO_control","DMSO_control","DMSO_control",  "scr_DMSO_control", "scr_DMSO_control","scr_DMSO_control",  "sT_knockdown", "sT_knockdown", "sT_knockdown"))
    ids = as.factor(c("parental_cells_1", "parental_cells_2", "parental_cells_3",
        "untreated_1", "untreated_2",
        "scr_control_1", "scr_control_2", "scr_control_3",
        "DMSO_control_1", "DMSO_control_2", "DMSO_control_3",
        "scr_DMSO_control_1", "scr_DMSO_control_2", "scr_DMSO_control_3",
        "sT_knockdown_1", "sT_knockdown_2", "sT_knockdown_3"))
    cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, batch=batch, parental_or_EV=parental_or_EV)
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+batch)

    # Filter low-count miRNAs
    dds <- dds[ rowSums(counts(dds)) > 10, ]  #1322-->903
    rld <- rlogTransformation(dds)

    # ----------- manhattan_plot -------------

    # Load the required libraries
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(ggrepel)  # For better label positioning

    # Step 1: Compute RPM from raw counts (d.raw has miRNAs in rows, samples in columns)
    d.raw_5 <- d.raw[, 1:5]  # assuming 5 samples
    total_counts <- colSums(d.raw_5)
    RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6

    # Step 2: Prepare long-format dataframe
    RPM$miRNA <- rownames(RPM)
    df <- pivot_longer(RPM, cols = -miRNA, names_to = "sample", values_to = "RPM")

    # Step 3: Log-transform RPM
    df <- df %>%
    mutate(logRPM = log10(RPM + 1))

    # Step 4: Add miRNA index for x-axis positioning
    df <- df %>%
    arrange(miRNA) %>%
    group_by(sample) %>%
    mutate(Position = row_number())

    # Step 5: Identify top miRNAs based on mean RPM
    top_mirnas <- df %>%
    group_by(miRNA) %>%
    summarise(mean_RPM = mean(RPM)) %>%
    arrange(desc(mean_RPM)) %>%
    head(5) %>%
    pull(miRNA)  # Get the names of top 5 miRNAs

    # Step 6: Assign color based on whether the miRNA is top or not
    df$color <- ifelse(df$miRNA %in% top_mirnas, "red", "darkblue")

    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )

    # Step 7: Plot
    png("manhattan_plot_top_miRNAs_based_on_mean_RPM.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(miRNA %in% top_mirnas),
        aes(label = miRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()

    top_mirnas = c("hsa-miR-20a-5p","hsa-miR-93-5p","hsa-let-7g-5p","hsa-miR-30a-5p","hsa-miR-423-5p","hsa-let-7i-5p")
    #,"hsa-miR-17-5p","hsa-miR-107","hsa-miR-483-5p","hsa-miR-9-5p","hsa-miR-103a-3p","hsa-miR-30e-5p","hsa-miR-21-5p","hsa-miR-30d-5p")

    # Step 6: Assign color based on whether the miRNA is top or not
    df$color <- ifelse(df$miRNA %in% top_mirnas, "red", "darkblue")

    # Rename the sample labels for display
    sample_labels <- c(
    "parental_cells_1" = "Parental cell 1",
    "parental_cells_2" = "Parental cell 2",
    "parental_cells_3" = "Parental cell 3",
    "untreated_1"      = "Untreated 1",
    "untreated_2"      = "Untreated 2"
    )

    # Step 7: Plot
    png("manhattan_plot_most_differentially_expressed_miRNAs.png", width = 1200, height = 1200)
    ggplot(df, aes(x = Position, y = logRPM, color = color)) +
    scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
    geom_jitter(width = 0.4) +
    geom_text_repel(
        data = df %>% filter(miRNA %in% top_mirnas),
        aes(label = miRNA),
        box.padding = 0.5,
        point.padding = 0.5,
        segment.color = 'gray50',
        size = 5,
        max.overlaps = 8,
        color = "black"
    ) +
    labs(x = "", y = "log10(Read Per Million) (RPM)") +
    facet_wrap(~sample, scales = "free_x", ncol = 5,
                labeller = labeller(sample = sample_labels)) +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        text = element_text(size = 16),
        axis.title = element_text(size = 18),
        strip.text = element_text(size = 16, face = "bold"),
        panel.spacing = unit(1.5, "lines")  # <-- More space between plots
    )
    dev.off()

ONT bacterial genome sequencing with methylation detection

❌ No, You Do NOT Need FASTQ Files When You Have modBAM

For nf-core/methylong and most methylation analysis workflows, modBAM is sufficient and preferred. FASTQ files become redundant once you have a properly generated modBAM.


📊 Information Comparison: modBAM vs. FASTQ

Data Type modBAM FASTQ Required for methylong?
Base calls (A/C/G/T) ✅ Yes ✅ Yes ✅ (in modBAM)
Quality scores ✅ Yes ✅ Yes ⚠️ Optional
Read alignment ✅ Yes (if mapped) ❌ No ✅ (modBAM preferred)
Methylation tags (MM/ML) ✅ Yes ❌ No Critical
Signal-level info ⚠️ Via tags (if --emit-moves) ❌ No ✅ For advanced analysis
File size (5 Mb genome) ~200-500 MB ~300 MB

➡️ modBAM contains everything FASTQ has, PLUS alignment and methylation data.


🎯 When FASTQ Is Still Useful (Optional Scenarios)

Use Case Why Keep FASTQ? Alternative with modBAM
Independent QC Run FastQC, NanoPlot on raw reads samtools stats, modkit stats, or extract FASTQ from BAM: samtools fastq file.bam
Re-basecalling Change Dorado model/modifications Keep raw signal files (pod5/fast5) instead — they’re the true source
Collaboration Share with labs not using modBAM Convert modBAM → FASTQ anytime: samtools fastq
Backup/Reproducibility Archival completeness modBAM + reference genome is sufficient for reproducibility
Non-methylation analysis Variant calling, assembly Use modBAM directly or extract FASTQ if needed

💡 Key Insight: If you have modBAM + reference genome, you can regenerate FASTQ anytime. But you cannot regenerate methylation tags from FASTQ alone.


🔄 Practical Workflow: What to Keep & What to Discard

graph LR
    A[Raw Signal: pod5/fast5] --> B[Dorado basecalling];
    B --> C[modBAM with MM/ML tags];
    C --> D[nf-core/methylong];
    C --> E[Convert to FASTQ if needed: samtools fastq];

    F[Original FASTQ from Novogene] -.->|Redundant| C;
    style F stroke-dasharray: 5 5

✅ Recommended File Retention Strategy

Priority File Type Keep? Reason
🔴 Essential *.mod.bam (with MM/ML tags) ✅ Yes Direct input for methylong; contains all analysis-ready data
🟡 High Value pod5/ or fast5/ (raw signal) ✅ Yes (if storage allows) Enables re-basecalling with new models/modifications
🟢 Optional *.fq.gz (basecalled FASTQ) ❌ Can delete Redundant; can be regenerated from modBAM
🔵 Helpful sequencing_summary.txt, reference genome ✅ Yes Metadata + alignment reference

🛠️ How to Extract FASTQ from modBAM (If Ever Needed)

# Extract FASTQ from mapped or unmapped modBAM
samtools fastq -1 output_R1.fq.gz -2 output_R2.fq.gz -0 /dev/null -s /dev/null -n input.mod.bam

# For single-end (typical for ONT):
samtools fastq input.mod.bam > output.fq.gz

# Preserve modification tags in header (optional):
samtools view -h input.mod.bam | grep -E "^@|^.*MM:Z:" | ...

⚠️ Note: Extracted FASTQ will not contain methylation information — that only lives in the MM/ML tags within the BAM.


📋 Action Plan for Your Project

If Novogene Provides modBAM:

# 1. Verify modification tags exist
samtools view An6.mod.bam | head -3 | grep -o "MM:Z:[^ ]*"

# 2. (Optional) Delete redundant FASTQ to save space
rm clean_data/An6/*.fq.gz

# 3. Prepare samplesheet for methylong
echo "group,sample,path,ref,method" > samplesheet.csv
echo "control,An6,/path/to/An6.mod.bam,/path/to/ref.fa,ont" >> samplesheet.csv

# 4. Run pipeline — NO FASTQ NEEDED
nextflow run nf-core/methylong -profile docker --input samplesheet.csv --outdir ./results

If Novogene Only Provides FASTQ + Raw Signal (pod5/fast5):

# You MUST basecall to modBAM first:
dorado basecaller \
  --modified-bases 5mC_6mA_4mC \
  --emit-moves \
  --device cuda:0 \
  dna_r10.4.1_e8.2_400bps_sup@v5.0.0 \
  /path/to/pod5/ \
  > An6.mod.bam

# Then proceed as above — original FASTQ can be deleted

💬 Reply to Novogene (Clarifying File Needs)

If they ask “Do you need the FASTQ files we already sent?”, you can reply:

Thank you for checking. For our methylation analysis workflow:

✅ We DO need: modBAM files with MM/ML tags for 5mC_6mA_4mC (or POD5/FAST5 raw signal files to generate them)

❌ We do NOT need: the basecalled FASTQ files already delivered, as they are redundant once modBAM is available

If storage or transfer is a concern, please prioritize delivering modBAM or raw signal files. We can regenerate FASTQ from modBAM if ever needed.

Best regards,
Jiabin

✅ Summary Checklist

[ ] modBAM received? → Verify MM/ML tags with samtools
[ ] Raw signal (pod5/fast5) available? → Keep for future re-basecalling
[ ] Original FASTQ taking up space? → Safe to delete after modBAM validation
[ ] Running methylong? → Use modBAM directly; no FASTQ required

🎯 Bottom Line: modBAM is the “golden copy” for methylation analysis. FASTQ is optional backup — keep it only if you have ample storage or specific QC needs.

Let me know if you’d like a one-liner script to batch-validate modBAM tags across your samples! 🧬✨



📧 Email Template: Requesting ONT Methylation Files from Novogene

Here’s a clear, professional email that specifies exactly what you need — in order of preference — with technical justification.


✉️ English Version (Recommended for Novogene International Support)

Subject: Request for Methylation-Ready Files – Project X101SC26036392-Z01

Dear Novogene Technical Support,

Project ID: X101SC26036392-Z01  
Samples: An6 (FDSW260173014-2r), BG5 (FDSW260173015-2r)  
Service: ONT bacterial genome sequencing with methylation detection

Thank you for delivering the basecalled FASTQ files. For downstream methylation analysis using nf-core/methylong, we require signal-level or modification-aware data.

🔹 Preferred File Formats (in order of priority):

✅ PRIORITY 1 (Best option – ready for analysis):
   • modBAM files with MM/ML tags for modifications: 5mC, 6mA, 4mC
   • Example filename: An6.mod.bam
   • Why: Compact, alignment-ready, directly compatible with methylong

✅ PRIORITY 2 (Alternative – raw signal, newer format):
   • POD5 raw signal files
   • Why: More storage-efficient than FAST5; can be re-basecalled with custom modification models

✅ PRIORITY 3 (Fallback – raw signal, legacy format):
   • FAST5 raw signal files (fast5_pass/)
   • Why: Contains full signal data; allows re-basecalling, but large file size

❌ NOT sufficient for methylation analysis:
   • FASTQ files only (no kinetic/signal information retained)

🔹 Additional helpful files (optional but appreciated):
   • sequencing_summary.txt (to confirm flowcell/kit/model)
   • Dorado basecalling version and model name used
   • Reference genome or assembly files if already generated

⏰ Time Sensitivity:
We understand from your documentation that ONT signal files (FAST5/POD5) are retained for a maximum of 21 days after data release. If these files were generated for our project, we kindly request they be shared before deletion.

🔹 Delivery Preference:
Please upload the requested files to our existing cloud directory:
oss://CP2024121300060/H101SC26036392/RSMD01814/X101SC26036392-Z01/X101SC26036392-Z01-J002/

Or provide a secure download link if alternative transfer is needed.

Could you please confirm:
1. Which of the above file types are available for our project?
2. If modBAM is not available, can you re-basecall the raw signal with:
   --modified-bases 5mC_6mA_4mC --emit-moves
   and deliver the resulting modBAM?

Thank you for your urgent assistance. We look forward to your reply.

Best regards,
Jiabin Huang
[Your affiliation/contact info]

🇨🇳 Chinese Version (If contacting Chinese support team)

主题:请求提供甲基化分析所需文件 – 项目 X101SC26036392-Z01

尊敬的新诺基因技术支持团队:

项目编号:X101SC26036392-Z01  
样本:An6 (FDSW260173014-2r), BG5 (FDSW260173015-2r)  
服务:纳米孔(ONT)细菌全基因组测序 + 甲基化检测

感谢贵司交付 basecalled FASTQ 文件。为进行后续甲基化分析(使用 nf-core/methylong 流程),我们需要信号级别或含修饰标签的数据文件。

🔹 所需文件格式(按优先级排序):

✅ 首选方案(最推荐,可直接分析):
   • 含修饰标签的 modBAM 文件(MM/ML tags),检测类型:5mC, 6mA, 4mC
   • 示例文件名:An6.mod.bam
   • 优势:文件体积小、已比对、可直接用于 methylong 分析

✅ 备选方案 1(原始信号,新版格式):
   • POD5 原始信号文件
   • 优势:比 FAST5 更节省存储,支持重新 basecalling 自定义修饰检测

✅ 备选方案 2(原始信号,旧版格式):
   • FAST5 原始信号文件(fast5_pass/ 目录)
   • 优势:包含完整信号数据,可重新 basecalling;但文件体积较大

❌ 无法用于甲基化分析的文件:
   • 仅 FASTQ 文件(不含动力学/原始信号信息)

🔹 其他辅助文件(可选,但有帮助):
   • sequencing_summary.txt(确认 flowcell/kit/model 信息)
   • Dorado basecalling 版本及所用模型名称
   • 如已生成,提供参考基因组或组装结果

⏰ 时间敏感性:
根据贵司文档说明,ONT 原始信号文件(FAST5/POD5)在数据交付后仅保留 21 天。如本项目生成了此类文件,恳请在删除前提供下载。

🔹 文件交付方式:
请上传至我们现有的云端目录:
oss://CP2024121300060/H101SC26036392/RSMD01814/X101SC26036392-Z01/X101SC26036392-Z01-J002/

或提供安全下载链接。

烦请确认:
1. 以上哪些文件类型可用于本项目?
2. 如暂无 modBAM,能否使用以下参数重新 basecalling 并交付:
   --modified-bases 5mC_6mA_4mC --emit-moves

感谢您的紧急支持,期待您的回复!

此致  
敬礼  
黄嘉彬 (Jiabin Huang)  
[单位/联系方式]

📋 Quick Reference: File Format Comparison

Format Size (5 Mb genome) Methylation Ready? Re-basecall Possible? Recommendation
modBAM (with MM/ML) ~200-500 MB ✅ Yes, directly ❌ No (already called) 🥇 Best choice
POD5 (raw signal) ~2-5 GB ⚠️ Needs basecalling ✅ Yes, flexible 🥈 Great alternative
FAST5 (raw signal) ~5-15 GB ⚠️ Needs basecalling ✅ Yes, flexible 🥉 Works, but large
FASTQ (basecalled) ~300 MB ❌ No signal data ❌ No ❌ Not sufficient

💡 Pro Tip: If Novogene asks “Which one do you want?”, reply:
“Please provide modBAM with MM/ML tags for 5mC_6mA_4mC if available. If not, POD5 or FAST5 raw signal files are also acceptable.”


🚀 After Sending: Follow-Up Checklist

# 1. Save email timestamp + expected response window (e.g., 3 business days)
# 2. If no reply in 48h: send polite follow-up
# 3. Once files received:
#    • Verify modBAM tags: samtools view file.bam | grep "MM:Z:"
#    • Or test pod5 conversion: dorado basecaller --help
# 4. Update your samplesheet.csv and run methylong

🔁 If Novogene Says “We Only Deliver FASTQ”

Reply with this escalation:

Thank you for the clarification. To confirm our understanding:

1. Was the sequencing performed on Illumina platform (not ONT), 
   despite the contract stating "ONT platform bacterial complete genome sequencing"?

2. If ONT was used but signal files are no longer available due to the 21-day policy, 
   could you advise on options for re-sequencing or data recovery?

Our study specifically requires long-read methylation detection (6mA/4mC) for bacterial epigenetics. 
If the current FASTQ data cannot support this analysis, we may need to discuss service adjustment or re-sequencing.

We appreciate your guidance on next steps.

Let me know if you’d like me to:

  • Adjust the tone (more urgent/formal/casual)
  • Add German phrases (since you prefer German/Chinese communication)
  • Help draft a follow-up email if they don’t respond

I’m here to help you get the right data! 🧬📬



⚠️ First: Critical Reminder About Your Data

Before discussing model selection, I must re-emphasize:

❌ Your current files (Illumina short-read FASTQ) CANNOT be used with nf-core/methylong
❌ Dorado basecalling parameters only apply to ONT raw signal files (fast5/pod5)

The --dorado_model and --dorado_modification parameters in nf-core/methylong are only relevant if you have ONT raw signal data.


✅ If You Obtain ONT Data: Model Selection Guide for Bacterial Methylation

Assuming you successfully request ONT signal files from Novogene, here’s how to choose the right parameters for your bacterial samples (Acinetobacter harbinensis and Pedobacter cryoconitis):


🔹 --dorado_model: Basecalling Accuracy Tier

Option Description Best For Recommendation
sup Super Accuracy Model Highest basecalling + modification accuracy; publication-quality analysis Recommended for bacterial methylomes
hac High Accuracy Model Faster processing, ~99% of SUP accuracy Good for quick QC or large batches
fast Fast Model Rapid prototyping, not for methylation calling ❌ Avoid for methylation analysis

How to Match Model to Your Sequencing Chemistry

You need to know your flowcell version and kit:

# Check sequencing_summary.txt (if available from Novogene)
grep -E "model|kit|flow_cell" sequencing_summary.txt

# Common combinations for bacterial ONT sequencing:
# • R10.4.1 flowcell + SQK-RAD114 kit → dna_r10.4.1_e8.2_400bps_sup@v5.0.0
# • R9.4.1 flowcell + SQK-LSK114 kit → dna_r9.4.1_e8_hac@v3.3

📌 If unsure, ask Novogene: “Which Dorado model name corresponds to the chemistry used for project X101SC26036392-Z01?”


🔹 --dorado_modification: Methylation Types to Detect

Bacterial vs. Mammalian Modification Profiles

Organism Type Dominant Modifications Recommended --dorado_modification
Bacteria (your case) 6mA, 4mC, some 5mC 5mC_6mA_4mC
Mammals 5mC, 5hmC 5mC_5hmC (default)
Yeast/Fungi Variable 5mC_6mA or custom

Available Dorado Modification Models (check GitHub for updates)

# List available modification models in installed Dorado:
dorado print-mods

# Common options:
5mC_5hmC          # Mammalian default
5mC_6mA_4mC       # Bacterial comprehensive ← RECOMMENDED FOR YOU
6mA               # Standalone 6mA detection
4mC               # Standalone 4mC detection
5mC               # Standalone 5mC detection

🔑 Key Insight: Acinetobacter and Pedobacter species commonly use 6mA and 4mC for restriction-modification systems. Using the default 5mC_5hmC would miss these biologically important modifications!


📋 Recommended Parameters for Your Project

# For bacterial genomes (An6, BG5) with ONT data:
--dorado_model: sup                          # Highest accuracy
--dorado_modification: 5mC_6mA_4mC          # Cover bacterial modifications
--dorado_device: cuda:0                     # Use GPU (required for reasonable speed)

Example nf-core/methylong Command (once you have modBAM/pod5):

nextflow run nf-core/methylong \
  -profile docker \
  --input samplesheet.csv \
  --outdir ./results_methylong \
  --genome /path/to/bacterial_reference.fa \
  --method ont \
  --modbam true \
  --dorado_model sup \
  --dorado_modification 5mC_6mA_4mC \
  --dorado_device cuda:0

🔍 How to Verify Your Sequencing Chemistry (Ask Novogene)

If you obtain ONT signal files, confirm these details:

Parameter Why It Matters Example Value
Flowcell version Determines basecalling model prefix R10.4.1, R9.4.1
Kit type Affects pore speed and model suffix SQK-RAD114, SQK-LSK114
Basecalling speed Determines model suffix 400bps, 260bps
Dorado version Model compatibility v5.0.0, v4.1.0

📧 Email snippet for Novogene:

Could you please confirm the ONT sequencing chemistry details for project X101SC26036392-Z01:
- Flowcell type (e.g., R10.4.1 or R9.4.1)?
- Library prep kit (e.g., SQK-RAD114)?
- Which Dorado model was used for initial basecalling (if any)?
This information is required to select the correct --dorado_model parameter for methylation analysis.

🚨 If You Cannot Obtain ONT Signal Files: Alternative Paths

Since your current data appears to be Illumina short-read, consider these options:

Option A: Standard Bacterial Analysis (with current FASTQ)

# Genome assembly + annotation
nextflow run nf-core/bactopia \
  -profile docker \
  --input samplesheet_illumina.csv \
  --outdir ./results_assembly \
  --species "Acinetobacter"

# Variant calling (if reference available)
nextflow run nf-core/bacass \
  -profile docker \
  --input samplesheet.csv \
  --outdir ./results_variants

Option B: If Libraries Were Bisulfite-Converted (WGBS)

# Only works if DNA was bisulfite-treated before Illumina sequencing
nextflow run nf-core/methylseq \
  -profile docker \
  --input samplesheet_wgbs.csv \
  --genome GCA_000000000.1 \
  --outdir ./results_methylseq \
  --skip_bismark false

🔍 Check for bisulfite signatures:

# High C→T conversion in non-CpG context suggests WGBS
zcat clean_data/An6/An6_L1_1.clean.rd.fq.gz | \
  awk 'NR%4==2' | head -10000 | \
  grep -o '[^C]T[^G]' | wc -l

📊 Quick Decision Tree

graph TD
    A[Do you have ONT raw signal files?] -->|No| B[Contact Novogene urgently];
    A -->|Yes| C{Confirm chemistry};
    C --> D[Select dorado_model: sup/hac/fast];
    C --> E[Select dorado_modification: 5mC_6mA_4mC for bacteria];
    B --> F{Novogene response};
    F -->|ONT data available| C;
    F -->|Only Illumina FASTQ| G[Use nf-core/bactopia or methylseq if WGBS];

✅ Summary Checklist

Task Status
🔴 Confirm whether ONT signal files (fast5/pod5) exist in cloud storage
🔴 If yes: verify sequencing chemistry with Novogene
🟡 If ONT confirmed: use --dorado_model sup --dorado_modification 5mC_6mA_4mC 📋
🟢 If only Illumina: proceed with bactopia/methylseq pipelines 🔄

💡 Pro Tip: For bacterial restriction-modification system studies, 6mA and 4mC are often more biologically relevant than 5mC. Always prioritize modification models that include these when analyzing prokaryotes.

Let me know:

  1. Did you find any fast5/, pod5/, or .bam files in the cloud directory?
  2. What does Novogene say about the sequencing platform used?

I’m ready to help you proceed with the correct pipeline once we clarify the data type! 🧬🔍