Processing Data_Patricia_Transposon_2025 v2 (Workflow for Structural Variant Calling in Nanopore Sequencing)

Generate the HD46_Ctrol annotation

mamba activate trycycler
cd trycycler_HD46_Ctrl;
trycycler cluster --threads 55 --assemblies assemblies/*.fasta --reads reads.fastq --out_dir trycycler;

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
mv trycycler/cluster_001/1_contigs/J_ctg000010.fasta .
mv trycycler/cluster_001/1_contigs/L_tig00000016.fasta .
mv trycycler/cluster_001/1_contigs/R_tig00000001.fasta .
mv trycycler/cluster_001/1_contigs/H_utg000001c.fasta .

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_002
mv trycycler/cluster_002/1_contigs/*00000*.fasta .
Error: unable to find a suitable common sequence

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_003
mv trycycler/cluster_003/1_contigs/F_tig00000004.fasta .
mv trycycler/cluster_003/1_contigs/L_tig00000003.fasta .

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_004
mv trycycler/cluster_004/1_contigs/J_ctg000000.fasta .
mv trycycler/cluster_004/1_contigs/P_ctg000000.fasta .
mv trycycler/cluster_004/1_contigs/S_contig_2.fasta .

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_005
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_006
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_007
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_008

trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_009
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_010
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_011
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_012
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_013
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_014
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_015
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_016
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_017
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_018
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_019
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_020
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_021
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_022
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_023
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_024
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_025
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_026
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_027
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_028
trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_029

trycycler msa --threads 55 --cluster_dir trycycler/cluster_001
trycycler msa --threads 55 --cluster_dir trycycler/cluster_004

trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_001
trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_004

trycycler consensus --threads 55 --cluster_dir trycycler/cluster_001
trycycler consensus --threads 55 --cluster_dir trycycler/cluster_004

#Polish --> TODO: Need to be Debugged!
for c in trycycler/cluster_001 trycycler/cluster_004; do
    medaka_consensus -i "$c"/4_reads.fastq -d "$c"/7_final_consensus.fasta -o "$c"/medaka  -m r941_min_sup_g507 -t 12
    mv "$c"/medaka/consensus.fasta "$c"/8_medaka.fasta
    rm -r "$c"/medaka "$c"/*.fai "$c"/*.mmi  # clean up
done
# cat trycycler/cluster_*/8_medaka.fasta > trycycler/consensus.fasta

cp trycycler/cluster_001/7_final_consensus.fasta HD46_Ctrl_chr.fasta
cp trycycler/cluster_004/7_final_consensus.fasta HD46_Ctrl_plasmid.fasta

install mambaforge https://conda-forge.org/miniforge/ (recommended)

#download Mambaforge-24.9.2-0-Linux-x86_64.sh from website
chmod +x Mambaforge-24.9.2-0-Linux-x86_64.sh
./Mambaforge-24.9.2-0-Linux-x86_64.sh

To activate this environment, use:
    micromamba activate /home/jhuang/mambaforge
Or to execute a single command in this environment, use:
    micromamba run -p /home/jhuang/mambaforge mycommand
installation finished.

Do you wish to update your shell profile to automatically initialize conda?
This will activate conda on startup and change the command prompt when activated.
If you'd prefer that conda's base environment not be activated on startup,
  run the following command when conda is activated:

conda config --set auto_activate_base false

You can undo this by running `conda init --reverse $SHELL`? [yes|no]
[no] >>> yes
no change     /home/jhuang/mambaforge/condabin/conda
no change     /home/jhuang/mambaforge/bin/conda
no change     /home/jhuang/mambaforge/bin/conda-env
no change     /home/jhuang/mambaforge/bin/activate
no change     /home/jhuang/mambaforge/bin/deactivate
no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
modified      /home/jhuang/.bashrc
==> For changes to take effect, close and re-open your current shell. <==
no change     /home/jhuang/mambaforge/condabin/conda
no change     /home/jhuang/mambaforge/bin/conda
no change     /home/jhuang/mambaforge/bin/conda-env
no change     /home/jhuang/mambaforge/bin/activate
no change     /home/jhuang/mambaforge/bin/deactivate
no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
no change     /home/jhuang/.bashrc
No action taken.
WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
Added mamba to /home/jhuang/.bashrc
==> For changes to take effect, close and re-open your current shell. <==
Thank you for installing Mambaforge!

Close your terminal window and open a new one, or run:
#source ~/mambaforge/bin/activate
conda --version
mamba --version

https://github.com/conda-forge/miniforge/releases
Note

    * After installation, please make sure that you do not have the Anaconda default channels configured.
        conda config --show channels
        conda config --remove channels defaults
        conda config --add channels conda-forge
        conda config --show channels
        conda config --set channel_priority strict
        #conda clean --all
        conda config --remove channels biobakery

    * !!!!Do not install anything into the base environment as this might break your installation. See here for details.!!!!

# --Deprecated method: mamba installing on conda--
#conda install -n base --override-channels -c conda-forge mamba 'python_abi=*=*cp*'
#    * Note that installing mamba into any other environment than base is not supported.
#
#conda activate base
#conda install conda
#conda uninstall mamba
#conda install mamba

2: install required Tools on the mamba env

    * Sniffles2: Detect structural variants, including transposons, from long-read alignments.
    * RepeatModeler2: Identify and classify transposons de novo.
    * RepeatMasker: Annotate known transposable elements using transposon libraries.
    * SVIM: An alternative structural variant caller optimized for long-read sequencing, if needed.
    * SURVIVOR: Consolidate structural variants across samples for comparative analysis.

    mamba deactivate
    # Create a new conda environment
    mamba create -n transposon_long python=3.6 -y

    # Activate the environment
    mamba activate transposon_long

    mamba install -c bioconda sniffles
    mamba install -c bioconda repeatmodeler repeatmasker

    # configure repeatmasker database
    mamba info --envs
    cd /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker

    #mamba install python=3.6
    mamba install -c bioconda svim
    mamba install -c bioconda survivor

Test the installed tools

# Check versions
sniffles --version
RepeatModeler -h
RepeatMasker -h
svim --help
SURVIVOR --help
mamba install -c conda-forge perl r

Data Preparation

Raw Signal Data: Nanopore devices generate electrical signal data as DNA passes through the nanopore.
Basecalling: Tools like Guppy or Dorado are used to convert raw signals into nucleotide sequences (FASTQ files).

Preprocessing

Quality Filtering: Remove low-quality reads using tools like Filtlong or NanoFilt.
Adapter Trimming: Identify and remove sequencing adapters with tools like Porechop.

(Optional) Variant Calling for SNP and Indel Detection:

Tools like Medaka, Longshot, or Nanopolish analyze the aligned reads to identify SNPs and small indels.

(OFFICIAL STARTING POINT) Alignment and Structural Variant Calling: Tools such as Sniffles or SVIM detect large insertions, deletions, and other structural variants. 使用长读长测序工具如 SVIM 或 Sniffles 检测结构变异(e.g. 散在性重复序列)。

  #NOTE that the ./batch1_depth25/trycycler_WT/reads.fastq and F24A430001437_BACctmoD/BGI_result/Separate/${sample}/1.Cleandata/${sample}.filtered_reads.fq.gz are the same!

  # -- PREPARING the input fastq-data, merge the fastqz and move the top-directory

  # Under raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass
  zcat ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_0.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_1.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_2.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_3.fastq.gz ... | gzip > HD46_1.fastq.gz
  mv ./raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass/HD46_1.fastq.gz ~/DATA/Data_Patricia_Transposon_2025

    #this are the corresponding sample names:
    #barcode 1: HD46-1
    #barcode 2: HD46-2
    #barcode 3: HD46-3
    #barcode 4: HD46-4
    mv barcode01.fastq.gz HD46_1.fastq.gz
    mv barcode02.fastq.gz HD46_2.fastq.gz
    mv barcode03.fastq.gz HD46_3.fastq.gz
    mv barcode04.fastq.gz HD46_4.fastq.gz

  # -- CALCULATE the coverages
    #!/bin/bash

    for bam in barcode*_minimap2.sorted.bam; do
        echo "Processing $bam ..."
        avg_cov=$(samtools depth -a "$bam" | awk '{sum+=$3; cnt++} END {if (cnt>0) print sum/cnt; else print 0}')
        echo -e "${bam}\t${avg_cov}" >> coverage_summary.txt
    done

  # ---- !!!! LOGIN the suitable environment !!!! ----
  # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
  mamba activate transposon_long

  # -- TODO: AFTERNOON_DEBUG_THIS: FAILED and not_USED: Alignment and Detect structural variants in each sample using SVIM which used aligner ngmlr or mimimap2
  #mamba install -c bioconda ngmlr
  mamba install -c bioconda svim

  #SEARCH FOR "HD46_Ctrl_chr_plasmid.fasta" for finding the insertion-calling-commands
  # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! for all 4 options #
  # ---- Option_1: minimap2 (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----

  for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
      #INS,INV,DUP:TANDEM,DUP:INT,BND
      svim reads --aligner minimap2 --nanopore minimap2+svim_${sample}    ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 20 --types INS --min_sv_size 100 --sequence_allele --insertion_sequences --read_names;
  done

  #svim alignment svim_alignment_minmap2_1_re 1.sorted.bam CP020463_.fasta --types INS --sequence_alleles --insertion_sequences --read_names

  # ---- Option_2: minamap2 (aligner) + Sniffles2 (structural variant caller) --> SUCCESSFUL ----
  #Minimap2: A commonly used aligner for nanopore sequencing data.
  #    Align Long Reads to the WT Reference using Minimap2
  #sniffles -m WT.sorted.bam -v WT.vcf -s 10 -l 50 -t 60
  #  -s 20: Requires at least 20 reads to support an SV for reporting. --> 10
  #  -l 50: Reports SVs that are at least 50 base pairs long.
  #  -t 60: Uses 60 threads for faster processing.
  for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
      #minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ./batch1_depth25/trycycler_${sample}/reads.fastq | samtools sort -o ${sample}.sorted.bam
      minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ${sample}.fastq.gz | samtools sort -o ${sample}_minimap2.sorted.bam
      samtools index ${sample}_minimap2.sorted.bam
      sniffles -m ${sample}_minimap2.sorted.bam -v ${sample}_minimap2+sniffles.vcf -s 10 -l 50 -t 60
      #QUAL < 20 ||
      bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_minimap2+sniffles.vcf > ${sample}_minimap2+sniffles_filtered.vcf
  done

    #Estimating parameter...
    #        Max dist between aln events: 44
    #        Max diff in window: 76
    #        Min score ratio: 2
    #        Avg DEL ratio: 0.0112045
    #        Avg INS ratio: 0.0364027
    #Start parsing... CP020463
    #                # Processed reads: 10000
    #                # Processed reads: 20000
    #        Finalizing  ..
    #Start genotype calling:
    #        Reopening Bam file for parsing coverage
    #        Finalizing  ..
    #Estimating parameter...
    #        Max dist between aln events: 28
    #        Max diff in window: 89
    #        Min score ratio: 2
    #        Avg DEL ratio: 0.013754
    #        Avg INS ratio: 0.17393
    #Start parsing... CP020463
    #                # Processed reads: 10000
    #                # Processed reads: 20000
    #                # Processed reads: 30000
    #                # Processed reads: 40000

    # Results:
    # * barcode01_minimap2+sniffles.vcf
    # * barcode01_minimap2+sniffles_filtered.vcf
    # * barcode02_minimap2+sniffles.vcf
    # * barcode02_minimap2+sniffles_filtered.vcf
    # * barcode03_minimap2+sniffles.vcf
    # * barcode03_minimap2+sniffles_filtered.vcf
    # * barcode04_minimap2+sniffles.vcf
    # * barcode04_minimap2+sniffles_filtered.vcf

  #ERROR: No MD string detected! Check bam file! Otherwise generate using e.g. samtools. --> No results!
  #for sample in barcode01 barcode02 barcode03 barcode04; do
  #    sniffles -m svim_reads_minimap2_${sample}/${sample}.fastq.minimap2.coordsorted.bam -v sniffles_minimap2_${sample}.vcf -s 10 -l 50 -t 60
  #    bcftools filter -e "INFO/SVTYPE != 'INS'" sniffles_minimap2_${sample}.vcf > sniffles_minimap2_${sample}_filtered.vcf
  #done

  # ---- Option_3: NGMLR (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----
  for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
      svim reads --aligner ngmlr --nanopore    ngmlr+svim_${sample}       ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 10;
  done

  # ---- Option_4: NGMLR (aligner) + sniffles (structural variant caller) --> SUCCESSFUL ----
  for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
      sniffles -m ngmlr+svim_${sample}/${sample}.fastq.ngmlr.coordsorted.bam -v ${sample}_ngmlr+sniffles.vcf -s 10 -l 50 -t 60
      bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_ngmlr+sniffles.vcf > ${sample}_ngmlr+sniffles_filtered.vcf
  done

  #END

Compare and integrate all results produced by minimap2+sniffles and ngmlr+sniffles, and check them each position in IGV!

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
mv HD46_1_minimap2+sniffles_filtered.vcf    HD46-1_minimap2+sniffles_filtered.vcf
mv HD46_1_ngmlr+sniffles_filtered.vcf       HD46-1_ngmlr+sniffles_filtered.vcf
mv HD46_2_minimap2+sniffles_filtered.vcf    HD46-2_minimap2+sniffles_filtered.vcf
mv HD46_2_ngmlr+sniffles_filtered.vcf       HD46-2_ngmlr+sniffles_filtered.vcf
mv HD46_3_minimap2+sniffles_filtered.vcf    HD46-3_minimap2+sniffles_filtered.vcf
mv HD46_3_ngmlr+sniffles_filtered.vcf       HD46-3_ngmlr+sniffles_filtered.vcf
mv HD46_4_minimap2+sniffles_filtered.vcf    HD46-4_minimap2+sniffles_filtered.vcf
mv HD46_4_ngmlr+sniffles_filtered.vcf       HD46-4_ngmlr+sniffles_filtered.vcf
mv HD46_5_minimap2+sniffles_filtered.vcf    HD46-5_minimap2+sniffles_filtered.vcf
mv HD46_5_ngmlr+sniffles_filtered.vcf       HD46-5_ngmlr+sniffles_filtered.vcf
mv HD46_6_minimap2+sniffles_filtered.vcf    HD46-6_minimap2+sniffles_filtered.vcf
mv HD46_6_ngmlr+sniffles_filtered.vcf       HD46-6_ngmlr+sniffles_filtered.vcf
mv HD46_7_minimap2+sniffles_filtered.vcf    HD46-7_minimap2+sniffles_filtered.vcf
mv HD46_7_ngmlr+sniffles_filtered.vcf       HD46-7_ngmlr+sniffles_filtered.vcf
mv HD46_8_minimap2+sniffles_filtered.vcf    HD46-8_minimap2+sniffles_filtered.vcf
mv HD46_8_ngmlr+sniffles_filtered.vcf       HD46-8_ngmlr+sniffles_filtered.vcf
mv HD46_13_minimap2+sniffles_filtered.vcf   HD46-13_minimap2+sniffles_filtered.vcf
mv HD46_13_ngmlr+sniffles_filtered.vcf      HD46-13_ngmlr+sniffles_filtered.vcf

(NOT_USED) Filtering low-complexity insertions using RepeatMasker (TODO: how to use RepeatModeler to generate own lib?)
```
  python vcf_to_fasta.py variants.vcf variants.fasta
  #python filter_low_complexity.py variants.fasta filtered_variants.fasta retained_variants.fasta
  #Using RepeatMasker to filter the low-complexity fasta, the used h5 lib is
  /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5    #1.9G
  python /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/famdb.py -i /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5 names 'bacteria' | head
  Exact Matches
  =============
  2 bacteria (blast name), Bacteria 
```
(scientific name), eubacteria (genbank common name), Monera (in-part), Procaryotae (in-part), Prokaryota (in-part), Prokaryotae (in-part), prokaryote (in-part), prokaryotes (in-part) Non-exact Matches ================= 1783272 Terrabacteria group (scientific name) 91061 Bacilli (scientific name), Bacilli Ludwig et al. 2010 (authority), Bacillus/Lactobacillus/Streptococcus group (synonym), Firmibacteria (synonym), Firmibacteria Murray 1988 (authority) 1239 Bacillaeota (synonym), Bacillaeota Oren et al. 2015 (authority), Bacillota (synonym), Bacillus/Clostridium group (synonym), clostridial firmicutes (synonym), Clostridium group firmicutes (synonym), Firmacutes (synonym), firmicutes (blast name), Firmicutes (scientific name), Firmicutes corrig. Gibbons and Murray 1978 (authority), Low G+C firmicutes (synonym), low G+C Gram-positive bacteria (common name), low GC Gram+ (common name) Summary of Classes within Firmicutes: * Bacilli (includes many common pathogenic and non-pathogenic Gram-positive bacteria, taxid=91061) * Bacillus (e.g., Bacillus subtilis, Bacillus anthracis) * Staphylococcus (e.g., Staphylococcus aureus, Staphylococcus epidermidis) * Streptococcus (e.g., Streptococcus pneumoniae, Streptococcus pyogenes) * Listeria (e.g., Listeria monocytogenes) * Clostridia (includes many anaerobic species like Clostridium and Clostridioides) * Erysipelotrichia (intestinal bacteria, some pathogenic) * Tissierellia (less-studied, veterinary relevance) * Mollicutes (cell wall-less, includes Mycoplasma species) * Negativicutes (includes some Gram-negative, anaerobic species) RepeatMasker -species Bacilli -pa 4 -xsmall variants.fasta python extract_unmasked_seq.py variants.fasta.masked unmasked_variants.fasta #bcftools filter -i ‘QUAL>30 && INFO/SVLEN>100’ variants.vcf -o filtered.vcf # #bcftools view -i ‘SVTYPE=”INS”‘ variants.vcf | bcftools query -f ‘%CHROM\t%POS\t%REF\t%ALT\t%INFO\n’ > insertions.txt #mamba install -c bioconda vcf2fasta #vcf2fasta variants.vcf -o insertions.fasta #grep “SEQS” variants.vcf | awk ‘{ print $1, $2, $4, $5, $8 }’ > insertions.txt #python3 filtering_low_complexity.py # #vcftools –vcf input.vcf –recode –out filtered_output –minSVLEN 100 #bcftools filter -e ‘INFO/SEQS ~ “^(G+|C+|T+|A+){4,}”‘ variants.vcf -o filtered.vcf # — calculate the percentage of reads To calculate the percentage of reads that contain the insertion from the VCF entry, use the INFO and FORMAT fields provided in the VCF record. Step 1: Extract Relevant Information In the provided VCF entry: RE (Reads Evidence): 733 – the total number of reads supporting the insertion. GT (Genotype): 1/1 – this indicates a homozygous insertion, meaning all reads covering this region are expected to have the insertion. AF (Allele Frequency): 1 – a 100% allele frequency, indicating that every read in this sample supports the insertion. DR (Depth Reference): 0 – the number of reads supporting the reference allele. DV (Depth Variant): 733 – the number of reads supporting the variant allele (insertion). Step 2: Calculate Percentage of Reads Supporting the Insertion Using the formula: Percentage of reads with insertion=(DVDR+DV)×100 Percentage of reads with insertion=(DR+DVDV)×100 Substitute the values: Percentage=(7330+733)×100=100% Percentage=(0+733733)×100=100% Conclusion Based on the VCF record, 100% of the reads support the insertion, indicating that the insertion is fully present in the sample (homozygous insertion). This is consistent with the AF=1 and GT=1/1 fields. * In your VCF file generated by Sniffles, the REF=N in the results has a specific meaning: * In a standard VCF, the REF field usually contains the reference base(s) at the variant position. * For structural variants (SVs), especially insertions, there is no reference sequence replaced; the insertion occurs between reference bases. * Therefore, Sniffles uses N as a placeholder in the REF field to indicate “no reference base replaced”. * The actual inserted sequence is then stored in the ALT field.

Why some records have UNRESOLVED in the FILTER field in the Excel output.

1. Understanding the format

    The data appears to be structural variant (SV) calls from Sniffles, probably in a VCF-like tabular format exported to Excel:

        * gi|1176884116|gb|CP020463.1| → reference sequence
        * Positions: 1855752 and 2422820
        * N → insertion event
        * SVLEN=999 → size of the insertion
        * AF → allele frequency
        * GT:DR:DV → genotype, depth reference, depth variant (1/1:0:678, example values for a PASS variant)
        * FILTER → whether the variant passed filters (UNRESOLVED means it didn’t pass)

2. What UNRESOLVED usually means

    In Sniffles:

    * UNRESOLVED is assigned to SVs when the tool cannot confidently resolve the exact sequence or breakpoint.
    * Reasons include:
        - Low read support (RE, DV) relative to the expected coverage
        - Ambiguous alignment at repetitive regions
        - Conflicting strand or orientation signals
        - Allele frequency inconsistent with expectations

3. Examine your two records

    First record

        POS: 1855752
        SVTYPE: INS
        SVLEN: 999
        RE: 68
        AF: 1
        GT: 1/1
        FILTER: UNRESOLVED

    Observations:

    * AF = 1 → allele frequency 100%, homozygous insertion
    * RE = 68 → 68 reads support the variant, decent coverage
    * Still UNRESOLVED → likely because Sniffles could not resolve the inserted sequence precisely; sometimes long insertions in repetitive regions are hard to reconstruct fully even with good read support.

    Second record

        POS: 2422820
        SVTYPE: INS
        SVLEN: 999
        RE: 22
        AF: 0.025522
        GT: 0/0
        FILTER: UNRESOLVED

    Observations:

    * AF = 0.0255 → very low allele frequency (~2.5%)
    * RE = 22, DR = 840 → very low variant reads vs reference
    * GT = 0/0 → homozygous reference
    * Sniffles marks it UNRESOLVED because the variant is essentially noise, not confidently detected.

4. Key difference between the two
    Feature First record    Second record
    Allele frequency (AF)   1 (high)    0.0255 (very low)
    Variant reads (RE)  68  22
    Genotype (GT)   1/1 0/0
    Reason for UNRESOLVED   Unresolvable inserted sequence

✅ 5. Conclusion

    * Sniffles marks a variant as UNRESOLVED when the SV cannot be confidently characterized.
    * Even if there is good read support (first record), complex insertions can’t always be reconstructed fully.
    * Very low allele frequency (second record) also triggers UNRESOLVED because the signal is too weak compared to background noise.
    * Essentially: “UNRESOLVED” ≠ bad data, it’s just unresolved uncertainty.

(NOT_SURE_HOW_TO_USE) Polishing of assembly: Use tools like Medaka to refine variant calls by leveraging consensus sequences derived from nanopore data.
```
  mamba install -c bioconda medaka
  medaka-consensus -i aligned_reads.bam -r reference.fasta -o polished_output -t 4
```

Compare Insertions Across Samples

Merge Variants Across Samples: Use SURVIVOR to merge and compare the detected insertions in all samples against the WT:

SURVIVOR merge input_vcfs.txt 1000 1 1 1 0 30 merged.vcf

    Input: List of VCF files from Sniffles2.
    Output: A consolidated VCF file with shared and unique variants.

Filter WT Insertions:

    Identify transposons present only in samples 1–9 by subtracting WT variants using bcftools:

        bcftools isec WT.vcf merged.vcf -p comparison_results

Validate and Visualize

Visualize with IGV: Use IGV to inspect insertion sites in the alignment and confirm quality.

igv.sh

Validate Findings:
    Perform PCR or additional sequencing for key transposon insertion sites to confirm results.

Alternatives to TEPID for Long-Read Data

If you’re looking for transposon-specific tools for long reads:

    REPET: A robust transposon annotation tool compatible with assembled genomes.
    EDTA (Extensive de novo TE Annotator):
        A pipeline to identify, classify, and annotate transposons.
        Works directly on your assembled genomes.

        perl EDTA.pl --genome WT.fasta --type all

The WT.vcf file in the pipeline is generated by detecting structural variants (SVs) in the wild-type (WT) genome aligned against itself or using it as a baseline reference. Here’s how you can generate the WT.vcf:

Steps to Generate WT.vcf
1. Align WT Reads to the WT Reference Genome

The goal here is to create an alignment of the WT sequencing data to the WT reference genome to detect any self-contained structural variations, such as native insertions, deletions, or duplications.

Command using Minimap2:

minimap2 -ax map-ont WT.fasta WT_reads.fastq | samtools sort -o WT.sorted.bam

Index the BAM file:

samtools index WT.sorted.bam

2. Detect Structural Variants with Sniffles2

Run Sniffles2 on the WT alignment to call structural variants:

sniffles --input WT.sorted.bam --vcf WT.vcf

This step identifies:

    Native transposons and insertions present in the WT genome.
    Other structural variants that are part of the reference genome or sequencing artifacts.

Key parameters to consider:

    --min_support: Adjust based on your WT sequencing coverage.
    --max_distance: Define proximity for merging variants.
    --min_length: Set a minimum SV size (e.g., >50 bp for transposons).

Clean and Filter the WT.vcf, Variant Filtering: Remove low-confidence variants based on read depth, quality scores, or allele frequency.

To ensure the WT.vcf only includes relevant transposons or SVs:

    Use bcftools or similar tools to filter out low-confidence variants:

    bcftools filter -e "QUAL < 20 || INFO/SVTYPE != 'INS'" WT.vcf > WT_filtered.vcf
    bcftools filter -e "QUAL < 1 || INFO/SVTYPE != 'INS'" 1_.vcf > 1_filtered_.vcf

NOTE that in this pipeline, the WT.fasta (reference genome) is typically a high-quality genome sequence from a database or a well-annotated version of your species’ genome. It is not assembled from the WT.fastq sequencing reads in this context. Here’s why:

Why Use a Reference Genome (WT.fasta) from a Database?

    Higher Quality and Completeness:
        Database references (e.g., NCBI, Ensembl) are typically well-assembled, highly polished, and annotated. They serve as a reliable baseline for variant detection.

    Consistency:
        Using a standard reference ensures consistent comparisons across your WT and samples (1–9). Variants detected will be relative to this reference, not influenced by possible assembly errors.

    Saves Time:
        Assembling a reference genome from WT reads requires significant computational effort. Using an existing reference streamlines the analysis.

Alternative: Assembling WT from FASTQ

If you don’t have a high-quality reference genome (WT.fasta) and must rely on your WT FASTQ reads:

    Assemble the genome from your WT.fastq:
        Use long-read assemblers like Flye, Canu, or Shasta to create a draft genome.

    flye --nano-raw WT.fastq --out-dir WT_assembly --genome-size

Polish the assembly using tools like Racon (with the same reads) or Medaka for higher accuracy. Use the assembled and polished genome as your WT.fasta reference for further steps. Key Takeaways: If you have access to a reliable, high-quality reference genome, use it as the WT.fasta. Only assemble WT.fasta from raw reads (WT.fastq) if no database reference is available for your organism.

Annotate Transposable Elements: Tools like ANNOVAR or SnpEff provide functional insights into the detected variants.

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
#Using snpEff to annotate the insertion!
conda activate /home/jhuang/miniconda3/envs/spandx

# --> BUG:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT
    02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION
VERSION

# --> DEBUG: adapt the genbank-file header as follows:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT 02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION   HD46_Ctrl
VERSION     HD46_Ctrl.1
DBLINK      BioProject: PRJNA1337321
            BioSample: SAMN52215988
KEYWORDS    .
SOURCE      Staphylococcus epidermidis
  ORGANISM  Staphylococcus epidermidis
            Bacteria; Firmicutes; Bacilli; Bacillales; Staphylococcaceae;
            Staphylococcus.
COMMENT     Annotated genome for HD46_Ctrl.
...

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl
    cp HD46_Ctrl_chr.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl/genes.gbk

    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config  #HD46_Ctrl.genome : HD46_Ctrl
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build -genbank HD46_Ctrl      -d

    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-8_ngmlr+sniffles_filtered.vcf
    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-13_ngmlr+sniffles_filtered.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-8_ngmlr+sniffles_filtered.vcf > HD46-8_ngmlr+sniffles_filtered.annotated.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-13_ngmlr+sniffles_filtered.vcf > HD46-13_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-8
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-8_snpeff_stats.html \
    HD46_Ctrl \
    HD46-8_ngmlr+sniffles_filtered.vcf \
    > HD46-8_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-13
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-13_snpeff_stats.html \
    HD46_Ctrl \
    HD46-13_ngmlr+sniffles_filtered.vcf \
    > HD46-13_ngmlr+sniffles_filtered.annotated.vcf

Summarize the results as a Excel-file

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
conda activate plot-numpy1
#python generate_common_vcf.py
#mv common_variants.xlsx putative_transposons.xlsx

# * Reads each of your VCFs.
# * Filters variants → only keep those with FILTER == PASS.
# * Compares the two aligner methods (minimap2+sniffles2 vs ngmlr+sniffles2) per sample.
# * Keeps only variants that appear in both methods for the same sample.
# * Outputs: An Excel file with the common variants and a log text file listing which variants were filtered out, and why (not_PASS or not_COMMON_in_two_VCF).

#python generate_fuzzy_common_vcf_v1.py
#Sample PASS_minimap2   PASS_ngmlr  COMMON
#  HD46-Ctrl_Ctrl   39  29  28
#  HD46-1   39  32  29
#  HD46-2   40  32  28
#  HD46-3   38  30  27
#  HD46-4   46  35  32
#  HD46-5   40  35  31
#  HD46-6   43  35  30
#  HD46-7   40  33  28
#  HD46-8   37  20  11
#  HD46-13  39  38  27

#Sample PASS_minimap2   PASS_ngmlr  COMMON_FINAL
#HD46-Ctrl_Ctrl 39  29  6
#HD46-1 39  32  8
#HD46-2 40  32  8
#HD46-3 38  30  6
#HD46-4 46  35  8
#HD46-5 40  35  9
#HD46-6 43  35  10
#HD46-7 40  33  8
#HD46-8 37  20  4
#HD46-13    39  38  5

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
#!!!! Summarize the results of ngmlr+sniffles !!!!
python merge_ngmlr+sniffles_filtered_results_and_summarize.py

#!!!! Post-Processing !!!!
#DELETE "2186168    N

. PASS” in Sheet HD46-13 and Summary #DELETE “2427785 N CGTCAGAATCGCTGTCTGCGTCCGAGTCACTGTCTGAGTCTGAATCACTATCTGCGTCTGAGTCACTGTCTG . PASS” due to “0/1:169:117” in HD46-13 and Summary #DELETE “2441640 N GCTCATTAAGAATCATTAAATTAC . PASS” due to 0/1:170:152 in HD46-13 and Summary

Source code of merge_ngmlr+sniffles_filtered_results_and_summarize.py

python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Add SnpEff ANN columns (for SVTYPE=INS) from annotated VCFs into an Excel workbook,
with detailed debug about why CHROM+POS may not match.

Key improvements:
- Stronger CHROM/POS normalization (strip 'chr', unify MT naming, coerce numbers).
- Explicit detection and logging of sheet key columns used.
- Debug block prints:
* Unique key counts in sheet vs VCF (before/after normalization)
* Example non-matching keys from the sheet and from the VCF (top N)
* Chromosome naming diagnostics (e.g., 'chr' presence, 'MT'/'M' harmonization)
* Off-by-N diagnostics via --pos_tolerance (counts for would-match @ ±N)
* Optional preview of the sheet's SV type column, if present
- Safer ANN parsing and aggregation.
- Command-line options: --debug_examples, --pos_tolerance

ANN filling is done only for **exact** (CHROM, POS) equality (as before).
Tolerance is used only for *diagnostics*, not for filling, to avoid incorrect merges.
"""

import argparse
import gzip
import io
import re
from pathlib import Path
from typing import List, Tuple, Dict, Iterable, Set

import pandas as pd

FALLBACK_ANN_FIELDS: List[str] = [
    'Allele','Annotation','Annotation_Impact','Gene_Name','Gene_ID',
    'Feature_Type','Feature_ID','Transcript_BioType','Rank','HGVS.c',
    'HGVS.p','cDNA.pos/cDNA.length','CDS.pos/CDS.length','AA.pos/AA.length',
    'Distance','Errors_Warnings_Info'
]

def open_text_maybe_gzip(path: Path):
    if str(path).endswith('.gz'):
        return io.TextIOWrapper(gzip.open(path, 'rb'), encoding='utf-8', errors='ignore')
    return open(path, 'r', encoding='utf-8', errors='ignore')

def normalize_chrom(col: pd.Series) -> pd.Series:
    s = col.astype(str).str.strip()
    s = s.str.replace(r'^(chr|CHR)', '', regex=True)
    # Standardize mitochondrial names to "MT"
    s = s.str.replace(r'^(M|MtDNA|MTDNA|Mito|Mitochondrion)$', 'MT', regex=True, case=False)
    return s.str.upper()

def normalize_pos(col: pd.Series) -> pd.Series:
    # Excel can make ints look like floats; coerce then Int64
    # (We keep Int64 nullable for robustness; we never compare NaNs.)
    s = pd.to_numeric(col, errors='coerce')
    # If people had 0-based starts in the sheet (rare for INS), this won't fix it,
    # but the tolerance debug will reveal a +1 shift if present.
    return s.astype('Int64')

def parse_vcf_ann(vcf_path: Path) -> Tuple[pd.DataFrame, List[str]]:
    ann_fields = None
    header_cols = None
    records = []

    with open_text_maybe_gzip(vcf_path) as f:
        for line in f:
            if line.startswith('##INFO=<ID=ANN'):
                m = re.search(r'Format:\s*([^">]+)', line)
                if m:
                    ann_fields = [s.strip() for s in m.group(1).split('|')]
            if line.startswith('#CHROM'):
                header_cols = line.strip().lstrip('#').split('\t')
                break

        if not header_cols:
            raise RuntimeError(f"Could not find VCF header line (#CHROM ...) in {vcf_path}")

        if not ann_fields:
            ann_fields = FALLBACK_ANN_FIELDS

        ann_cols = [f'ANN_{x}' for x in ann_fields]

        for line in f:
            if not line or line[0] == '#':
                continue
            parts = line.rstrip('\n').split('\t')
            if len(parts) < len(header_cols):
                continue
            row = dict(zip(header_cols, parts))
            info = row.get('INFO', '')

            # Only INS
            if not re.search(r'(?:^|;)SVTYPE=INS(?:;|$)', info):
                continue

            chrom = row.get('#CHROM') or row.get('CHROM')
            pos_str = row.get('POS')
            try:
                pos = int(pos_str)
            except Exception:
                continue

            # Extract ANN entries
            ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
            ann_entries = ann_match.group(1).split(',') if ann_match else []

            field_values: Dict[str, List[str]] = {k: [] for k in ann_fields}
            for ann in ann_entries:
                items = ann.split('|')
                if len(items) < len(ann_fields):
                    items += [''] * (len(ann_fields) - len(items))
                elif len(items) > len(ann_fields):
                    items = items[:len(ann_fields)]
                for k, v in zip(ann_fields, items):
                    field_values[k].append(v)

            joined = {f'ANN_{k}': (';'.join(v) if v else '') for k, v in field_values.items()}
            records.append({'CHROM': chrom, 'POS': pos, **joined})

    df = pd.DataFrame.from_records(records)
    if not df.empty:
        df['POS'] = pd.to_numeric(df['POS'], errors='coerce').astype('Int64')
        df['CHROM'] = normalize_chrom(df['CHROM'])
    return df, ann_cols

def detect_key_columns(df: pd.DataFrame) -> Dict[str, str]:
    chrom_candidates = ['CHROM', '#CHROM', 'Chrom', 'Chromosome', 'chrom', 'chr', 'Chr']
    pos_candidates   = ['POS', 'Position', 'position', 'pos', 'Start', 'start']
    mapping = {}
    for c in chrom_candidates:
        if c in df.columns:
            mapping['CHROM'] = c
            break
    for p in pos_candidates:
        if p in df.columns:
            mapping['POS'] = p
            break
    return mapping

def normalize_chrom_pos_df(df: pd.DataFrame, keys: Dict[str, str]) -> pd.DataFrame:
    out = df.copy()
    out[keys['CHROM']] = normalize_chrom(out[keys['CHROM']])
    out[keys['POS']]   = normalize_pos(out[keys['POS']])
    return out

def summarize_chr_formats(series: pd.Series, label: str):
    raw = series.astype(str)
    has_chr_prefix = raw.str.startswith(('chr','CHR')).sum()
    mt_like = raw.str.fullmatch(r'(M|MtDNA|MTDNA|Mito|Mitochondrion)', case=False).sum()
    print(f"[{label}] CHROM diagnostics:")
    print(f"  total rows: {len(raw)}")
    print(f"  with 'chr'/'CHR' prefix: {has_chr_prefix}")
    print(f"  mitochondrial names like M/MtDNA/etc: {mt_like}")

def keys_set(df: pd.DataFrame, chrom_col: str, pos_col: str) -> Set[Tuple[str, int]]:
    # Drop NA POS, NA CHROM
    sub = df[[chrom_col, pos_col]].dropna()
    # Ensure ints (drop NA after coercion)
    sub = sub[(sub[pos_col].astype('Int64').notna())]
    return set(zip(sub[chrom_col].astype(str), sub[pos_col].astype('int64')))

def tolerance_match_count(sheet_keys: Iterable[Tuple[str,int]],
                        vcf_keys: Set[Tuple[str,int]],
                        tol: int) -> int:
    if tol <= 0:
        return sum(1 for k in sheet_keys if k in vcf_keys)
    cnt = 0
    for chrom, pos in sheet_keys:
        if (chrom, pos) in vcf_keys:
            cnt += 1
        else:
            matched = False
            # check +/- 1..tol
            for d in range(1, tol+1):
                if (chrom, pos - d) in vcf_keys or (chrom, pos + d) in vcf_keys:
                    matched = True
                    break
            if matched:
                cnt += 1
    return cnt

def debug_match_report(df_sheet: pd.DataFrame,
                    vcf_df: pd.DataFrame,
                    keys: Dict[str, str],
                    debug_examples: int = 15,
                    pos_tolerance: int = 1):
    print("\n=== DEBUG: Matching overview ===")
    # Raw diagnostics
    summarize_chr_formats(df_sheet[keys['CHROM']], label="SHEET (raw)")
    summarize_chr_formats(vcf_df['CHROM'], label="VCF (normalized)")

    # Normalize sheet
    df_norm = normalize_chrom_pos_df(df_sheet, keys)
    print(f"Detected key columns -> CHROM: '{keys['CHROM']}'  POS: '{keys['POS']}'")
    # Basic stats
    n_sheet_all = len(df_sheet)
    n_sheet_key_nonnull = df_norm[keys['CHROM']].notna().sum() - df_norm[keys['CHROM']].isna().sum()
    n_sheet_pos_nonnull = df_norm[keys['POS']].notna().sum()
    print(f"SHEET rows total: {n_sheet_all}")
    print(f"SHEET rows with non-null CHROM: {n_sheet_key_nonnull}, non-null POS: {n_sheet_pos_nonnull}")

    # Unique key counts
    sheet_norm_keys_df = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
    sheet_norm_keys_df = sheet_norm_keys_df.dropna(subset=['CHROM','POS'])
    sheet_norm_keys_df['POS'] = sheet_norm_keys_df['POS'].astype('Int64')
    sheet_keys_unique = keys_set(sheet_norm_keys_df, 'CHROM', 'POS')
    vcf_keys_unique   = keys_set(vcf_df, 'CHROM', 'POS')

    print(f"Unique (CHROM,POS) keys -> SHEET: {len(sheet_keys_unique)}  VCF(INS): {len(vcf_keys_unique)}")

    # Exact match count
    exact_matches = len(sheet_keys_unique & vcf_keys_unique)
    print(f"Exact key matches (SHEET∩VCF): {exact_matches}")

    # Tolerance diagnostics (diagnose off-by-one etc.)
    if pos_tolerance > 0:
        approx_matches = tolerance_match_count(sheet_keys_unique, vcf_keys_unique, pos_tolerance)
        print(f"Keys that would match within ±{pos_tolerance}: {approx_matches}")

    # Show some examples of non-matching keys from SHEET
    if debug_examples > 0:
        not_in_vcf = sorted(k for k in sheet_keys_unique if k not in vcf_keys_unique)
        not_in_sheet = sorted(k for k in vcf_keys_unique if k not in sheet_keys_unique)
        print(f"\nExamples of SHEET keys not found in VCF (showing up to {debug_examples}):")
        for k in not_in_vcf[:debug_examples]:
            print("  SHEET-only:", k)
        print(f"\nExamples of VCF keys not found in SHEET (showing up to {debug_examples}):")
        for k in not_in_sheet[:debug_examples]:
            print("  VCF-only:", k)

    # Try to detect a type column and report counts
    type_cols = [c for c in df_sheet.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
    if type_cols:
        tcol = type_cols[0]
        is_ins = df_sheet[tcol].astype(str).str.upper() == 'INS'
        print(f"\nType column detected: '{tcol}'. SHEET rows with INS: {int(is_ins.sum())} / {len(df_sheet)}")
        # Of the INS rows, how many have keys that match?
        ins_keys = keys_set(df_norm[is_ins], keys['CHROM'], keys['POS'])
        exact_ins_matches = len(ins_keys & vcf_keys_unique)
        print(f"  INS-only exact key matches: {exact_ins_matches} / {len(ins_keys)}")
        if pos_tolerance > 0:
            approx_ins_matches = tolerance_match_count(ins_keys, vcf_keys_unique, pos_tolerance)
            print(f"  INS-only matches within ±{pos_tolerance}: {approx_ins_matches} / {len(ins_keys)}")
    else:
        print("\nNo explicit type column found in SHEET.")

def merge_ann_into_sheet(df_sheet: pd.DataFrame, vcf_df: pd.DataFrame, ann_cols: List[str],
                        pos_tolerance: int = 1, debug_examples: int = 15) -> pd.DataFrame:
    df = df_sheet.copy()

    keys = detect_key_columns(df)
    if 'CHROM' not in keys or 'POS' not in keys:
        print("WARNING: Could not detect CHROM/POS columns in sheet; ANN columns will be empty.")
        for c in ann_cols:
            if c not in df.columns:
                df[c] = ''
        return df

    # DEBUG: run a comprehensive match report
    debug_match_report(df, vcf_df, keys, debug_examples=debug_examples, pos_tolerance=pos_tolerance)

    # Normalize sheet keys for merge
    df_norm = normalize_chrom_pos_df(df, keys)

    # Prepare VCF map (unique by CHROM,POS), aggregate ANN fields
    vcf_use = vcf_df.copy()
    if vcf_use.empty:
        print("NOTE: No INS records found in VCF; ANN columns will be created but empty.")
    else:
        agg = {c: lambda s: ';'.join([x for x in s.astype(str).tolist() if x]) for c in ann_cols}
        vcf_use = vcf_use.groupby(['CHROM', 'POS'], as_index=False).agg(agg)

    # Identify potential type column in sheet
    type_cols = [c for c in df.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
    has_type = bool(type_cols)
    if has_type:
        tcol = type_cols[0]
        is_ins = df[tcol].astype(str).str.upper() == 'INS'
        print(f"\nMERGE: using type column '{tcol}' -> rows marked INS: {int(is_ins.sum())} / {len(df)}")
    else:
        is_ins = pd.Series([False]*len(df), index=df.index)
        print("\nMERGE: no type column -> will fill ANN wherever exact (CHROM,POS) matches VCF INS.")

    # Left merge on exact keys only (do not use tolerance for filling, just for diagnostics)
    left = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
    merged = left.merge(vcf_use[['CHROM','POS'] + ann_cols], on=['CHROM','POS'], how='left', suffixes=('',''))

    # Initialize ANN columns on original df
    for c in ann_cols:
        if c not in df.columns:
            df[c] = ''

    # Fill values:
    for c in ann_cols:
        values = merged[c]
        if has_type:
            df.loc[is_ins, c] = values[is_ins].fillna('').astype(str).values
        else:
            df[c] = values.fillna('').astype(str).values

    # Report matching stats on the actual merge
    matched = merged[ann_cols].notna().any(axis=1).sum()
    print(f"\nMERGE RESULT: rows with any ANN filled (exact VCF match): {int(matched)} / {len(df)}")

    # Additional hint if tolerance suggests many near-misses
    if pos_tolerance > 0:
        sheet_keys = keys_set(left, 'CHROM', 'POS')
        vcf_keys_unique = keys_set(vcf_use, 'CHROM', 'POS')
        approx = tolerance_match_count(sheet_keys, vcf_keys_unique, pos_tolerance)
        if approx > matched:
            print(f"NOTE: There appear to be {approx - matched} additional rows that would match within ±{pos_tolerance}.")
            print("      This often indicates a 0-based vs 1-based position shift or use of END instead of POS in the sheet.")

    return df

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--excel', default='merged_ngmlr+sniffles_variants.xlsx', help='Input Excel workbook')
    ap.add_argument('--sheet8', default='HD46-8', help='Sheet name for HD46-8 sample')
    ap.add_argument('--sheet13', default='HD46-13', help='Sheet name for HD46-13 sample')
    ap.add_argument('--vcf8', default='HD46-8_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-8')
    ap.add_argument('--vcf13', default='HD46-13_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-13')
    ap.add_argument('--out', default='merged_ngmlr+sniffles_variants_with_ANN.xlsx', help='Output Excel path')
    ap.add_argument('--debug_examples', type=int, default=15, help='How many non-match examples to print from each side')
    ap.add_argument('--pos_tolerance', type=int, default=1, help='Diagnostic tolerance (±N bp) for off-by-N checks (used for debug only)')
    args = ap.parse_args()

    excel_path = Path(args.excel)
    vcf8_path = Path(args.vcf8)
    vcf13_path = Path(args.vcf13)
    out_path = Path(args.out)

    # Load sheets (resolve case-insensitive names)
    xls = pd.ExcelFile(excel_path)
    def resolve_sheet(name: str) -> str:
        if name in xls.sheet_names:
            return name
        lower_map = {s.lower(): s for s in xls.sheet_names}
        return lower_map.get(name.lower(), name)

    sheet8 = resolve_sheet(args.sheet8)
    sheet13 = resolve_sheet(args.sheet13)

    df8 = pd.read_excel(excel_path, sheet_name=sheet8)
    df13 = pd.read_excel(excel_path, sheet_name=sheet13)

    # Parse VCFs (INS only)
    vcf8_df, ann_cols = parse_vcf_ann(vcf8_path)
    vcf13_df, _ = parse_vcf_ann(vcf13_path)

    print(f"VCF8 INS variants: {len(vcf8_df)}; VCF13 INS variants: {len(vcf13_df)}")
    print(f"ANN subfields ({len(ann_cols)}): {', '.join(ann_cols)}")

    # Merge with diagnostics
    df8_out = merge_ann_into_sheet(df8, vcf8_df, ann_cols,
                                pos_tolerance=args.pos_tolerance,
                                debug_examples=args.debug_examples)
    df13_out = merge_ann_into_sheet(df13, vcf13_df, ann_cols,
                                    pos_tolerance=args.pos_tolerance,
                                    debug_examples=args.debug_examples)

    # Save
    with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        df8_out.to_excel(writer, sheet_name=sheet8, index=False)
        df13_out.to_excel(writer, sheet_name=sheet13, index=False)

    print(f"\nDone. Wrote: {out_path.resolve()}")

if __name__ == '__main__':
    main()

Manually merge all contents of ANN=? to a seperate column ‘ANN’ in the isolate-specific sheets in the Excel-file.

#Add CHROM and HD46_Ctrl.1 to first column of the input Excel-file
(plot-numpy1) jhuang@WS-2290C:~/DATA/Data_Patricia_Transposon_2025$ python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx
#DEL some columns (INFO, NN_Allele, ANN_Rank, ANN_Errors_Warnings_Info, from the table, and COPY the summary-sheet to the final table.

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Processing Data_Patricia_Transposon_2025 v2 (Workflow for Structural Variant Calling in Nanopore Sequencing)

Leave a Reply Cancel reply