Processing Data_Patricia_Transposon_2025 v2 (Workflow for Structural Variant Calling in Nanopore Sequencing)

  1. Generate the HD46_Ctrol annotation

    mamba activate trycycler
    cd trycycler_HD46_Ctrl;
    trycycler cluster --threads 55 --assemblies assemblies/*.fasta --reads reads.fastq --out_dir trycycler;
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    mv trycycler/cluster_001/1_contigs/J_ctg000010.fasta .
    mv trycycler/cluster_001/1_contigs/L_tig00000016.fasta .
    mv trycycler/cluster_001/1_contigs/R_tig00000001.fasta .
    mv trycycler/cluster_001/1_contigs/H_utg000001c.fasta .
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_002
    mv trycycler/cluster_002/1_contigs/*00000*.fasta .
    Error: unable to find a suitable common sequence
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_003
    mv trycycler/cluster_003/1_contigs/F_tig00000004.fasta .
    mv trycycler/cluster_003/1_contigs/L_tig00000003.fasta .
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_004
    mv trycycler/cluster_004/1_contigs/J_ctg000000.fasta .
    mv trycycler/cluster_004/1_contigs/P_ctg000000.fasta .
    mv trycycler/cluster_004/1_contigs/S_contig_2.fasta .
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_005
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_006
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_007
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_008
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_009
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_010
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_011
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_012
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_013
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_014
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_015
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_016
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_017
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_018
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_019
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_020
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_021
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_022
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_023
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_024
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_025
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_026
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_027
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_028
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_029
    
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_001
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_004
    
    trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_001
    trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_004
    
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_001
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_004
    
    #Polish --> TODO: Need to be Debugged!
    for c in trycycler/cluster_001 trycycler/cluster_004; do
        medaka_consensus -i "$c"/4_reads.fastq -d "$c"/7_final_consensus.fasta -o "$c"/medaka  -m r941_min_sup_g507 -t 12
        mv "$c"/medaka/consensus.fasta "$c"/8_medaka.fasta
        rm -r "$c"/medaka "$c"/*.fai "$c"/*.mmi  # clean up
    done
    # cat trycycler/cluster_*/8_medaka.fasta > trycycler/consensus.fasta
    
    cp trycycler/cluster_001/7_final_consensus.fasta HD46_Ctrl_chr.fasta
    cp trycycler/cluster_004/7_final_consensus.fasta HD46_Ctrl_plasmid.fasta
  2. install mambaforge https://conda-forge.org/miniforge/ (recommended)

    #download Mambaforge-24.9.2-0-Linux-x86_64.sh from website
    chmod +x Mambaforge-24.9.2-0-Linux-x86_64.sh
    ./Mambaforge-24.9.2-0-Linux-x86_64.sh
    
    To activate this environment, use:
        micromamba activate /home/jhuang/mambaforge
    Or to execute a single command in this environment, use:
        micromamba run -p /home/jhuang/mambaforge mycommand
    installation finished.
    
    Do you wish to update your shell profile to automatically initialize conda?
    This will activate conda on startup and change the command prompt when activated.
    If you'd prefer that conda's base environment not be activated on startup,
      run the following command when conda is activated:
    
    conda config --set auto_activate_base false
    
    You can undo this by running `conda init --reverse $SHELL`? [yes|no]
    [no] >>> yes
    no change     /home/jhuang/mambaforge/condabin/conda
    no change     /home/jhuang/mambaforge/bin/conda
    no change     /home/jhuang/mambaforge/bin/conda-env
    no change     /home/jhuang/mambaforge/bin/activate
    no change     /home/jhuang/mambaforge/bin/deactivate
    no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
    no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
    no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
    no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
    no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
    no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
    modified      /home/jhuang/.bashrc
    ==> For changes to take effect, close and re-open your current shell. <==
    no change     /home/jhuang/mambaforge/condabin/conda
    no change     /home/jhuang/mambaforge/bin/conda
    no change     /home/jhuang/mambaforge/bin/conda-env
    no change     /home/jhuang/mambaforge/bin/activate
    no change     /home/jhuang/mambaforge/bin/deactivate
    no change     /home/jhuang/mambaforge/etc/profile.d/conda.sh
    no change     /home/jhuang/mambaforge/etc/fish/conf.d/conda.fish
    no change     /home/jhuang/mambaforge/shell/condabin/Conda.psm1
    no change     /home/jhuang/mambaforge/shell/condabin/conda-hook.ps1
    no change     /home/jhuang/mambaforge/lib/python3.12/site-packages/xontrib/conda.xsh
    no change     /home/jhuang/mambaforge/etc/profile.d/conda.csh
    no change     /home/jhuang/.bashrc
    No action taken.
    WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
    WARNING conda.common.path.windows:_path_to(100): cygpath is not available, fallback to manual path conversion
    Added mamba to /home/jhuang/.bashrc
    ==> For changes to take effect, close and re-open your current shell. <==
    Thank you for installing Mambaforge!
    
    Close your terminal window and open a new one, or run:
    #source ~/mambaforge/bin/activate
    conda --version
    mamba --version
    
    https://github.com/conda-forge/miniforge/releases
    Note
    
        * After installation, please make sure that you do not have the Anaconda default channels configured.
            conda config --show channels
            conda config --remove channels defaults
            conda config --add channels conda-forge
            conda config --show channels
            conda config --set channel_priority strict
            #conda clean --all
            conda config --remove channels biobakery
    
        * !!!!Do not install anything into the base environment as this might break your installation. See here for details.!!!!
    
    # --Deprecated method: mamba installing on conda--
    #conda install -n base --override-channels -c conda-forge mamba 'python_abi=*=*cp*'
    #    * Note that installing mamba into any other environment than base is not supported.
    #
    #conda activate base
    #conda install conda
    #conda uninstall mamba
    #conda install mamba

2: install required Tools on the mamba env

    * Sniffles2: Detect structural variants, including transposons, from long-read alignments.
    * RepeatModeler2: Identify and classify transposons de novo.
    * RepeatMasker: Annotate known transposable elements using transposon libraries.
    * SVIM: An alternative structural variant caller optimized for long-read sequencing, if needed.
    * SURVIVOR: Consolidate structural variants across samples for comparative analysis.

    mamba deactivate
    # Create a new conda environment
    mamba create -n transposon_long python=3.6 -y

    # Activate the environment
    mamba activate transposon_long

    mamba install -c bioconda sniffles
    mamba install -c bioconda repeatmodeler repeatmasker

    # configure repeatmasker database
    mamba info --envs
    cd /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker

    #mamba install python=3.6
    mamba install -c bioconda svim
    mamba install -c bioconda survivor
  1. Test the installed tools

    # Check versions
    sniffles --version
    RepeatModeler -h
    RepeatMasker -h
    svim --help
    SURVIVOR --help
    mamba install -c conda-forge perl r
  2. Data Preparation

    Raw Signal Data: Nanopore devices generate electrical signal data as DNA passes through the nanopore.
    Basecalling: Tools like Guppy or Dorado are used to convert raw signals into nucleotide sequences (FASTQ files).
  3. Preprocessing

    Quality Filtering: Remove low-quality reads using tools like Filtlong or NanoFilt.
    Adapter Trimming: Identify and remove sequencing adapters with tools like Porechop.
  4. (Optional) Variant Calling for SNP and Indel Detection:

    Tools like Medaka, Longshot, or Nanopolish analyze the aligned reads to identify SNPs and small indels.
  5. (OFFICIAL STARTING POINT) Alignment and Structural Variant Calling: Tools such as Sniffles or SVIM detect large insertions, deletions, and other structural variants. 使用长读长测序工具如 SVIM 或 Sniffles 检测结构变异(e.g. 散在性重复序列)。

      #NOTE that the ./batch1_depth25/trycycler_WT/reads.fastq and F24A430001437_BACctmoD/BGI_result/Separate/${sample}/1.Cleandata/${sample}.filtered_reads.fq.gz are the same!
    
      # -- PREPARING the input fastq-data, merge the fastqz and move the top-directory
    
      # Under raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass
      zcat ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_0.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_1.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_2.fastq.gz ./barcode01/FBD12615_pass_barcode01_97f118c2_aa46ecf7_3.fastq.gz ... | gzip > HD46_1.fastq.gz
      mv ./raw_data/no_sample_id/20250731_0943_MN45170_FBD12615_97f118c2/fastq_pass/HD46_1.fastq.gz ~/DATA/Data_Patricia_Transposon_2025
    
        #this are the corresponding sample names:
        #barcode 1: HD46-1
        #barcode 2: HD46-2
        #barcode 3: HD46-3
        #barcode 4: HD46-4
        mv barcode01.fastq.gz HD46_1.fastq.gz
        mv barcode02.fastq.gz HD46_2.fastq.gz
        mv barcode03.fastq.gz HD46_3.fastq.gz
        mv barcode04.fastq.gz HD46_4.fastq.gz
    
      # -- CALCULATE the coverages
        #!/bin/bash
    
        for bam in barcode*_minimap2.sorted.bam; do
            echo "Processing $bam ..."
            avg_cov=$(samtools depth -a "$bam" | awk '{sum+=$3; cnt++} END {if (cnt>0) print sum/cnt; else print 0}')
            echo -e "${bam}\t${avg_cov}" >> coverage_summary.txt
        done
    
      # ---- !!!! LOGIN the suitable environment !!!! ----
      # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
      mamba activate transposon_long
    
      # -- TODO: AFTERNOON_DEBUG_THIS: FAILED and not_USED: Alignment and Detect structural variants in each sample using SVIM which used aligner ngmlr or mimimap2
      #mamba install -c bioconda ngmlr
      mamba install -c bioconda svim
    
      #SEARCH FOR "HD46_Ctrl_chr_plasmid.fasta" for finding the insertion-calling-commands
      # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! for all 4 options #
      # ---- Option_1: minimap2 (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----
    
      for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
          #INS,INV,DUP:TANDEM,DUP:INT,BND
          svim reads --aligner minimap2 --nanopore minimap2+svim_${sample}    ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 20 --types INS --min_sv_size 100 --sequence_allele --insertion_sequences --read_names;
      done
    
      #svim alignment svim_alignment_minmap2_1_re 1.sorted.bam CP020463_.fasta --types INS --sequence_alleles --insertion_sequences --read_names
    
      # ---- Option_2: minamap2 (aligner) + Sniffles2 (structural variant caller) --> SUCCESSFUL ----
      #Minimap2: A commonly used aligner for nanopore sequencing data.
      #    Align Long Reads to the WT Reference using Minimap2
      #sniffles -m WT.sorted.bam -v WT.vcf -s 10 -l 50 -t 60
      #  -s 20: Requires at least 20 reads to support an SV for reporting. --> 10
      #  -l 50: Reports SVs that are at least 50 base pairs long.
      #  -t 60: Uses 60 threads for faster processing.
      for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
          #minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ./batch1_depth25/trycycler_${sample}/reads.fastq | samtools sort -o ${sample}.sorted.bam
          minimap2 --MD -t 60 -ax map-ont HD46_Ctrl_chr_plasmid.fasta ${sample}.fastq.gz | samtools sort -o ${sample}_minimap2.sorted.bam
          samtools index ${sample}_minimap2.sorted.bam
          sniffles -m ${sample}_minimap2.sorted.bam -v ${sample}_minimap2+sniffles.vcf -s 10 -l 50 -t 60
          #QUAL < 20 ||
          bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_minimap2+sniffles.vcf > ${sample}_minimap2+sniffles_filtered.vcf
      done
    
        #Estimating parameter...
        #        Max dist between aln events: 44
        #        Max diff in window: 76
        #        Min score ratio: 2
        #        Avg DEL ratio: 0.0112045
        #        Avg INS ratio: 0.0364027
        #Start parsing... CP020463
        #                # Processed reads: 10000
        #                # Processed reads: 20000
        #        Finalizing  ..
        #Start genotype calling:
        #        Reopening Bam file for parsing coverage
        #        Finalizing  ..
        #Estimating parameter...
        #        Max dist between aln events: 28
        #        Max diff in window: 89
        #        Min score ratio: 2
        #        Avg DEL ratio: 0.013754
        #        Avg INS ratio: 0.17393
        #Start parsing... CP020463
        #                # Processed reads: 10000
        #                # Processed reads: 20000
        #                # Processed reads: 30000
        #                # Processed reads: 40000
    
        # Results:
        # * barcode01_minimap2+sniffles.vcf
        # * barcode01_minimap2+sniffles_filtered.vcf
        # * barcode02_minimap2+sniffles.vcf
        # * barcode02_minimap2+sniffles_filtered.vcf
        # * barcode03_minimap2+sniffles.vcf
        # * barcode03_minimap2+sniffles_filtered.vcf
        # * barcode04_minimap2+sniffles.vcf
        # * barcode04_minimap2+sniffles_filtered.vcf
    
      #ERROR: No MD string detected! Check bam file! Otherwise generate using e.g. samtools. --> No results!
      #for sample in barcode01 barcode02 barcode03 barcode04; do
      #    sniffles -m svim_reads_minimap2_${sample}/${sample}.fastq.minimap2.coordsorted.bam -v sniffles_minimap2_${sample}.vcf -s 10 -l 50 -t 60
      #    bcftools filter -e "INFO/SVTYPE != 'INS'" sniffles_minimap2_${sample}.vcf > sniffles_minimap2_${sample}_filtered.vcf
      #done
    
      # ---- Option_3: NGMLR (aligner) + SVIM (structural variant caller) --> SUCCESSFUL ----
      for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
          svim reads --aligner ngmlr --nanopore    ngmlr+svim_${sample}       ${sample}.fastq.gz HD46_Ctrl_chr_plasmid.fasta  --cores 10;
      done
    
      # ---- Option_4: NGMLR (aligner) + sniffles (structural variant caller) --> SUCCESSFUL ----
      for sample in HD46_1 HD46_2 HD46_3 HD46_4 HD46_5 HD46_6 HD46_7 HD46_8 HD46_13; do
          sniffles -m ngmlr+svim_${sample}/${sample}.fastq.ngmlr.coordsorted.bam -v ${sample}_ngmlr+sniffles.vcf -s 10 -l 50 -t 60
          bcftools filter -e "INFO/SVTYPE != 'INS'" ${sample}_ngmlr+sniffles.vcf > ${sample}_ngmlr+sniffles_filtered.vcf
      done
    
      #END
  6. Compare and integrate all results produced by minimap2+sniffles and ngmlr+sniffles, and check them each position in IGV!

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    mv HD46_1_minimap2+sniffles_filtered.vcf    HD46-1_minimap2+sniffles_filtered.vcf
    mv HD46_1_ngmlr+sniffles_filtered.vcf       HD46-1_ngmlr+sniffles_filtered.vcf
    mv HD46_2_minimap2+sniffles_filtered.vcf    HD46-2_minimap2+sniffles_filtered.vcf
    mv HD46_2_ngmlr+sniffles_filtered.vcf       HD46-2_ngmlr+sniffles_filtered.vcf
    mv HD46_3_minimap2+sniffles_filtered.vcf    HD46-3_minimap2+sniffles_filtered.vcf
    mv HD46_3_ngmlr+sniffles_filtered.vcf       HD46-3_ngmlr+sniffles_filtered.vcf
    mv HD46_4_minimap2+sniffles_filtered.vcf    HD46-4_minimap2+sniffles_filtered.vcf
    mv HD46_4_ngmlr+sniffles_filtered.vcf       HD46-4_ngmlr+sniffles_filtered.vcf
    mv HD46_5_minimap2+sniffles_filtered.vcf    HD46-5_minimap2+sniffles_filtered.vcf
    mv HD46_5_ngmlr+sniffles_filtered.vcf       HD46-5_ngmlr+sniffles_filtered.vcf
    mv HD46_6_minimap2+sniffles_filtered.vcf    HD46-6_minimap2+sniffles_filtered.vcf
    mv HD46_6_ngmlr+sniffles_filtered.vcf       HD46-6_ngmlr+sniffles_filtered.vcf
    mv HD46_7_minimap2+sniffles_filtered.vcf    HD46-7_minimap2+sniffles_filtered.vcf
    mv HD46_7_ngmlr+sniffles_filtered.vcf       HD46-7_ngmlr+sniffles_filtered.vcf
    mv HD46_8_minimap2+sniffles_filtered.vcf    HD46-8_minimap2+sniffles_filtered.vcf
    mv HD46_8_ngmlr+sniffles_filtered.vcf       HD46-8_ngmlr+sniffles_filtered.vcf
    mv HD46_13_minimap2+sniffles_filtered.vcf   HD46-13_minimap2+sniffles_filtered.vcf
    mv HD46_13_ngmlr+sniffles_filtered.vcf      HD46-13_ngmlr+sniffles_filtered.vcf
  7. (NOT_USED) Filtering low-complexity insertions using RepeatMasker (TODO: how to use RepeatModeler to generate own lib?)

      python vcf_to_fasta.py variants.vcf variants.fasta
      #python filter_low_complexity.py variants.fasta filtered_variants.fasta retained_variants.fasta
      #Using RepeatMasker to filter the low-complexity fasta, the used h5 lib is
      /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5    #1.9G
      python /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/famdb.py -i /home/jhuang/mambaforge/envs/transposon_long/share/RepeatMasker/Libraries/Dfam.h5 names 'bacteria' | head
      Exact Matches
      =============
      2 bacteria (blast name), Bacteria 
    (scientific name), eubacteria (genbank common name), Monera (in-part), Procaryotae (in-part), Prokaryota (in-part), Prokaryotae (in-part), prokaryote (in-part), prokaryotes (in-part) Non-exact Matches ================= 1783272 Terrabacteria group (scientific name) 91061 Bacilli (scientific name), Bacilli Ludwig et al. 2010 (authority), Bacillus/Lactobacillus/Streptococcus group (synonym), Firmibacteria (synonym), Firmibacteria Murray 1988 (authority) 1239 Bacillaeota (synonym), Bacillaeota Oren et al. 2015 (authority), Bacillota (synonym), Bacillus/Clostridium group (synonym), clostridial firmicutes (synonym), Clostridium group firmicutes (synonym), Firmacutes (synonym), firmicutes (blast name), Firmicutes (scientific name), Firmicutes corrig. Gibbons and Murray 1978 (authority), Low G+C firmicutes (synonym), low G+C Gram-positive bacteria (common name), low GC Gram+ (common name) Summary of Classes within Firmicutes: * Bacilli (includes many common pathogenic and non-pathogenic Gram-positive bacteria, taxid=91061) * Bacillus (e.g., Bacillus subtilis, Bacillus anthracis) * Staphylococcus (e.g., Staphylococcus aureus, Staphylococcus epidermidis) * Streptococcus (e.g., Streptococcus pneumoniae, Streptococcus pyogenes) * Listeria (e.g., Listeria monocytogenes) * Clostridia (includes many anaerobic species like Clostridium and Clostridioides) * Erysipelotrichia (intestinal bacteria, some pathogenic) * Tissierellia (less-studied, veterinary relevance) * Mollicutes (cell wall-less, includes Mycoplasma species) * Negativicutes (includes some Gram-negative, anaerobic species) RepeatMasker -species Bacilli -pa 4 -xsmall variants.fasta python extract_unmasked_seq.py variants.fasta.masked unmasked_variants.fasta #bcftools filter -i ‘QUAL>30 && INFO/SVLEN>100’ variants.vcf -o filtered.vcf # #bcftools view -i ‘SVTYPE=”INS”‘ variants.vcf | bcftools query -f ‘%CHROM\t%POS\t%REF\t%ALT\t%INFO\n’ > insertions.txt #mamba install -c bioconda vcf2fasta #vcf2fasta variants.vcf -o insertions.fasta #grep “SEQS” variants.vcf | awk ‘{ print $1, $2, $4, $5, $8 }’ > insertions.txt #python3 filtering_low_complexity.py # #vcftools –vcf input.vcf –recode –out filtered_output –minSVLEN 100 #bcftools filter -e ‘INFO/SEQS ~ “^(G+|C+|T+|A+){4,}”‘ variants.vcf -o filtered.vcf # — calculate the percentage of reads To calculate the percentage of reads that contain the insertion from the VCF entry, use the INFO and FORMAT fields provided in the VCF record. Step 1: Extract Relevant Information In the provided VCF entry: RE (Reads Evidence): 733 – the total number of reads supporting the insertion. GT (Genotype): 1/1 – this indicates a homozygous insertion, meaning all reads covering this region are expected to have the insertion. AF (Allele Frequency): 1 – a 100% allele frequency, indicating that every read in this sample supports the insertion. DR (Depth Reference): 0 – the number of reads supporting the reference allele. DV (Depth Variant): 733 – the number of reads supporting the variant allele (insertion). Step 2: Calculate Percentage of Reads Supporting the Insertion Using the formula: Percentage of reads with insertion=(DVDR+DV)×100 Percentage of reads with insertion=(DR+DVDV​)×100 Substitute the values: Percentage=(7330+733)×100=100% Percentage=(0+733733​)×100=100% Conclusion Based on the VCF record, 100% of the reads support the insertion, indicating that the insertion is fully present in the sample (homozygous insertion). This is consistent with the AF=1 and GT=1/1 fields. * In your VCF file generated by Sniffles, the REF=N in the results has a specific meaning: * In a standard VCF, the REF field usually contains the reference base(s) at the variant position. * For structural variants (SVs), especially insertions, there is no reference sequence replaced; the insertion occurs between reference bases. * Therefore, Sniffles uses N as a placeholder in the REF field to indicate “no reference base replaced”. * The actual inserted sequence is then stored in the ALT field.
  8. Why some records have UNRESOLVED in the FILTER field in the Excel output.

    1. Understanding the format
    
        The data appears to be structural variant (SV) calls from Sniffles, probably in a VCF-like tabular format exported to Excel:
    
            * gi|1176884116|gb|CP020463.1| → reference sequence
            * Positions: 1855752 and 2422820
            * N → insertion event
            * SVLEN=999 → size of the insertion
            * AF → allele frequency
            * GT:DR:DV → genotype, depth reference, depth variant (1/1:0:678, example values for a PASS variant)
            * FILTER → whether the variant passed filters (UNRESOLVED means it didn’t pass)
    
    2. What UNRESOLVED usually means
    
        In Sniffles:
    
        * UNRESOLVED is assigned to SVs when the tool cannot confidently resolve the exact sequence or breakpoint.
        * Reasons include:
            - Low read support (RE, DV) relative to the expected coverage
            - Ambiguous alignment at repetitive regions
            - Conflicting strand or orientation signals
            - Allele frequency inconsistent with expectations
    
    3. Examine your two records
    
        First record
    
            POS: 1855752
            SVTYPE: INS
            SVLEN: 999
            RE: 68
            AF: 1
            GT: 1/1
            FILTER: UNRESOLVED
    
        Observations:
    
        * AF = 1 → allele frequency 100%, homozygous insertion
        * RE = 68 → 68 reads support the variant, decent coverage
        * Still UNRESOLVED → likely because Sniffles could not resolve the inserted sequence precisely; sometimes long insertions in repetitive regions are hard to reconstruct fully even with good read support.
    
        Second record
    
            POS: 2422820
            SVTYPE: INS
            SVLEN: 999
            RE: 22
            AF: 0.025522
            GT: 0/0
            FILTER: UNRESOLVED
    
        Observations:
    
        * AF = 0.0255 → very low allele frequency (~2.5%)
        * RE = 22, DR = 840 → very low variant reads vs reference
        * GT = 0/0 → homozygous reference
        * Sniffles marks it UNRESOLVED because the variant is essentially noise, not confidently detected.
    
    4. Key difference between the two
        Feature First record    Second record
        Allele frequency (AF)   1 (high)    0.0255 (very low)
        Variant reads (RE)  68  22
        Genotype (GT)   1/1 0/0
        Reason for UNRESOLVED   Unresolvable inserted sequence
    
    ✅ 5. Conclusion
    
        * Sniffles marks a variant as UNRESOLVED when the SV cannot be confidently characterized.
        * Even if there is good read support (first record), complex insertions can’t always be reconstructed fully.
        * Very low allele frequency (second record) also triggers UNRESOLVED because the signal is too weak compared to background noise.
        * Essentially: “UNRESOLVED” ≠ bad data, it’s just unresolved uncertainty.
  9. (NOT_SURE_HOW_TO_USE) Polishing of assembly: Use tools like Medaka to refine variant calls by leveraging consensus sequences derived from nanopore data.

      mamba install -c bioconda medaka
      medaka-consensus -i aligned_reads.bam -r reference.fasta -o polished_output -t 4
  10. Compare Insertions Across Samples

    Merge Variants Across Samples: Use SURVIVOR to merge and compare the detected insertions in all samples against the WT:
    
    SURVIVOR merge input_vcfs.txt 1000 1 1 1 0 30 merged.vcf
    
        Input: List of VCF files from Sniffles2.
        Output: A consolidated VCF file with shared and unique variants.
    
    Filter WT Insertions:
    
        Identify transposons present only in samples 1–9 by subtracting WT variants using bcftools:
    
            bcftools isec WT.vcf merged.vcf -p comparison_results
  11. Validate and Visualize

    Visualize with IGV: Use IGV to inspect insertion sites in the alignment and confirm quality.
    
    igv.sh
    
    Validate Findings:
        Perform PCR or additional sequencing for key transposon insertion sites to confirm results.
  12. Alternatives to TEPID for Long-Read Data

    If you’re looking for transposon-specific tools for long reads:
    
        REPET: A robust transposon annotation tool compatible with assembled genomes.
        EDTA (Extensive de novo TE Annotator):
            A pipeline to identify, classify, and annotate transposons.
            Works directly on your assembled genomes.
    
            perl EDTA.pl --genome WT.fasta --type all
  13. The WT.vcf file in the pipeline is generated by detecting structural variants (SVs) in the wild-type (WT) genome aligned against itself or using it as a baseline reference. Here’s how you can generate the WT.vcf:

    Steps to Generate WT.vcf
    1. Align WT Reads to the WT Reference Genome
    
    The goal here is to create an alignment of the WT sequencing data to the WT reference genome to detect any self-contained structural variations, such as native insertions, deletions, or duplications.
    
    Command using Minimap2:
    
    minimap2 -ax map-ont WT.fasta WT_reads.fastq | samtools sort -o WT.sorted.bam
    
    Index the BAM file:
    
    samtools index WT.sorted.bam
    
    2. Detect Structural Variants with Sniffles2
    
    Run Sniffles2 on the WT alignment to call structural variants:
    
    sniffles --input WT.sorted.bam --vcf WT.vcf
    
    This step identifies:
    
        Native transposons and insertions present in the WT genome.
        Other structural variants that are part of the reference genome or sequencing artifacts.
    
    Key parameters to consider:
    
        --min_support: Adjust based on your WT sequencing coverage.
        --max_distance: Define proximity for merging variants.
        --min_length: Set a minimum SV size (e.g., >50 bp for transposons).
  14. Clean and Filter the WT.vcf, Variant Filtering: Remove low-confidence variants based on read depth, quality scores, or allele frequency.

    To ensure the WT.vcf only includes relevant transposons or SVs:
    
        Use bcftools or similar tools to filter out low-confidence variants:
    
        bcftools filter -e "QUAL < 20 || INFO/SVTYPE != 'INS'" WT.vcf > WT_filtered.vcf
        bcftools filter -e "QUAL < 1 || INFO/SVTYPE != 'INS'" 1_.vcf > 1_filtered_.vcf
  15. NOTE that in this pipeline, the WT.fasta (reference genome) is typically a high-quality genome sequence from a database or a well-annotated version of your species’ genome. It is not assembled from the WT.fastq sequencing reads in this context. Here’s why:

    Why Use a Reference Genome (WT.fasta) from a Database?
    
        Higher Quality and Completeness:
            Database references (e.g., NCBI, Ensembl) are typically well-assembled, highly polished, and annotated. They serve as a reliable baseline for variant detection.
    
        Consistency:
            Using a standard reference ensures consistent comparisons across your WT and samples (1–9). Variants detected will be relative to this reference, not influenced by possible assembly errors.
    
        Saves Time:
            Assembling a reference genome from WT reads requires significant computational effort. Using an existing reference streamlines the analysis.
    
    Alternative: Assembling WT from FASTQ
    
    If you don’t have a high-quality reference genome (WT.fasta) and must rely on your WT FASTQ reads:
    
        Assemble the genome from your WT.fastq:
            Use long-read assemblers like Flye, Canu, or Shasta to create a draft genome.
    
        flye --nano-raw WT.fastq --out-dir WT_assembly --genome-size 
    Polish the assembly using tools like Racon (with the same reads) or Medaka for higher accuracy. Use the assembled and polished genome as your WT.fasta reference for further steps. Key Takeaways: If you have access to a reliable, high-quality reference genome, use it as the WT.fasta. Only assemble WT.fasta from raw reads (WT.fastq) if no database reference is available for your organism.
  16. Annotate Transposable Elements: Tools like ANNOVAR or SnpEff provide functional insights into the detected variants.

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    #Using snpEff to annotate the insertion!
    conda activate /home/jhuang/miniconda3/envs/spandx
# --> BUG:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT
    02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION
VERSION

# --> DEBUG: adapt the genbank-file header as follows:
LOCUS       HD46_Ctrl 2707468 bp    DNA     circular BCT 02-OCT-2025
DEFINITION  Staphylococcus epidermidis strain HD46-ctrl chromosome, whole
            genome shotgun sequence.
ACCESSION   HD46_Ctrl
VERSION     HD46_Ctrl.1
DBLINK      BioProject: PRJNA1337321
            BioSample: SAMN52215988
KEYWORDS    .
SOURCE      Staphylococcus epidermidis
  ORGANISM  Staphylococcus epidermidis
            Bacteria; Firmicutes; Bacilli; Bacillales; Staphylococcaceae;
            Staphylococcus.
COMMENT     Annotated genome for HD46_Ctrl.
...
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl
    cp HD46_Ctrl_chr.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/HD46_Ctrl/genes.gbk

    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config  #HD46_Ctrl.genome : HD46_Ctrl
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build -genbank HD46_Ctrl      -d

    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-8_ngmlr+sniffles_filtered.vcf
    sed -i 's/^cluster_001_consensus/HD46_Ctrl.1/' HD46-13_ngmlr+sniffles_filtered.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-8_ngmlr+sniffles_filtered.vcf > HD46-8_ngmlr+sniffles_filtered.annotated.vcf
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v HD46_Ctrl HD46-13_ngmlr+sniffles_filtered.vcf > HD46-13_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-8
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-8_snpeff_stats.html \
    HD46_Ctrl \
    HD46-8_ngmlr+sniffles_filtered.vcf \
    > HD46-8_ngmlr+sniffles_filtered.annotated.vcf

    # HD46-13
    snpEff ann -Xmx8g -v -hgvs -canon -ud 200 \
    -stats HD46-13_snpeff_stats.html \
    HD46_Ctrl \
    HD46-13_ngmlr+sniffles_filtered.vcf \
    > HD46-13_ngmlr+sniffles_filtered.annotated.vcf
  1. Summarize the results as a Excel-file

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    conda activate plot-numpy1
    #python generate_common_vcf.py
    #mv common_variants.xlsx putative_transposons.xlsx
    
    # * Reads each of your VCFs.
    # * Filters variants → only keep those with FILTER == PASS.
    # * Compares the two aligner methods (minimap2+sniffles2 vs ngmlr+sniffles2) per sample.
    # * Keeps only variants that appear in both methods for the same sample.
    # * Outputs: An Excel file with the common variants and a log text file listing which variants were filtered out, and why (not_PASS or not_COMMON_in_two_VCF).
    
    #python generate_fuzzy_common_vcf_v1.py
    #Sample PASS_minimap2   PASS_ngmlr  COMMON
    #  HD46-Ctrl_Ctrl   39  29  28
    #  HD46-1   39  32  29
    #  HD46-2   40  32  28
    #  HD46-3   38  30  27
    #  HD46-4   46  35  32
    #  HD46-5   40  35  31
    #  HD46-6   43  35  30
    #  HD46-7   40  33  28
    #  HD46-8   37  20  11
    #  HD46-13  39  38  27
    
    #Sample PASS_minimap2   PASS_ngmlr  COMMON_FINAL
    #HD46-Ctrl_Ctrl 39  29  6
    #HD46-1 39  32  8
    #HD46-2 40  32  8
    #HD46-3 38  30  6
    #HD46-4 46  35  8
    #HD46-5 40  35  9
    #HD46-6 43  35  10
    #HD46-7 40  33  8
    #HD46-8 37  20  4
    #HD46-13    39  38  5
    
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #
    #!!!! Summarize the results of ngmlr+sniffles !!!!
    python merge_ngmlr+sniffles_filtered_results_and_summarize.py
    
    #!!!! Post-Processing !!!!
    #DELETE "2186168    N   

    . PASS” in Sheet HD46-13 and Summary #DELETE “2427785 N CGTCAGAATCGCTGTCTGCGTCCGAGTCACTGTCTGAGTCTGAATCACTATCTGCGTCTGAGTCACTGTCTG . PASS” due to “0/1:169:117” in HD46-13 and Summary #DELETE “2441640 N GCTCATTAAGAATCATTAAATTAC . PASS” due to 0/1:170:152 in HD46-13 and Summary

  2. Source code of merge_ngmlr+sniffles_filtered_results_and_summarize.py

    python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx
    
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Add SnpEff ANN columns (for SVTYPE=INS) from annotated VCFs into an Excel workbook,
    with detailed debug about why CHROM+POS may not match.
    
    Key improvements:
    - Stronger CHROM/POS normalization (strip 'chr', unify MT naming, coerce numbers).
    - Explicit detection and logging of sheet key columns used.
    - Debug block prints:
    * Unique key counts in sheet vs VCF (before/after normalization)
    * Example non-matching keys from the sheet and from the VCF (top N)
    * Chromosome naming diagnostics (e.g., 'chr' presence, 'MT'/'M' harmonization)
    * Off-by-N diagnostics via --pos_tolerance (counts for would-match @ ±N)
    * Optional preview of the sheet's SV type column, if present
    - Safer ANN parsing and aggregation.
    - Command-line options: --debug_examples, --pos_tolerance
    
    ANN filling is done only for **exact** (CHROM, POS) equality (as before).
    Tolerance is used only for *diagnostics*, not for filling, to avoid incorrect merges.
    """
    
    import argparse
    import gzip
    import io
    import re
    from pathlib import Path
    from typing import List, Tuple, Dict, Iterable, Set
    
    import pandas as pd
    
    FALLBACK_ANN_FIELDS: List[str] = [
        'Allele','Annotation','Annotation_Impact','Gene_Name','Gene_ID',
        'Feature_Type','Feature_ID','Transcript_BioType','Rank','HGVS.c',
        'HGVS.p','cDNA.pos/cDNA.length','CDS.pos/CDS.length','AA.pos/AA.length',
        'Distance','Errors_Warnings_Info'
    ]
    
    def open_text_maybe_gzip(path: Path):
        if str(path).endswith('.gz'):
            return io.TextIOWrapper(gzip.open(path, 'rb'), encoding='utf-8', errors='ignore')
        return open(path, 'r', encoding='utf-8', errors='ignore')
    
    def normalize_chrom(col: pd.Series) -> pd.Series:
        s = col.astype(str).str.strip()
        s = s.str.replace(r'^(chr|CHR)', '', regex=True)
        # Standardize mitochondrial names to "MT"
        s = s.str.replace(r'^(M|MtDNA|MTDNA|Mito|Mitochondrion)$', 'MT', regex=True, case=False)
        return s.str.upper()
    
    def normalize_pos(col: pd.Series) -> pd.Series:
        # Excel can make ints look like floats; coerce then Int64
        # (We keep Int64 nullable for robustness; we never compare NaNs.)
        s = pd.to_numeric(col, errors='coerce')
        # If people had 0-based starts in the sheet (rare for INS), this won't fix it,
        # but the tolerance debug will reveal a +1 shift if present.
        return s.astype('Int64')
    
    def parse_vcf_ann(vcf_path: Path) -> Tuple[pd.DataFrame, List[str]]:
        ann_fields = None
        header_cols = None
        records = []
    
        with open_text_maybe_gzip(vcf_path) as f:
            for line in f:
                if line.startswith('##INFO=<ID=ANN'):
                    m = re.search(r'Format:\s*([^">]+)', line)
                    if m:
                        ann_fields = [s.strip() for s in m.group(1).split('|')]
                if line.startswith('#CHROM'):
                    header_cols = line.strip().lstrip('#').split('\t')
                    break
    
            if not header_cols:
                raise RuntimeError(f"Could not find VCF header line (#CHROM ...) in {vcf_path}")
    
            if not ann_fields:
                ann_fields = FALLBACK_ANN_FIELDS
    
            ann_cols = [f'ANN_{x}' for x in ann_fields]
    
            for line in f:
                if not line or line[0] == '#':
                    continue
                parts = line.rstrip('\n').split('\t')
                if len(parts) < len(header_cols):
                    continue
                row = dict(zip(header_cols, parts))
                info = row.get('INFO', '')
    
                # Only INS
                if not re.search(r'(?:^|;)SVTYPE=INS(?:;|$)', info):
                    continue
    
                chrom = row.get('#CHROM') or row.get('CHROM')
                pos_str = row.get('POS')
                try:
                    pos = int(pos_str)
                except Exception:
                    continue
    
                # Extract ANN entries
                ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
                ann_entries = ann_match.group(1).split(',') if ann_match else []
    
                field_values: Dict[str, List[str]] = {k: [] for k in ann_fields}
                for ann in ann_entries:
                    items = ann.split('|')
                    if len(items) < len(ann_fields):
                        items += [''] * (len(ann_fields) - len(items))
                    elif len(items) > len(ann_fields):
                        items = items[:len(ann_fields)]
                    for k, v in zip(ann_fields, items):
                        field_values[k].append(v)
    
                joined = {f'ANN_{k}': (';'.join(v) if v else '') for k, v in field_values.items()}
                records.append({'CHROM': chrom, 'POS': pos, **joined})
    
        df = pd.DataFrame.from_records(records)
        if not df.empty:
            df['POS'] = pd.to_numeric(df['POS'], errors='coerce').astype('Int64')
            df['CHROM'] = normalize_chrom(df['CHROM'])
        return df, ann_cols
    
    def detect_key_columns(df: pd.DataFrame) -> Dict[str, str]:
        chrom_candidates = ['CHROM', '#CHROM', 'Chrom', 'Chromosome', 'chrom', 'chr', 'Chr']
        pos_candidates   = ['POS', 'Position', 'position', 'pos', 'Start', 'start']
        mapping = {}
        for c in chrom_candidates:
            if c in df.columns:
                mapping['CHROM'] = c
                break
        for p in pos_candidates:
            if p in df.columns:
                mapping['POS'] = p
                break
        return mapping
    
    def normalize_chrom_pos_df(df: pd.DataFrame, keys: Dict[str, str]) -> pd.DataFrame:
        out = df.copy()
        out[keys['CHROM']] = normalize_chrom(out[keys['CHROM']])
        out[keys['POS']]   = normalize_pos(out[keys['POS']])
        return out
    
    def summarize_chr_formats(series: pd.Series, label: str):
        raw = series.astype(str)
        has_chr_prefix = raw.str.startswith(('chr','CHR')).sum()
        mt_like = raw.str.fullmatch(r'(M|MtDNA|MTDNA|Mito|Mitochondrion)', case=False).sum()
        print(f"[{label}] CHROM diagnostics:")
        print(f"  total rows: {len(raw)}")
        print(f"  with 'chr'/'CHR' prefix: {has_chr_prefix}")
        print(f"  mitochondrial names like M/MtDNA/etc: {mt_like}")
    
    def keys_set(df: pd.DataFrame, chrom_col: str, pos_col: str) -> Set[Tuple[str, int]]:
        # Drop NA POS, NA CHROM
        sub = df[[chrom_col, pos_col]].dropna()
        # Ensure ints (drop NA after coercion)
        sub = sub[(sub[pos_col].astype('Int64').notna())]
        return set(zip(sub[chrom_col].astype(str), sub[pos_col].astype('int64')))
    
    def tolerance_match_count(sheet_keys: Iterable[Tuple[str,int]],
                            vcf_keys: Set[Tuple[str,int]],
                            tol: int) -> int:
        if tol <= 0:
            return sum(1 for k in sheet_keys if k in vcf_keys)
        cnt = 0
        for chrom, pos in sheet_keys:
            if (chrom, pos) in vcf_keys:
                cnt += 1
            else:
                matched = False
                # check +/- 1..tol
                for d in range(1, tol+1):
                    if (chrom, pos - d) in vcf_keys or (chrom, pos + d) in vcf_keys:
                        matched = True
                        break
                if matched:
                    cnt += 1
        return cnt
    
    def debug_match_report(df_sheet: pd.DataFrame,
                        vcf_df: pd.DataFrame,
                        keys: Dict[str, str],
                        debug_examples: int = 15,
                        pos_tolerance: int = 1):
        print("\n=== DEBUG: Matching overview ===")
        # Raw diagnostics
        summarize_chr_formats(df_sheet[keys['CHROM']], label="SHEET (raw)")
        summarize_chr_formats(vcf_df['CHROM'], label="VCF (normalized)")
    
        # Normalize sheet
        df_norm = normalize_chrom_pos_df(df_sheet, keys)
        print(f"Detected key columns -> CHROM: '{keys['CHROM']}'  POS: '{keys['POS']}'")
        # Basic stats
        n_sheet_all = len(df_sheet)
        n_sheet_key_nonnull = df_norm[keys['CHROM']].notna().sum() - df_norm[keys['CHROM']].isna().sum()
        n_sheet_pos_nonnull = df_norm[keys['POS']].notna().sum()
        print(f"SHEET rows total: {n_sheet_all}")
        print(f"SHEET rows with non-null CHROM: {n_sheet_key_nonnull}, non-null POS: {n_sheet_pos_nonnull}")
    
        # Unique key counts
        sheet_norm_keys_df = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
        sheet_norm_keys_df = sheet_norm_keys_df.dropna(subset=['CHROM','POS'])
        sheet_norm_keys_df['POS'] = sheet_norm_keys_df['POS'].astype('Int64')
        sheet_keys_unique = keys_set(sheet_norm_keys_df, 'CHROM', 'POS')
        vcf_keys_unique   = keys_set(vcf_df, 'CHROM', 'POS')
    
        print(f"Unique (CHROM,POS) keys -> SHEET: {len(sheet_keys_unique)}  VCF(INS): {len(vcf_keys_unique)}")
    
        # Exact match count
        exact_matches = len(sheet_keys_unique & vcf_keys_unique)
        print(f"Exact key matches (SHEET∩VCF): {exact_matches}")
    
        # Tolerance diagnostics (diagnose off-by-one etc.)
        if pos_tolerance > 0:
            approx_matches = tolerance_match_count(sheet_keys_unique, vcf_keys_unique, pos_tolerance)
            print(f"Keys that would match within ±{pos_tolerance}: {approx_matches}")
    
        # Show some examples of non-matching keys from SHEET
        if debug_examples > 0:
            not_in_vcf = sorted(k for k in sheet_keys_unique if k not in vcf_keys_unique)
            not_in_sheet = sorted(k for k in vcf_keys_unique if k not in sheet_keys_unique)
            print(f"\nExamples of SHEET keys not found in VCF (showing up to {debug_examples}):")
            for k in not_in_vcf[:debug_examples]:
                print("  SHEET-only:", k)
            print(f"\nExamples of VCF keys not found in SHEET (showing up to {debug_examples}):")
            for k in not_in_sheet[:debug_examples]:
                print("  VCF-only:", k)
    
        # Try to detect a type column and report counts
        type_cols = [c for c in df_sheet.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
        if type_cols:
            tcol = type_cols[0]
            is_ins = df_sheet[tcol].astype(str).str.upper() == 'INS'
            print(f"\nType column detected: '{tcol}'. SHEET rows with INS: {int(is_ins.sum())} / {len(df_sheet)}")
            # Of the INS rows, how many have keys that match?
            ins_keys = keys_set(df_norm[is_ins], keys['CHROM'], keys['POS'])
            exact_ins_matches = len(ins_keys & vcf_keys_unique)
            print(f"  INS-only exact key matches: {exact_ins_matches} / {len(ins_keys)}")
            if pos_tolerance > 0:
                approx_ins_matches = tolerance_match_count(ins_keys, vcf_keys_unique, pos_tolerance)
                print(f"  INS-only matches within ±{pos_tolerance}: {approx_ins_matches} / {len(ins_keys)}")
        else:
            print("\nNo explicit type column found in SHEET.")
    
    def merge_ann_into_sheet(df_sheet: pd.DataFrame, vcf_df: pd.DataFrame, ann_cols: List[str],
                            pos_tolerance: int = 1, debug_examples: int = 15) -> pd.DataFrame:
        df = df_sheet.copy()
    
        keys = detect_key_columns(df)
        if 'CHROM' not in keys or 'POS' not in keys:
            print("WARNING: Could not detect CHROM/POS columns in sheet; ANN columns will be empty.")
            for c in ann_cols:
                if c not in df.columns:
                    df[c] = ''
            return df
    
        # DEBUG: run a comprehensive match report
        debug_match_report(df, vcf_df, keys, debug_examples=debug_examples, pos_tolerance=pos_tolerance)
    
        # Normalize sheet keys for merge
        df_norm = normalize_chrom_pos_df(df, keys)
    
        # Prepare VCF map (unique by CHROM,POS), aggregate ANN fields
        vcf_use = vcf_df.copy()
        if vcf_use.empty:
            print("NOTE: No INS records found in VCF; ANN columns will be created but empty.")
        else:
            agg = {c: lambda s: ';'.join([x for x in s.astype(str).tolist() if x]) for c in ann_cols}
            vcf_use = vcf_use.groupby(['CHROM', 'POS'], as_index=False).agg(agg)
    
        # Identify potential type column in sheet
        type_cols = [c for c in df.columns if c.lower() in ('svtype','type','variant_type','sv_type')]
        has_type = bool(type_cols)
        if has_type:
            tcol = type_cols[0]
            is_ins = df[tcol].astype(str).str.upper() == 'INS'
            print(f"\nMERGE: using type column '{tcol}' -> rows marked INS: {int(is_ins.sum())} / {len(df)}")
        else:
            is_ins = pd.Series([False]*len(df), index=df.index)
            print("\nMERGE: no type column -> will fill ANN wherever exact (CHROM,POS) matches VCF INS.")
    
        # Left merge on exact keys only (do not use tolerance for filling, just for diagnostics)
        left = df_norm.rename(columns={keys['CHROM']: 'CHROM', keys['POS']: 'POS'})
        merged = left.merge(vcf_use[['CHROM','POS'] + ann_cols], on=['CHROM','POS'], how='left', suffixes=('',''))
    
        # Initialize ANN columns on original df
        for c in ann_cols:
            if c not in df.columns:
                df[c] = ''
    
        # Fill values:
        for c in ann_cols:
            values = merged[c]
            if has_type:
                df.loc[is_ins, c] = values[is_ins].fillna('').astype(str).values
            else:
                df[c] = values.fillna('').astype(str).values
    
        # Report matching stats on the actual merge
        matched = merged[ann_cols].notna().any(axis=1).sum()
        print(f"\nMERGE RESULT: rows with any ANN filled (exact VCF match): {int(matched)} / {len(df)}")
    
        # Additional hint if tolerance suggests many near-misses
        if pos_tolerance > 0:
            sheet_keys = keys_set(left, 'CHROM', 'POS')
            vcf_keys_unique = keys_set(vcf_use, 'CHROM', 'POS')
            approx = tolerance_match_count(sheet_keys, vcf_keys_unique, pos_tolerance)
            if approx > matched:
                print(f"NOTE: There appear to be {approx - matched} additional rows that would match within ±{pos_tolerance}.")
                print("      This often indicates a 0-based vs 1-based position shift or use of END instead of POS in the sheet.")
    
        return df
    
    def main():
        ap = argparse.ArgumentParser()
        ap.add_argument('--excel', default='merged_ngmlr+sniffles_variants.xlsx', help='Input Excel workbook')
        ap.add_argument('--sheet8', default='HD46-8', help='Sheet name for HD46-8 sample')
        ap.add_argument('--sheet13', default='HD46-13', help='Sheet name for HD46-13 sample')
        ap.add_argument('--vcf8', default='HD46-8_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-8')
        ap.add_argument('--vcf13', default='HD46-13_ngmlr+sniffles_filtered.annotated.vcf', help='Annotated VCF for HD46-13')
        ap.add_argument('--out', default='merged_ngmlr+sniffles_variants_with_ANN.xlsx', help='Output Excel path')
        ap.add_argument('--debug_examples', type=int, default=15, help='How many non-match examples to print from each side')
        ap.add_argument('--pos_tolerance', type=int, default=1, help='Diagnostic tolerance (±N bp) for off-by-N checks (used for debug only)')
        args = ap.parse_args()
    
        excel_path = Path(args.excel)
        vcf8_path = Path(args.vcf8)
        vcf13_path = Path(args.vcf13)
        out_path = Path(args.out)
    
        # Load sheets (resolve case-insensitive names)
        xls = pd.ExcelFile(excel_path)
        def resolve_sheet(name: str) -> str:
            if name in xls.sheet_names:
                return name
            lower_map = {s.lower(): s for s in xls.sheet_names}
            return lower_map.get(name.lower(), name)
    
        sheet8 = resolve_sheet(args.sheet8)
        sheet13 = resolve_sheet(args.sheet13)
    
        df8 = pd.read_excel(excel_path, sheet_name=sheet8)
        df13 = pd.read_excel(excel_path, sheet_name=sheet13)
    
        # Parse VCFs (INS only)
        vcf8_df, ann_cols = parse_vcf_ann(vcf8_path)
        vcf13_df, _ = parse_vcf_ann(vcf13_path)
    
        print(f"VCF8 INS variants: {len(vcf8_df)}; VCF13 INS variants: {len(vcf13_df)}")
        print(f"ANN subfields ({len(ann_cols)}): {', '.join(ann_cols)}")
    
        # Merge with diagnostics
        df8_out = merge_ann_into_sheet(df8, vcf8_df, ann_cols,
                                    pos_tolerance=args.pos_tolerance,
                                    debug_examples=args.debug_examples)
        df13_out = merge_ann_into_sheet(df13, vcf13_df, ann_cols,
                                        pos_tolerance=args.pos_tolerance,
                                        debug_examples=args.debug_examples)
    
        # Save
        with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
            df8_out.to_excel(writer, sheet_name=sheet8, index=False)
            df13_out.to_excel(writer, sheet_name=sheet13, index=False)
    
        print(f"\nDone. Wrote: {out_path.resolve()}")
    
    if __name__ == '__main__':
        main()
  3. Manually merge all contents of ANN=? to a seperate column ‘ANN’ in the isolate-specific sheets in the Excel-file.

    #Add CHROM and HD46_Ctrl.1 to first column of the input Excel-file
    (plot-numpy1) jhuang@WS-2290C:~/DATA/Data_Patricia_Transposon_2025$ python add_ann_to_excel.py         --excel merged_ngmlr+sniffles_variants.xlsx         --sheet8 "HD46-8"         --sheet13 "HD46-13"         --vcf8 HD46-8_ngmlr+sniffles_filtered.annotated.vcf         --vcf13 HD46-13_ngmlr+sniffles_filtered.annotated.vcf         --out merged_ngmlr+sniffles_variants_with_ANN.xlsx
    #DEL some columns (INFO, NN_Allele, ANN_Rank, ANN_Errors_Warnings_Info, from the table, and COPY the summary-sheet to the final table.

Leave a Reply

Your email address will not be published. Required fields are marked *