gene_x 0 like s 24 view s
Tags: pipeline
For Clinical and HF
a) Please proceed with SNP and indel calling for both clinical samples.
#-- DNAseq_2025_AYE: clinical vs CP149838 (Acinetobacter baumannii strain 2024CK-00130) --
mkdir -p bacto_clinical_vs_CP149838/raw_data; cd bacto_clinical_vs_CP149838;
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
cd raw_data
ln -s ../../X101SC25015922-Z01-J001/01.RawData/clinical/clinical_1.fq.gz clinical_R1.fastq.gz
ln -s ../../X101SC25015922-Z01-J001/01.RawData/clinical/clinical_2.fq.gz clinical_R2.fastq.gz
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
# Annotate the snippy results using SNP_matrix.sh from spandx
(spandx) cd bacto_clinical_vs_CP149838/snippy/clinical
(spandx) cp ~/Tools/spandx/bin/SNP_matrix.sh ./
(spandx) cp clinical.filt.vcf out.filtered.vcf
(spandx) cp clinical.filt.vcf out.vcf
#Adapt to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
(spandx) bash SNP_matrix.sh CP149838 .
#(ANNOTATED_RESULTS_SNIPPY) ~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_indels_annotated.txt -d$'\t' -o ../../../clinical_vs_CP149838_SNPs_indels_annotated.xls
#-- DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2: HF vs CP095177.1 (Enterobacter hormaechei strain K528) --
mkdir -p bacto_HF_vs_CP095177/raw_data; cd bacto_HF_vs_CP095177;
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
cd raw_data
ln -s ../../X101SC24115801-Z01-J001/01.RawData/HF/HF_1.fq.gz HF_R1.fastq.gz
ln -s ../../X101SC24115801-Z01-J001/01.RawData/HF/HF_2.fq.gz HF_R2.fastq.gz
mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
# Annotate the Snippy results using SNP_matrix.sh from SPANDx, disregarding the runtime error.
mamba activate /home/jhuang/miniconda3/envs/spandx
(spandx) cd bacto_HF_vs_CP095177/snippy/HF
(spandx) cp ~/Tools/spandx/bin/SNP_matrix.sh ./
(spandx) cp HF.filt.vcf out.filtered.vcf
(spandx) cp HF.filt.vcf out.vcf
#Adapt to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
(spandx) bash SNP_matrix.sh CP095177 .
#(ANNOTATED_RESULTS_SNIPPY) ~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_indels_annotated.txt -d$'\t' -o ../../../HF_vs_CP095177_SNPs_indels_annotated.xls
b) Additionally, I would like a phylogenetic tree (evolutionary relationship between our samples and other reference genomes) and
#using Dendogram tool
c) A sequence alignment figure (similar to this example: https://macvector.com/blog/2021/10/compare-a-pair-of-genomes/) comparing the reference genome with our samples.
# TODO: Using BRIG: see http://xgenes.com/article/article-content/325/analysis-of-snps-indels-transposons-and-is-elements-in-5-a-baumannii-strains/
* DNAseq_2025_AYE: others vs CU459141
cd ~/Tools/BRIG-0.95-dist
#IMPORTANT_CRITICAL_DEBUG: we should use jdk1.6, otherwise results in ERROR!
~/Tools/Java1.6/jdk1.6.0_45/bin/java -Xmx25000M -jar BRIG.jar
#1. Select Reference Genbank file in first textfield.
#2. Prepare assembled fasta-files and sepecific gene-sequences as "Query sequence" in the folder "~/DATA/Data_Tam_DNAseq_2025_AYE/shovill/ and ~/DATA/Data_Tam_DNAseq_2025_AYE/" by "cp AYE-Q/contigs.fa AYE-Q.fasta; ...; samtools faidx CP059040.fasta CP059040:1844319-1845509 > adeA.fasta; ..." in second textfield.
#3. Saved bundle_sessions --> saved all files in a seperate directory --> open session brigWorkspace_AYE (Note that sometimes the fasta-files is too large (severeal GBs) and all of them have been copied to "~/brigWorkspace_adeABadeIJ_adeIJK_CM1_CM2/gen")
* DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2: others to CP059040
samtools faidx CP059040.fasta CP059040:1844319-1845509 > adeA.fasta
samtools faidx CP059040.fasta CP059040:1845506-1848616 > adeB.fasta
samtools faidx CP059040.fasta CP059040:740422-741672 > adeI_.fasta
revcomp adeI_.fasta > adeI.fasta
samtools faidx CP059040.fasta CP059040:737233-740409 > adeJ_.fasta
revcomp adeJ_.fasta > adeJ.fasta
samtools faidx CP059040.fasta CP059040:735779-737233 > adeK_.fasta
revcomp adeK_.fasta > adeK.fasta
gene 1844319..1845509
/gene="adeA"
/locus_tag="H0N29_08675"
gene 1845506..1848616
/gene="adeB"
/locus_tag="H0N29_08680"
gene complement(740422..741672)
/gene="adeI"
/locus_tag="H0N29_03550"
gene complement(737233..740409)
/gene="adeJ"
/locus_tag="H0N29_03545"
gene complement(735779..737233)
/gene="adeK"
/locus_tag="H0N29_03540"
#jhuang@WS-2290C:~/brigWorkspace_adeABadeIJ_adeIJK_CM1_CM2/scratch$ vim adeAB.fastaVsCP059040.gb.fna.tab
#jhuang@WS-2290C:~/brigWorkspace_adeABadeIJ_adeIJK_CM1_CM2/scratch$ vim adeIJK.fastaVsCP059040.gb.fna.tab
#TODO: Optimize the cutoff so that vey short matches not used for the BRIG drawing!
#/home/jhuang/miniconda3/bin/blastn -outfmt 6 -query /home/jhuang/DATA/Data_Tam_DNAseq_2025_AYE/shovill/AYE-S.fasta -db /home/jhuang/brigWorkspace/scratch/CU459141.gb.fna -out /home/jhuang/brigWorkspace/scratch/AYE-S.fastaVsCU459141.gb.fna.tab -task blastn
"-evalue 1e-1 -max_target_seqs 1" #-strand both -num_threads 15 -outfmt 6
cd ~/brigWorkspace_adeABadeIJ_adeIJK_CM1_CM2/scratch/
cp /home/jhuang/Tools/bacto/db/CU459141.gb ~/brigWorkspace_AYE #DEBUG due to the empty genbank file
#mv AYE-Q.fastaVsCU459141.gb.fna.tab AYE-Q.fastaVsCU459141.gb.fna.tab_
#mv AYE-S.fastaVsCU459141.gb.fna.tab AYE-S.fastaVsCU459141.gb.fna.tab_
#mv AYE-WT_on_Tig4.fastaVsCU459141.gb.fna.tab AYE-WT_on_Tig4.fastaVsCU459141.gb.fna.tab_
#mv AYE-craA_on_Tig4.fastaVsCU459141.gb.fna.tab AYE-craA_on_Tig4.fastaVsCU459141.gb.fna.tab_
#mv AYE-craA-1_on_Cm200.fastaVsCU459141.gb.fna.tab AYE-craA-1_on_Cm200.fastaVsCU459141.gb.fna.tab_
#mv AYE-craA-2_on_Cm200.fastaVsCU459141.gb.fna.tab AYE-craA-2_on_Cm200.fastaVsCU459141.gb.fna.tab_
python3 ~/Scripts/filter_blast.py AYE-Q.fastaVsCU459141.gb.fna.tab_ AYE-Q.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py AYE-S.fastaVsCU459141.gb.fna.tab_ AYE-S.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py AYE-WT_on_Tig4.fastaVsCU459141.gb.fna.tab_ AYE-WT_on_Tig4.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py AYE-craA_on_Tig4.fastaVsCU459141.gb.fna.tab_ AYE-craA_on_Tig4.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py AYE-craA-1_on_Cm200.fastaVsCU459141.gb.fna.tab_ AYE-craA-1_on_Cm200.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py AYE-craA-2_on_Cm200.fastaVsCU459141.gb.fna.tab_ AYE-craA-2_on_Cm200.fastaVsCU459141.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
cp /home/jhuang/Tools/bacto/db/CP059040.gb ~/brigWorkspace_adeABadeIJ_adeIJK_CM1_CM2 #DEBUG due to the empty genbank file
#mv deltaAdeIJK.fastaVsCP059040.gb.fna.tab deltaAdeIJK.fastaVsCP059040.gb.fna.tab_
#mv deltaAdeABIJ.fastaVsCP059040.gb.fna.tab deltaAdeABIJ.fastaVsCP059040.gb.fna.tab_
#mv CM1.fastaVsCP059040.gb.fna.tab CM1.fastaVsCP059040.gb.fna.tab_
#mv CM2.fastaVsCP059040.gb.fna.tab CM2.fastaVsCP059040.gb.fna.tab_
python3 ~/Scripts/filter_blast.py deltaAdeIJK.fastaVsCP059040.gb.fna.tab_ deltaAdeIJK.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py deltaAdeABIJ.fastaVsCP059040.gb.fna.tab_ deltaAdeABIJ.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py CM1.fastaVsCP059040.gb.fna.tab_ CM1.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
python3 ~/Scripts/filter_blast.py CM2.fastaVsCP059040.gb.fna.tab_ CM2.fastaVsCP059040.gb.fna.tab --evalue 1e-10 --min_length 100 --min_identity 90
* DNAseq_2025_AYE: clinical vs gi|2707120326|gb|CP149838.1| (Acinetobacter baumannii strain 2024CK-00130)
mamba activate /home/jhuang/miniconda3/envs/bakta
bakta --db /mnt/nvme1n1p1/bakta_db ~/DATA/Data_Tam_DNAseq_2025_AYE/vrap_clinical/vrap_contig.fasta --prefix vrap_clinical
* DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2: HF vs gi|2239477175|gb|CP095177.1| (Enterobacter hormaechei strain K528)
bakta --db /mnt/nvme1n1p1/bakta_db ~/DATA/Data_Tam_DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2/vrap_HF/vrap_contig.fasta --prefix vrap_HF
# Using Artemis (ACT) to draw the comparisons (https://sanger-pathogens.github.io/Artemis/; https://github.com/sanger-pathogens/Artemis)
tar zxf artemis-unix-release-{version}.tar.gz
~/Scripts/embl_to_gbk.py CP149838.embl CP149838.gbk
~/Scripts/embl_to_gbk.py vrap_clinical.embl clinical.gbk
~/Scripts/embl_to_gbk.py CP095177.embl CP095177.gbk
~/Scripts/embl_to_gbk.py vrap_HF.embl HF.gbk
TODO_NEXT_WEEK: comparison_file_1 is defined as follows: Common formats: .crunch (BLAST), .delta (MUMmer), or .align (Exonerate).
blastn -query ref_genome.fasta -subject sample_genome.fasta -outfmt 6 > blast.crunch
# Step 1: Extract FASTA from GenBank (if needed for alignment)
~/Scripts/genbank2fasta.py CP149838.gbk
~/Scripts/genbank2fasta.py clinical.gbk
# Step 2: Run BLAST/MUMmer to generate comparison file
blastn -query CP149838.gbk_converted.fna -subject clinical.gbk_converted.fna -outfmt 6 > blast.crunch
Input:
Sequence file 1: reference.gb (annotated reference genome).
Sequence file 2: sample.gb (annotated sample genome).
Comparison file 1: nucmer.delta (generated with nucmer reference.fasta sample.fasta).
Output:
ACT will display both genomes with annotations, aligned based on the .delta file.
The image appears to be a genome visualization, possibly from a tool like Artemis, ACT (Artemis Comparison Tool), or another genome browser. Here’s what’s likely happening in the plot:
Genome Overview
Three Rows for Forward Strand and Three Rows for Reverse Strand
Each genome sequence has genes and annotations on both forward and reverse strands.
The three rows in each direction represent different levels of gene annotations to avoid overlap and make visualization easier.
Why Three Rows?
Genes in a genome can be closely packed or even overlapping.
To prevent clutter, genome visualization tools often distribute overlapping genes across multiple rows.
This is purely for visual clarity and does not indicate different types of genes.
Key Features in the Image
Black vertical bars: Represent DNA sequences.
Blue/Green Annotations: Likely represent genes or coding sequences (CDS).
Gray Annotations: Might be regulatory elements or pseudogenes.
Thicker Blue Section: Could represent regions of high similarity between sequences
#Some genome browsers use frame lines to indicate the three possible reading frames on each strand.
#Top three lines: Represent the three possible reading frames of the forward strand (5' → 3').
#Bottom three lines: Represent the three possible reading frames of the reverse strand (3' → 5').
#Light grey areas = ORFs (regions that can actually be translated into proteins).
#If an area is not light grey, it likely means that no ORF exists in that reading frame, meaning there are stop codons breaking the sequence.
#TODO_NEXT_MONDAY: how to zoom in the ACT browser?
d) Would it be possible to perform genome annotation for these samples?
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-Q ../shovill/AYE-Q.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-S ../shovill/AYE-S.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-WT_on_Tig4 ../shovill/AYE-WT_on_Tig4.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-craA_on_Tig4 ../shovill/AYE-craA_on_Tig4.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-craA-1_on_Cm200 ../shovill/AYE-craA-1_on_Cm200.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain AYE-craA-2_on_Cm200 ../shovill/AYE-craA-2_on_Cm200.fasta
cd ~/DATA/Data_Tam_DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2/bakta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain deltaAdeIJK ../shovill/deltaAdeIJK.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain deltaAdeABIJ ../shovill/deltaAdeABIJ.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain CM1 ../shovill/CM1.fasta
bakta --db /mnt/nvme1n1p1/bakta_db --genus Acinetobacter --species baumannii --strain CM2 ../shovill/CM2.fasta
#END
e) Could you please perform a search for HF sample against CP095179 and CP095178 as Enterobacter hormaechei strain K528 harbours two plasmids.
In addition to my comments from my previous email, could you please perform a search for HF sample against CP095179 and CP095178 as Enterobacter hormaechei strain K528 harbours two plasmids.
Solution: Using the bowtie of vrap to map the reads on ref_genome/reference.fasta (The reference refers to the closest related genome found from the list generated by vrap)
mamba activate /home/jhuang/miniconda3/envs/vrap
(vrap) vrap/vrap.py -1 HF_trimmed_P_1.fastq.gz -2 HF_trimmed_P_2.fastq.gz -o vrap_HF_on_CP095178_and_CP095179 --host /home/jhuang/DATA/Data_Tam_DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2/CP095178_CP095179.fasta -t 100 -l 200 -g
cd bowtie
mv mapped mapped.sam
samtools view -S -b mapped.sam > mapped.bam
samtools sort mapped.bam -o mapped_sorted.bam
samtools index mapped_sorted.bam
samtools view -H mapped_sorted.bam
samtools flagstat mapped_sorted.bam
#14634302 + 0 in total (QC-passed reads + QC-failed reads)
#0 + 0 secondary
#0 + 0 supplementary
#0 + 0 duplicates
#177416 + 0 mapped (1.21% : N/A)
#11014010 + 0 paired in sequencing
#5507005 + 0 read1
#5507005 + 0 read2
#118328 + 0 properly paired (1.07% : N/A)
#123066 + 0 with itself and mate mapped
#10632 + 0 singletons (0.10% : N/A)
#0 + 0 with mate mapped to a different chr
#0 + 0 with mate mapped to a different chr (mapQ>=5)
samtools depth -m 0 -a mapped_sorted.bam > coverage.txt
grep "CP095178" coverage.txt > CP095178_coverage.txt
grep "CP095179" coverage.txt > CP095179_coverage.txt
import pandas as pd
import matplotlib.pyplot as plt
# Load coverage data
df = pd.read_csv("CP095178_coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"])
# Plot
plt.figure(figsize=(10,4))
plt.plot(df["pos"], df["coverage"], color="blue", linewidth=0.5)
plt.xlabel("Genomic Position")
plt.ylabel("Coverage Depth")
plt.title("BAM Coverage Plot")
plt.show()
RESULTS: say CP095178 does not exist, but CP095179 exists (see attached coverage plots)!
Kindly perform a comparative analysis on this data (DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2 against CP059040):
adeABadeIJ sample (knockout of adeA - H0N29_08675, adeB - H0N29_08680, adeI - H0N29_03550, and adeJ - H0N29_03545, please confirm whether these genes are successfully knocked out.)
adeIJK sample (knockout of adeI - H0N29_03550, adeJ - H0N29_03545, and adeK - H0N29_03540,, please confirm whether these genes are successfully knocked out.)
for sample in adeABadeIJ adeIJK CM1 CM2; do
makeblastdb -in ~/DATA/Data_Tam_DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2/shovill/${sample}/contigs.fa -dbtype nucl
done
for id in adeA adeB adeI adeJ adeK; do
echo "mkdir ${id}"
echo "for sample in adeABadeIJ adeIJK CM1 CM2; do"
echo "blastn -db ~/DATA/Data_Tam_DNAseq_2025_adeABadeIJ_adeIJK_CM1_CM2/shovill/\${sample}/contigs.fa -query ${id}.fasta -evalue 1e-1 -num_threads 15 -outfmt 6 -strand both -max_target_seqs 1 > ./${id}/\${sample}.blastn"
echo "done"
done
cp mlst/all_mlst_may_be_wrong.txt typing.csv
python ~/Scripts/process_directory.py adeA typing.csv typing_until_adeA.csv
python ~/Scripts/process_directory.py adeB typing_until_adeA.csv typing_until_adeB.csv
python ~/Scripts/process_directory.py adeI typing_until_adeB.csv typing_until_adeI.csv
python ~/Scripts/process_directory.py adeJ typing_until_adeI.csv typing_until_adeJ.csv
python ~/Scripts/process_directory.py adeK typing_until_adeJ.csv typing_until_adeK.csv
#Note that the length of adeK is 1455 nt long. In the sample adeIJK sample, we can find the subsequence from the position 14 to 1455 of the genes in the contig00007 (position 44108-45549).
点赞本文的读者
还没有人对此文章表态
没有评论
Comprehensive smallRNA-7 profiling using exceRpt pipeline with full reference databases (v2)
Mapping of reads to selected viruses in DAMIAN results (version 2)
Mapping of reads to selected viruses in DAMIAN results
Run the viral-ngs Snakemake pipelines inside a Docker environment
© 2023 XGenes.com Impressum