-
run tpp https://transit.readthedocs.io/en/latest/transit_running.html
#-maxreads 10000 or not_given for take all! #-primer AGATGTGTATAAGAGACAG the default primer of Tn5 is TAAGAGACAG! #-primer-start-window 0,159 set 0,159 as default! python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10_chimera -mismatches initial_mutants_rep1 -bwa-alg mem -primer ACCTACCCCNCCGCTCTC -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg initial_mutants_rep1.tpp.cfg python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R1_001.fastq -reads2 ./240405_VH00358_89_AAFC5MTM5/kr1/initial_mutants_rep1_S25_R2_001.fastq -output 10 -mismatches initial_mutants_rep1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg initial_mutants_rep1.tpp.cfg python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr3/LB_culture_rep1_S26_R2_001.fastq.gz -output LB_culture_rep1 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg LB_culture_rep1.tpp.cfg python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr9/intracellular_mutants_24h_rep1_S29_R2_001.fastq.gz -output intracellular_mutants_24h_rep1 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg intracellular_mutants_24h_rep1.tpp.cfg python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr8/extracellular_mutants_24h_rep2_S28_R2_001.fastq.gz -output extracellular_mutants_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg extracellular_mutants_24h_rep2.tpp.cfg python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m.fna -reads1 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R1_001.fastq.gz -reads2 ./240405_VH00358_89_AAFC5MTM5/kr6/growthout_control_24h_rep2_S27_R2_001.fastq.gz -output growthout_control_24h_rep2 -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689 mv tpp.cfg growthout_control_24h_rep2.tpp.cfg #END cp initial_mutants_rep1.tn_stats initial_mutants_rep1.tn_stats_ #Delete all general statistics before the table data in initial_mutants_rep1.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):" sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_ sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_ python3 ../parse_tn_stats.py initial_mutants_rep1.tn_stats_ initial_mutants_rep1.tn_stats.xlsx python3 ../parse_tn_stats.py LB_culture_rep1.tn_stats_ LB_culture_rep1.tn_stats.xlsx #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)" 441057 and 276060 mkdir initial_mutants_rep1_wig mv *.wig initial_mutants_rep1_wig/ zip -r initial_mutants_rep1_wig.zip initial_mutants_rep1_wig/ #The counts-files are too big, not nessasary to send: #~/Tools/csv2xls-0.4/csv_to_xls.py initial_mutants_rep1.tn_stats_ *.counts -d$',' -o initial_mutants_rep1.stats.xls; # contig_1_10 699 411 # contig_1_8 3206 2031 # contig_1_7 3787 2376 # contig_1_6 4604 2871 # contig_1_5 2794 1765 # contig_1_4 83 58 # contig_1_3 2944 1882 # contig_1_2 15446 9678 # contig_1_1 14391 8954
-
delete PCR-duplicate for checking if the template counts in tpp are correct
To address the issue of PCR duplicates in paired-end sequencing data (which you referred to as "PCA-duplicate," but I believe you meant "PCR duplicate"), you can use tools designed for post-alignment processing of SAM/BAM files. These tools identify and remove or mark duplicates where both reads of a pair are identical to another pair in the library, often indicating that they are duplicates resulting from PCR amplification rather than unique sequencing events. Step 0. Install Samtools and Picard Tools for Removing PCR Duplicates: Both Samtools and Picard are widely used for manipulating SAM/BAM files, including removing duplicates. Here's how you can use Picard to remove PCR duplicates: conda install -c bioconda picard samtools Step 1: Convert SAM to BAM and sort the BAM file If your file is in SAM format, you need to convert it to BAM format first using Samtools; PCR duplicate removal requires that the BAM file be sorted by coordinate. This can be done using Samtools: samtools view -Sb initial_mutants_rep1.sam > initial_mutants_rep1.bam samtools sort initial_mutants_rep1.bam -o initial_mutants_rep1_sorted.bam #557566 + 0 read1 #557566 + 0 read2 samtools index initial_mutants_rep1_sorted.bam # contig_1_1 14391 8954 #@SQ SN:gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1 LN:84292 # Extract reads from contig_1_1 samtools view -b initial_mutants_rep1_sorted.bam "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1" > contig_1_1.bam # Run flagstat on the filtered BAM file samtools flagstat contig_1_1.bam #16589 + 0 read1 #16579 + 0 read2 Step 3: Mark or Remove Duplicates Using Picard Picard Tools can be used to mark duplicates. Here, I'll show you how to remove them: # -Xmx4g between java and -jar # java -jar /usr/local/bin/picard.jar picard MarkDuplicates \ I=contig_1_1.bam \ O=contig_1_1_no_duplicates.bam \ M=marked_dup_metrics.txt \ REMOVE_DUPLICATES=true # ## METRICS CLASS picard.sam.DuplicationMetrics #LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE #Unknown Library 111 16473 81 111 90 6114 0 0.372629 16270 samtools flagstat contig_1_1_no_duplicates.bam #10405 + 0 read1 #10445 + 0 read2 This command marks duplicates (PCR and optical) and removes them, outputting a file without duplicates. The M option specifies where to write metrics about the duplicates. Step 3 (Alternatively). Using samtools to Remove Duplicates Directly Alternatively, if you prefer a simple and fast tool, samtools has a rmdup utility, but it's less versatile compared to Picard: samtools rmdup -S input_sorted.bam output_no_duplicates.bam Note that samtools rmdup is deprecated in recent versions of samtools because it does not handle all edge cases as well as Picard. Considerations Read Alignment: Ensure that your reads are aligned correctly and that the SAM/BAM files are error-free before duplicate removal. Optical vs. PCR Duplicates: Picard differentiates between PCR duplicates (originating from the same fragment of DNA) and optical duplicates (artifacts from the sequencing platform). Ensure that this distinction is clear if relevant for your analysis. Quality Checks: After removing duplicates, it's wise to perform quality control checks to understand how much of your data was affected by duplicates.
-
Check how the failed trimmed reads not contain the the transposon:genomic boundary TAAGAGACAG
#-3.1- find read sequences >VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG ACCTACCCCNCCGCTCTCATCAACCCAATAACGCAGGCAATCAAGCACCCACTGCATCACATAAGGTTGGCTAAGGCGCAATGTATTGCCACAACCGGTCATGTTGTCATATTCACCCTCAGAGGTGAGCCAGTAGTAGCTGGCATTGTCGATCCCACG >VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG CAAAGAGGGGGGCGAAAAGATTTTAAACGATCTTGGCGAAATGAATTTTGAGTTTGTCGTGTGTGACTCACCTGCCGGTATCGAAAGCGGTGCGTTGATGGCACTGTATTTTGCTGACG >VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG TCTTTACCGGGGATGGGACGCAAGATCTGCGCGCACTGGAACCGGCTTATGTTTCCTCCGTTGACAGTGGGAATCTGGCTGGGCATTTGATTGTACTGGCCAATACCTGTGAAGAGTGGGCCGCAGAACCCTTAGCGGCCAACGGGGCCAAGGGATTG >VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG CACCTACCCCCCAGCTCTCATCAACCACAATTGACGCAACATCAGCTGGCGCCATGGCATTAATAACAAACTGGATGAATGGCCCCTGGGATCAACAGCACGAACAAACCGCCGTGTGGTGCCATCAGTTGCGCGCCGAAGGCCATTGACAGAGCACC >VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG CCTACACCCCCGCTCTCATCAACCGCAGCAAAGAAAACGAAAATAATGAAGTCATAAAACTCAAGCGCTCCGCCTAAAGCCGCCAGAGTGAGGGTTTTATAATCTTGCTTATTCAGCCGACGGTTATGAT >VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG CCTACACCCCAGCTCTCATCAACCAATAACCAGTCAACATCACTGACATCATGTTGCTGGCAATATTCCAGCAGTGCATCTGGCGCCCAAAAACGCGCGCGATCCCCTGCTCGCTGACTTAATACTTGAGTGGCGGCCTTGCCCGGCCCAGTAACCCA >VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG ACCTACCACCAAGCTCTCATCAACCTGAATGGATTGAGGGCTACGCTCCCAGATCTGTTCTGCCAATTGGCGCAGATGGTTATCGGTGCAATACAAATGCACAAATGACAAATCGTCTCTTACATCTCAGCTCGATAAACTGCCCTTTGGCATAATGC >VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG GTTGGCGTCCGGCATCTGCATATCACTCATAAAGCGCCTCGATAATCCCACGGATATCCTGGTCGCTAGCTGGGCGCGGGTTGGTGCGCAATGTGAGCTCGGCCAGGGCTGCAGCAATCATATCGGGCAGATGTTGCTGCAATTGGCGCTCATTAACTT >VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG GCACTGGAAGAGCCGACTAGCCTCAATACTCTTGAACTGCTACCGGAATTATTTGCCGCCAATATTGCCTCGGTGAAAATTGAAGGGCGTCAGCGCAGCCCGGCTTATGTCAGCCAGGTGGCGAAAGTGTGGCGGCAGGCAATTGACCGCTATCTGGC >VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG TAAATAGGCCAGACTTGAAATCACACGATCCGGCCAGCGATT #-3.2- generate the genomic sequences blastn -db ../WA-314_m.fna -query initial_mutants_rep1.trimmed1_failed_trim_n20 -out failed_trim_n20_on_yersinia.blastn -evalue 10000 -num_threads 124 -outfmt 6 -max_target_seqs 1 VH00358:89:AAFC5MTM5:1:1101:63544:1019_:N:0:TTTCTCTA+CTCGACG gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51 100.000 141 0 0 19 159 20552 20692 1.20e-70 261 VH00358:89:AAFC5MTM5:1:1101:63960:1019_:N:0:TTTCTCTA+CTCGACG gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38 98.319 119 2 0 1 119 43674 43556 3.14e-55 209 VH00358:89:AAFC5MTM5:1:1101:64150:1019_:N:0:TTTCTCTA+CTCGACG gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12 98.101 158 3 0 1 158 59515 59672 4.26e-75 276 VH00358:89:AAFC5MTM5:1:1101:65400:1019_:N:0:TTTCTCTA+CTCGACG gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2 100.000 57 0 0 18 74 78361 78417 5.91e-24 106 VH00358:89:AAFC5MTM5:1:1101:62048:1038_:N:0:TTTCTCTA+CTCGACG gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1 100.000 110 0 0 21 130 78749 78640 1.62e-53 204 VH00358:89:AAFC5MTM5:1:1101:64623:1038_:N:0:TTTCTCTA+CTCGACG gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23 99.329 149 0 1 10 158 54935 55082 7.13e-73 268 VH00358:89:AAFC5MTM5:1:1101:65343:1038_:N:0:TTTCTCTA+CTCGACG gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3 99.301 143 1 0 16 158 46439 46581 4.29e-70 259 VH00358:89:AAFC5MTM5:1:1101:63317:1057_:N:0:GTTCTCTA+CTCGACG gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25 96.226 159 6 0 1 159 36032 36190 1.20e-70 261 VH00358:89:AAFC5MTM5:1:1101:63430:1057_:N:0:TTTCTCTA+CTCGACG gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2 100.000 158 0 0 1 158 54938 55095 4.23e-80 292 VH00358:89:AAFC5MTM5:1:1101:58299:1076_:N:0:TTTCTCTA+CTCGACG gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31 97.619 42 1 0 1 42 25535 25576 1.04e-14 73.1 samtools faidx WA-314_m.fna "gi|420260421|ref|NZ_AKKR01000094.1|contig_3_51":20534-20692 > genome_rg_for_read1.fasta samtools faidx WA-314_m.fna "gi|420258256|ref|NZ_AKKR01000038.1|contig_1_38":43556-43674 > genome_rg_for_read2.fasta samtools faidx WA-314_m.fna "gi|420257402|ref|NZ_AKKR01000018.1|contig_1_12":59515-59672 > genome_rg_for_read3.fasta samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":78344-78501 > genome_rg_for_read4.fasta samtools faidx WA-314_m.fna "gi|420257081|ref|NZ_AKKR01000009.1|contig_1_1":78640-78769 > genome_rg_for_read5.fasta samtools faidx WA-314_m.fna "gi|420259377|ref|NZ_AKKR01000067.1|contig_3_23":54926-55082 > genome_rg_for_read6.fasta samtools faidx WA-314_m.fna "gi|420258810|ref|NZ_AKKR01000051.1|contig_3_3":46424-46581 > genome_rg_for_read7.fasta samtools faidx WA-314_m.fna "gi|420257889|ref|NZ_AKKR01000029.1|contig_1_25":36032-36190 > genome_rg_for_read8.fasta samtools faidx WA-314_m.fna "gi|420260713|ref|NZ_AKKR01000106.1|contig_5_2":54938-55095 > genome_rg_for_read9.fasta samtools faidx WA-314_m.fna "gi|420259608|ref|NZ_AKKR01000075.1|contig_3_31":25535-25576 > genome_rg_for_read10.fasta #-3.3- transposon amplicon start sequence >Transposon_ampli_start ACCTACAACAAAGCTCTCATCAAC CGTGGCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTCAGGGTTGAGATGTGTA TAAGAGACAG #-3.4- generate 10 failed_${read}_group.fasta and multiple align them for read in read1 read2 read3 read4 read5 read6 read7 read8 read9 read10; do cat failed_${read}.fasta genome_rg_for_${read}.fasta transposon_ampli_start.fasta > failed_${read}_group.fasta mafft --adjustdirection --clustalout failed_${read}_group.fasta > failed_${read}_group.aln done cat failed_read1_group.aln failed_read2_group.aln failed_read3_group.aln failed_read4_group.aln failed_read5_group.aln failed_read6_group.aln failed_read7_group.aln failed_read8_group.aln failed_read9_group.aln failed_read10_group.aln > failed_reads_group.aln #-3.5- show the results in checking_failed_reads.pdf
-
Source code of parse_tn_stats.py used in the point 1.
import argparse import pandas as pd def parse_tn_stats_detailed(file_path): with open(file_path, 'r') as file: lines = file.readlines() print("Number of lines read:", len(lines)) if lines: print("First few lines:", lines[:10]) metrics_df = pd.DataFrame() current_metric = None metric_data = {} for line in lines: if line.startswith('# ') and not line.startswith('# '): # Ensure it's not a contig data line # Metric definition if current_metric and metric_data: # Save data before starting new metric for contig, value in metric_data.items(): metrics_df.at[contig, current_metric] = value metric_data = {} # Extract the metric name current_metric = line.split(':')[0].strip('# ') print("Processing new metric:", current_metric) # Debug output else: # Data lines for contigs under the current metric parts = line.strip().split(':') contig = parts[0].strip() value = parts[1].strip() try: value = float(value) if value.lower() != 'nan' else pd.NA except ValueError: value = pd.NA # In case of conversion failure metric_data[contig] = value # Capture the last metric data after finishing all lines if current_metric and metric_data: for contig, value in metric_data.items(): metrics_df.at[contig, current_metric] = value metrics_df.fillna(value=pd.NA, inplace=True) return metrics_df def main(): parser = argparse.ArgumentParser(description="Parse detailed TN stats from a file and output to Excel.") parser.add_argument("input_file", help="Input file path for TN stats.") parser.add_argument("output_file", help="Output Excel file path.") args = parser.parse_args() # Parse the detailed metrics from the provided file path detailed_metrics_df = parse_tn_stats_detailed(args.input_file) # Save the DataFrame to an Excel file with pd.ExcelWriter(args.output_file, engine='openpyxl') as writer: detailed_metrics_df.to_excel(writer, sheet_name='Detailed Metrics') print(f"Final DataFrame saved to Excel: {args.output_file}") if __name__ == "__main__": main()
Processing transposon insertion-site deep sequencing (Tn-seq) data
Leave a reply