Call and merge SNP and Indel results from using two variant calling methods

gene_x 0 like s 340 view s

Tags: variation, pipeline

  1. call variant calling using snippy
    mv snippy_HDRNA_01 snippy_CP133676
    
    mv snippy_HDRNA_03 snippy_CP133677
    
    cp -r snippy_HDRNA_06 snippy_CP133678
    mv snippy_HDRNA_06 snippy_CP133679
    
    cp -r snippy_HDRNA_07 snippy_CP133680
    mv snippy_HDRNA_07 snippy_CP133681
    
    cp -r snippy_HDRNA_08 snippy_CP133682
    mv snippy_HDRNA_08 snippy_CP133683
    
    cp -r snippy_HDRNA_12 snippy_CP133684
    cp -r snippy_HDRNA_12 snippy_CP133685
    cp -r snippy_HDRNA_12 snippy_CP133686
    mv snippy_HDRNA_12 snippy_CP133687
    
    cp -r snippy_HDRNA_16 snippy_CP133688
    cp -r snippy_HDRNA_16 snippy_CP133689
    cp -r snippy_HDRNA_16 snippy_CP133690
    cp -r snippy_HDRNA_16 snippy_CP133691
    mv snippy_HDRNA_16 snippy_CP133692
    
    cp -r snippy_HDRNA_17 snippy_CP133693
    cp -r snippy_HDRNA_17 snippy_CP133694
    mv snippy_HDRNA_17 snippy_CP133695
    
    cp -r snippy_HDRNA_19 snippy_CP133696
    cp -r snippy_HDRNA_19 snippy_CP133697
    cp -r snippy_HDRNA_19 snippy_CP133698
    mv snippy_HDRNA_19 snippy_CP133699
    
    cp -r snippy_HDRNA_20 snippy_CP133700
    mv snippy_HDRNA_20 snippy_CP133701
    
        #git clone https://github.com/huang/bacto
        #mv bacto/* ./
        #rm -rf bacto
    
    for sample in snippy_CP133678 snippy_CP133679 snippy_CP133680 snippy_CP133681 snippy_CP133682 snippy_CP133683 snippy_CP133684 snippy_CP133685 snippy_CP133686 snippy_CP133687 snippy_CP133688 snippy_CP133689 snippy_CP133690 snippy_CP133691 snippy_CP133692 snippy_CP133693 snippy_CP133694 snippy_CP133695 snippy_CP133696 snippy_CP133697 snippy_CP133698 snippy_CP133699 snippy_CP133700 snippy_CP133701; do
        cd ${sample};
        ln -s ~/Tools/bacto/db/ .;
        ln -s ~/Tools/bacto/envs/ .;
        ln -s ~/Tools/bacto/local/ .;
        cp ~/Tools/bacto/Snakefile .;
        cp ~/Tools/bacto/bacto-0.1.json .;
        cp ~/Tools/bacto/cluster.json .;
        cd ..;
    done
    #prepare raw_data and bacto-0.1.json
    #set "reference": "db/CP133***.gb" in bacto-0.1.json
    
    conda activate bengal3_ac3
    #cd snippy_CP133676
    #/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    #cd snippy_CP133677
    #/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd snippy_CP133678
    (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133679
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133680
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133681
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133682
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
    cd snippy_CP133683
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133684
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133685
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133686
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133687
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133688
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133689
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133690
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133691
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133692
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133693
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133694
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133695
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133696
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133697
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133698
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133699
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133700
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ../snippy_CP133701
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    cd ..
    

2, using spandx calling variants (almost the same results to the one from viral-ngs!)

    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
    cp PP810610.1.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build  PP810610     -d
    gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume

3, summarize all SNPs and Indels from the snippy result directory.

    #Output: snippy_CP133676/snippy/summary_snps_indels.csv
    python3 summarize_snippy_res.py snippy_CP133676/snippy

    #-- post-processing of Outputs_CP133676 produced by SPANDx (temporary not necessary) --
    #cd Outputs_CP133676/Phylogeny_and_annotation
    #awk '{if($3!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    #cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    #grep -v "/" f1_7 > f1_7_
    #grep -v "\." f1_7_ > f1_7__
    #grep -v "*" f1_7__ > f1_7___
    #grep -v "INDEL" f1_7___ > f1_7_SNPs
    #cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
    #grep -wFf <(awk 'NR>0 {print $1}' f1_7_SNPs_f2) All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    #grep -v "SNP" f1_7___ > f1_7_indels
    #cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
    #grep -wFf <(awk 'NR>0 {print $1}' f1_7_indels_f2) All_SNPs_indels_annotated_.txt > All_indels_annotated.txt

4, merge the following two files summary_snps_indels.csv (192) and All_SNPs_indels_annotated.txt (248) --> merged_variants_CP133676.csv (94)

    python3 merge_snps_indels.py snippy_CP133676/snippy/summary_snps_indels.csv Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt merged_variants_CP133676.csv
    #check if the number of the output file is correct?
    comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq) | wc -l
    comm -12 <(cut -d, -f2 snippy_CP133676/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 Outputs_CP133676/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq)

5, (optional, since it should not happed) filter rows without change in snippy_CP133676/snippy/summary_snps_indels.csv (94)

    awk -F, '
    NR == 1 {print; next}  # Print the header line
    {
        ref = $3;  # Reference base (assuming REF is in the 3rd column)
        same = 1;  # Flag to check if all bases are the same as reference
        samples = "";  # Initialize variable to hold sample data
        for (i = 6; i <= NF - 8; i++) {  # Loop through all sample columns, adjusting for the number of fixed columns
            samples = samples $i " ";  # Collect sample data
            if ($i != ref) {
                same = 0;
            }
        }
        if (!same) {
            print $0;  # Print the entire line if not all bases are the same as the reference
            #print "Samples: " samples;  # Print all sample data for checking
        }
    }
    ' merged_variants_CP133676.csv > merged_variants_CP133676_.csv

    #Explanation:
    #    -F, sets the field separator to a comma.
    #    NR == 1 {print; next} prints the header line and skips to the next line.
    #    ref = $3 sets the reference base (assumed to be in the 3rd column).
    #    same = 1 initializes a flag to check if all sample bases are the same as the reference.
    #    samples = ""; initializes a string to collect sample data.
    #    for (i = 6; i <= NF - 8; i++) loops through the sample columns. This assumes the first 5 columns are fixed (CHROM, POS, REF, ALT, TYPE), and the last 8 columns are fixed (Effect, Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length, Gene_name, Biotype).
    #    samples = samples $i " "; collects sample data.
    #    if ($i != ref) { same = 0; } checks if any sample base is different from the reference base. If found, it sets same to 0.
    #    if (!same) { print $0; print "Samples: " samples; } prints the entire line and the sample data if not all sample bases are the same as the reference.

6, improve the header

    sed -i '1s/_trimmed_P//g' merged_variants_CP133676.csv

7, check the REF and K1 have the same base and delete those records with difference.

    cut -f3 -d',' merged_variants_CP133676.csv > f3
    cut -f6 -d',' merged_variants_CP133676.csv > f6
    diff f3 f6
    awk -F, '$3 == $6 || NR==1' merged_variants_CP133676.csv > filtered_merged_variants_CP133676.csv #(93)
    cut -f3 -d',' filtered_merged_variants_CP133676.csv > f3
    cut -f6 -d',' filtered_merged_variants_CP133676.csv > f6
    diff f3 f6

8, MANUALLY REMOVE the column f6 in filtered_merged_variants_CP133676.csv, and rename CHROM to HDRNA_01_K01 in the header, summarize chr and plasmids SNPs of a sample together to a single list, save as an Excel-file.

9, code of summarize_snippy_res.py

    import pandas as pd
    import glob
    import argparse
    import os

    #python3 summarize_snps_indels.py snippy_HDRNA_01/snippy

    #The following record for 2365295 is wrong, since I am sure in the HDRNA_01_K010, it should be a 'G', since in HDRNA_01_K010.csv
    #CP133676,2365295,snp,A,G,G:178 A:0
    #
    #The current output is as follows:
    #CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,None,,,,,,None,None
    #CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan
    #grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
    #BUG: CP133676,2365295,A,snp,A,A,A,A,A,A,A,A,A,A,nan,,,,,,nan,nan

    import pandas as pd
    import glob
    import argparse
    import os

    def main(base_directory):
        # List of isolate identifiers
        isolates = [f"HDRNA_01_K0{i}" for i in range(1, 11)]
        expected_columns = ["CHROM", "POS", "REF", "ALT", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"]

        # Find all CSV files in the directory and its subdirectories
        csv_files = glob.glob(os.path.join(base_directory, '**', '*.csv'), recursive=True)

        # Create an empty DataFrame to store the summary
        summary_df = pd.DataFrame()

        # Iterate over each CSV file
        for file_path in csv_files:
            # Extract isolate identifier from the file name
            isolate = os.path.basename(file_path).replace('.csv', '')
            df = pd.read_csv(file_path)

            # Ensure all expected columns are present, adding missing ones as empty columns
            for col in expected_columns:
                if col not in df.columns:
                    df[col] = None

            # Extract relevant columns
            df = df[expected_columns]

            # Ensure consistent data types
            df = df.astype({"CHROM": str, "POS": int, "REF": str, "ALT": str, "TYPE": str, "EFFECT": str, "LOCUS_TAG": str, "GENE": str, "PRODUCT": str})

            # Add the isolate column with the ALT allele
            df[isolate] = df["ALT"]

            # Drop columns that are not needed for the summary
            df = df.drop(["ALT"], axis=1)

            if summary_df.empty:
                summary_df = df
            else:
                summary_df = pd.merge(summary_df, df, on=["CHROM", "POS", "REF", "TYPE", "EFFECT", "LOCUS_TAG", "GENE", "PRODUCT"], how="outer")

        # Fill missing values with the REF allele for each isolate column
        for isolate in isolates:
            if isolate in summary_df.columns:
                summary_df[isolate] = summary_df[isolate].fillna(summary_df["REF"])
            else:
                summary_df[isolate] = summary_df["REF"]

        # Rename columns to match the required format
        summary_df = summary_df.rename(columns={
            "CHROM": "CHROM",
            "POS": "POS",
            "REF": "REF",
            "TYPE": "TYPE",
            "EFFECT": "Effect",
            "LOCUS_TAG": "Gene_name",
            "GENE": "Biotype",
            "PRODUCT": "Product"
        })

        # Replace any remaining None or NaN values in the non-isolate columns with empty strings
        summary_df = summary_df.fillna("")

        # Add empty columns for Impact, Functional_Class, Codon_change, Protein_and_nucleotide_change, Amino_Acid_Length
        summary_df["Impact"] = ""
        summary_df["Functional_Class"] = ""
        summary_df["Codon_change"] = ""
        summary_df["Protein_and_nucleotide_change"] = ""
        summary_df["Amino_Acid_Length"] = ""

        # Reorder columns
        cols = ["CHROM", "POS", "REF", "TYPE"] + isolates + ["Effect", "Impact", "Functional_Class", "Codon_change", "Protein_and_nucleotide_change", "Amino_Acid_Length", "Gene_name", "Biotype"]
        summary_df = summary_df.reindex(columns=cols)

        # Remove duplicate rows
        summary_df = summary_df.drop_duplicates()

        # Save the summary DataFrame to a CSV file
        output_file = os.path.join(base_directory, "summary_snps_indels.csv")
        summary_df.to_csv(output_file, index=False)

        print("Summary CSV file created successfully at:", output_file)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Summarize SNPs and Indels from CSV files.")
        parser.add_argument("directory", type=str, help="Base directory containing the CSV files in subdirectories.")
        args = parser.parse_args()

        main(args.directory)

10, code of merge_snps_indels.py

    import pandas as pd
    import argparse
    import os

    def merge_files(input_file1, input_file2, output_file):
        # Read the input files
        df1 = pd.read_csv(input_file1)
        df2 = pd.read_csv(input_file2, sep='\t')

        # Merge the dataframes on the 'POS' column, keeping only the rows that have common 'POS' values
        merged_df = pd.merge(df2, df1[['POS']], on='POS', how='inner')

        # Remove duplicate rows
        merged_df.drop_duplicates(inplace=True)

        # Save the merged dataframe to the output file
        merged_df.to_csv(output_file, index=False)

        print("Merged file created successfully at:", output_file)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Merge two SNP and Indel files based on the 'POS' column.")
        parser.add_argument("input_file1", type=str, help="Path to the first input file (summary_snps_indels.csv).")
        parser.add_argument("input_file2", type=str, help="Path to the second input file (All_SNPs_indels_annotated.txt).")
        parser.add_argument("output_file", type=str, help="Path to the output file.")
        args = parser.parse_args()

        merge_files(args.input_file1, args.input_file2, args.output_file)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum