GAMOLA2 pre- and postprocessing

gene_x 0 like s 876 view s

Tags: software, processing, bash

#-- view process.py --
with open("MT880872_CDS.fasta", "r") as f:
    for line in f:
        if line.startswith(">"):
            header_words = line.strip().split("locus_tag=")
            third_word = header_words[1]
            third_word_ = third_word.strip().split("]")
            third_word__ = third_word_[0]
            print(">"+third_word__)
        else:
            print(line.strip())



 #input_file: MT880870_.fasta as pan_genome_reference_.fa; MT880870_.fasta is modified from MT880870.fasta by simplying the fasta headers.
echo ">MT880870_genes" > MT880870_genes.fa;
merge_seq.py MT880870_.fasta >> MT880870_genes.fa;
samtools faidx MT880870_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880870_.fasta > length.txt 
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880870_genes.fa.combined;
cp MT880870_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences

python process.py > MT880871_.fasta
echo ">MT880871_genes" > MT880871_genes.fa;
merge_seq.py MT880871_.fasta >> MT880871_genes.fa;
samtools faidx MT880871_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880871_.fasta > length.txt 
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880871_genes.fa.combined;
cp MT880871_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences

python process.py > MT880872_.fasta
echo ">MT880872_genes" > MT880872_genes.fa;
merge_seq.py MT880872_.fasta >> MT880872_genes.fa;
samtools faidx MT880872_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880872_.fasta > length.txt 
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880872_genes.fa.combined;
cp MT880872_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences



#---- 4 ----
cd /media/jhuang/Titisee/GAMOLA2
#WARNING: !!swissprot as default database, manually choose nr as Blast_db --> too slow, better choose swissprot, it is quick!!
./Gamola.pl    #No Glimmer model and Critica database due to self-extracted ORF; choosing nr.pal or swissprot.pal as Blast_db; COG2014 as COG_db; Pfam-A.hmm as Pfam_db; No Rfam_db; TIGRFAMS_15.0_HMM.LIB as TIGRfam_db
/media/jhuang/Titisee/GAMOLA2/Results/COG_results
#grep "Class: ," roary_182s_95.fa_COG_*
for sample in 10601 10710 11187 11377 12474 12504  1520 1551 2092 2160 2467   289 3455 4694 5862 5863 6166 6464 7618 8007 8406 8784 9157 9373 953; do
  sed -i -e 's/Class: ,/Class: None, None/g' roary_182s_95.fa_COG_${sample}
done
./Gamola.pl



#---- 5 ---- update locustag
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880870_genes.fa/MT880870_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880870_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880870_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880870_genes_.fa.gb
#89600270 Jan 26 13:44 roary_182s_95__.fa.gb
#-- if ran [3.2 selected]--
python ~/Scripts/update_locustag.py MT880870_genes_.fa.gb MT880870_.fasta.fai > MT880870_genes__.fa.gb
> MT880870_genes__.fa.gb    #clean old *__.fa.gb



#---- 6.6 ----
cut -d$'\t' -f1 MT880870_.fasta.fai > MT880870_f1
process_gamola_gb_plus.py MT880870_f1 MT880870_genes__.fa.gb > annotated_MT880870.txt  #!!

Some fields may be wrong.
  BiopythonParserWarning)
/usr/local/lib/python2.7/dist-packages/Bio/GenBank/__init__.py:1047: BiopythonParserWarning: Ignoring invalid location: '1186620..1185596'
sed -i -e 's/1186620\.\.1185596/1185596\.\.1186620/g' roary__.fa.gb
#DEBUG "group_12676" -> "group_12667"

#---- ADD DNA-Sequences add the end of table as the last column ----
cut -d',' -f1-1 annotated_MT880870.txt > get_seq_ORF.sh
#extend the file get_seq_ORF.sh to extract all DNA-sequences
#mv the bash file under DIR of MT880870_.fasta.fai 
#samtools faidx MT880870_.fasta "PLKLOBMN_00001" > seq_ORF.fasta
#samtools faidx MT880870_.fasta "PLKLOBMN_00002" >> seq_ORF.fasta
#or in the case simply "cp MT880870_.fasta seq_ORF.fasta"
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF.fasta > seq_ORF_.fasta
grep -v ">" seq_ORF_.fasta > seq_ORF__.txt
#add "Sequence" to the first line of seq_ORF__.txt
paste -d',' annotated_MT880870.txt seq_ORF__.txt > annotated_MT880870_.txt




#---- repeat 5-->6.6 for MT880871 and MT880872 --------------
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880871_genes.fa/MT880871_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880871_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880871_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880871_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880871_genes_.fa.gb MT880871_.fasta.fai > MT880871_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880871_.fasta.fai > MT880871_f1
process_gamola_gb_plus.py MT880871_f1 MT880871_genes__.fa.gb > annotated_MT880871.txt

cp MT880871_.fasta seq_ORF_71.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF_71.fasta > seq_ORF_71_.fasta
grep -v ">" seq_ORF_71_.fasta > seq_ORF_71__.txt
#add 'Sequence' in the first line in seq_ORF_71__.txt
paste -d',' annotated_MT880871.txt seq_ORF_71__.txt > annotated_MT880871_.txt

#--

cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880872_genes.fa/MT880872_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880872_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880872_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880872_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880872_genes_.fa.gb MT880872_.fasta.fai > MT880872_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880872_.fasta.fai > MT880872_f1
process_gamola_gb_plus.py MT880872_f1 MT880872_genes__.fa.gb > annotated_MT880872.txt

cp MT880872_.fasta seq_ORF_72.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF_72.fasta > seq_ORF_72_.fasta
grep -v ">" seq_ORF_72_.fasta > seq_ORF_72__.txt
#add 'Sequence' in the first line in seq_ORF_72__.txt
paste -d',' annotated_MT880872.txt seq_ORF_72__.txt > annotated_MT880872_.txt

mv annotated_MT880870_.txt annotated__MT880870_.txt
#~/Tools/csv2xls-0.4/csv_to_xls.py annotated__MT880870_.txt annotated_MT880871_.txt annotated_MT880872_.txt -d',' -o annotated_MT880870-MT880872.xls

The script that takes two file names as command line arguments, merges the files based on matching first words, removes the key from file2, and deletes all lines in the merged file in which the key does not occur in file2.

import sys

#"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","Gene","Product","PFAM Label","PFAM Match", "PFAM Interpro", "PFAM Product", "TIGR Label", "TIGR Product","COG Label", "COG Code","COG Product","Translation"  #delete "Locus Tag",
#python3 merge2.py hS_vs_TSB_output/background annotated_CP052994_2996.txt hS_vs_TSB_output/background_
#python3 merge2.py hS_vs_TSB_output/downregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/downregulated_filtered_
#python3 merge2.py hS_vs_TSB_output/upregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/upregulated_filtered_
#python3 merge2.py infection_vs_nose_output/background annotated_CP052994_2996.txt infection_vs_nose_output/background_
#python3 merge2.py infection_vs_nose_output/downregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/downregulated_filtered_
#python3 merge2.py infection_vs_nose_output/upregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/upregulated_filtered_
#sed -i -e 's/HKH70_09175/HKH70_09175 (PLKLOBMN_00173)/g' background_
#sed -i -e 's/HKH70_05165/HKH70_05165 (PLKLOBMN_00016)/g' background_
#sed -i -e 's/HKH70_05170/HKH70_05170 (PLKLOBMN_00017)/g' background_
#sed -i -e 's/HKH70_05175/HKH70_05175 (PLKLOBMN_00018)/g' background_
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_hS_vs_TSB.xls
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_infection_vs_nose.xls
# Check if both file names were provided as command line arguments
if len(sys.argv) != 4:
    print("Usage: python merge2.py file1.txt file2.txt output_file.txt")
    sys.exit(1)

# Open file1 for reading and file2 for editing
with open(sys.argv[1], 'r') as f1, open(sys.argv[2], 'r+') as f2:
    # Read file2 into a dictionary
    f2_dict = {}
    for line in f2:
        key, content = line.strip().split(',', 1)
        f2_dict[key] = content

    # Merge file1 and file2
    merged_lines = []
    for line in f1:
        key = line.split(',')[0]
        if key in f2_dict:
            content = f2_dict[key]
            merged_lines.append(line.strip() + ',' + content)

    ## Move the file pointer to the beginning of file2
    #f2.seek(0)
    ## Truncate file2 to delete all its contents
    #f2.truncate()
    ## Rewrite file2 with the updated content
    #for key, content in f2_dict.items():
    #    f2.write(content + '\n')

    # Overwrite file1 with the merged content
    with open(sys.argv[3], 'w') as f3:
        f3.write('\n'.join(merged_lines))

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

GAMOLA2 pre- and postprocessing

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章