gene_x 0 like s 498 view s
Tags: software, processing, bash
#-- view process.py --
with open("MT880872_CDS.fasta", "r") as f:
for line in f:
if line.startswith(">"):
header_words = line.strip().split("locus_tag=")
third_word = header_words[1]
third_word_ = third_word.strip().split("]")
third_word__ = third_word_[0]
print(">"+third_word__)
else:
print(line.strip())
#input_file: MT880870_.fasta as pan_genome_reference_.fa; MT880870_.fasta is modified from MT880870.fasta by simplying the fasta headers.
echo ">MT880870_genes" > MT880870_genes.fa;
merge_seq.py MT880870_.fasta >> MT880870_genes.fa;
samtools faidx MT880870_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880870_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880870_genes.fa.combined;
cp MT880870_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
python process.py > MT880871_.fasta
echo ">MT880871_genes" > MT880871_genes.fa;
merge_seq.py MT880871_.fasta >> MT880871_genes.fa;
samtools faidx MT880871_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880871_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880871_genes.fa.combined;
cp MT880871_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
python process.py > MT880872_.fasta
echo ">MT880872_genes" > MT880872_genes.fa;
merge_seq.py MT880872_.fasta >> MT880872_genes.fa;
samtools faidx MT880872_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880872_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880872_genes.fa.combined;
cp MT880872_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
#---- 4 ----
cd /media/jhuang/Titisee/GAMOLA2
#WARNING: !!swissprot as default database, manually choose nr as Blast_db --> too slow, better choose swissprot, it is quick!!
./Gamola.pl #No Glimmer model and Critica database due to self-extracted ORF; choosing nr.pal or swissprot.pal as Blast_db; COG2014 as COG_db; Pfam-A.hmm as Pfam_db; No Rfam_db; TIGRFAMS_15.0_HMM.LIB as TIGRfam_db
/media/jhuang/Titisee/GAMOLA2/Results/COG_results
#grep "Class: ," roary_182s_95.fa_COG_*
for sample in 10601 10710 11187 11377 12474 12504 1520 1551 2092 2160 2467 289 3455 4694 5862 5863 6166 6464 7618 8007 8406 8784 9157 9373 953; do
sed -i -e 's/Class: ,/Class: None, None/g' roary_182s_95.fa_COG_${sample}
done
./Gamola.pl
#---- 5 ---- update locustag
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880870_genes.fa/MT880870_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880870_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880870_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880870_genes_.fa.gb
#89600270 Jan 26 13:44 roary_182s_95__.fa.gb
#-- if ran [3.2 selected]--
python ~/Scripts/update_locustag.py MT880870_genes_.fa.gb MT880870_.fasta.fai > MT880870_genes__.fa.gb
> MT880870_genes__.fa.gb #clean old *__.fa.gb
#---- 6.6 ----
cut -d$'\t' -f1 MT880870_.fasta.fai > MT880870_f1
process_gamola_gb_plus.py MT880870_f1 MT880870_genes__.fa.gb > annotated_MT880870.txt #!!
Some fields may be wrong.
BiopythonParserWarning)
/usr/local/lib/python2.7/dist-packages/Bio/GenBank/__init__.py:1047: BiopythonParserWarning: Ignoring invalid location: '1186620..1185596'
sed -i -e 's/1186620\.\.1185596/1185596\.\.1186620/g' roary__.fa.gb
#DEBUG "group_12676" -> "group_12667"
#---- ADD DNA-Sequences add the end of table as the last column ----
cut -d',' -f1-1 annotated_MT880870.txt > get_seq_ORF.sh
#extend the file get_seq_ORF.sh to extract all DNA-sequences
#mv the bash file under DIR of MT880870_.fasta.fai
#samtools faidx MT880870_.fasta "PLKLOBMN_00001" > seq_ORF.fasta
#samtools faidx MT880870_.fasta "PLKLOBMN_00002" >> seq_ORF.fasta
#or in the case simply "cp MT880870_.fasta seq_ORF.fasta"
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF.fasta > seq_ORF_.fasta
grep -v ">" seq_ORF_.fasta > seq_ORF__.txt
#add "Sequence" to the first line of seq_ORF__.txt
paste -d',' annotated_MT880870.txt seq_ORF__.txt > annotated_MT880870_.txt
#---- repeat 5-->6.6 for MT880871 and MT880872 --------------
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880871_genes.fa/MT880871_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880871_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880871_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880871_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880871_genes_.fa.gb MT880871_.fasta.fai > MT880871_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880871_.fasta.fai > MT880871_f1
process_gamola_gb_plus.py MT880871_f1 MT880871_genes__.fa.gb > annotated_MT880871.txt
cp MT880871_.fasta seq_ORF_71.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_71.fasta > seq_ORF_71_.fasta
grep -v ">" seq_ORF_71_.fasta > seq_ORF_71__.txt
#add 'Sequence' in the first line in seq_ORF_71__.txt
paste -d',' annotated_MT880871.txt seq_ORF_71__.txt > annotated_MT880871_.txt
#--
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880872_genes.fa/MT880872_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880872_genes.fa.gb
sed -i 's/<!-- /g' file2
sed -i 's/^ //g' file2
cat file1 file2 --> MT880872_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880872_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880872_genes_.fa.gb MT880872_.fasta.fai > MT880872_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880872_.fasta.fai > MT880872_f1
process_gamola_gb_plus.py MT880872_f1 MT880872_genes__.fa.gb > annotated_MT880872.txt
cp MT880872_.fasta seq_ORF_72.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_72.fasta > seq_ORF_72_.fasta
grep -v ">" seq_ORF_72_.fasta > seq_ORF_72__.txt
#add 'Sequence' in the first line in seq_ORF_72__.txt
paste -d',' annotated_MT880872.txt seq_ORF_72__.txt > annotated_MT880872_.txt
mv annotated_MT880870_.txt annotated__MT880870_.txt
#~/Tools/csv2xls-0.4/csv_to_xls.py annotated__MT880870_.txt annotated_MT880871_.txt annotated_MT880872_.txt -d',' -o annotated_MT880870-MT880872.xls
The script that takes two file names as command line arguments, merges the files based on matching first words, removes the key from file2, and deletes all lines in the merged file in which the key does not occur in file2.
import sys
#"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","Gene","Product","PFAM Label","PFAM Match", "PFAM Interpro", "PFAM Product", "TIGR Label", "TIGR Product","COG Label", "COG Code","COG Product","Translation" #delete "Locus Tag",
#python3 merge2.py hS_vs_TSB_output/background annotated_CP052994_2996.txt hS_vs_TSB_output/background_
#python3 merge2.py hS_vs_TSB_output/downregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/downregulated_filtered_
#python3 merge2.py hS_vs_TSB_output/upregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/upregulated_filtered_
#python3 merge2.py infection_vs_nose_output/background annotated_CP052994_2996.txt infection_vs_nose_output/background_
#python3 merge2.py infection_vs_nose_output/downregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/downregulated_filtered_
#python3 merge2.py infection_vs_nose_output/upregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/upregulated_filtered_
#sed -i -e 's/HKH70_09175/HKH70_09175 (PLKLOBMN_00173)/g' background_
#sed -i -e 's/HKH70_05165/HKH70_05165 (PLKLOBMN_00016)/g' background_
#sed -i -e 's/HKH70_05170/HKH70_05170 (PLKLOBMN_00017)/g' background_
#sed -i -e 's/HKH70_05175/HKH70_05175 (PLKLOBMN_00018)/g' background_
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_hS_vs_TSB.xls
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_infection_vs_nose.xls
# Check if both file names were provided as command line arguments
if len(sys.argv) != 4:
print("Usage: python merge2.py file1.txt file2.txt output_file.txt")
sys.exit(1)
# Open file1 for reading and file2 for editing
with open(sys.argv[1], 'r') as f1, open(sys.argv[2], 'r+') as f2:
# Read file2 into a dictionary
f2_dict = {}
for line in f2:
key, content = line.strip().split(',', 1)
f2_dict[key] = content
# Merge file1 and file2
merged_lines = []
for line in f1:
key = line.split(',')[0]
if key in f2_dict:
content = f2_dict[key]
merged_lines.append(line.strip() + ',' + content)
## Move the file pointer to the beginning of file2
#f2.seek(0)
## Truncate file2 to delete all its contents
#f2.truncate()
## Rewrite file2 with the updated content
#for key, content in f2_dict.items():
# f2.write(content + '\n')
# Overwrite file1 with the merged content
with open(sys.argv[3], 'w') as f3:
f3.write('\n'.join(merged_lines))
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq data analysis of Yersinia on GRCh38
Should the inputs for GSVA be normalized or raw?
Yops (Yersinia outer proteins) analysis 2
Motif Discovery in Biological Sequences: A Comparison of MEME and HOMER
© 2023 XGenes.com Impressum