- download GO terms from the Gene Ontology: https://geneontology.org/docs/download-ontology/
import obonet

# Load the OBO file
url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
graph = obonet.read_obo(url)

# Open a file to write to
with open('go_terms.csv', 'w') as f:
    f.write("id\tname\tnamespace\n")  # Write header (tab-separated, matching the data rows below)
    for node_id, data in graph.nodes(data=True):
        name = data.get('name', '')
        namespace = data.get('namespace', '')
        f.write(f"{node_id}\t{name}\t{namespace}\n")  # Write data
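optionally, the exported table can be sanity-checked by counting terms per namespace (a minimal sketch; it assumes the tab-separated go_terms.csv written above and that pandas is installed):

import pandas as pd

# Load the exported GO table (tab-separated, as written above)
go_terms = pd.read_csv("go_terms.csv", sep="\t")

# Count how many terms fall into each of the three namespaces and report the total
print(go_terms["namespace"].value_counts())
print("Total GO terms:", len(go_terms))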
- prepare the blastp output and merge RefSeq IDs, UniProt IDs, and translations into the GO annotation table
# Filter UniRef to TaxID=1282 (S. epidermidis) and build the BLAST protein databases
python3 filter_uniref_by_taxid.py uniref100.fasta uniref100_taxid1282.fasta
makeblastdb -in uniref90_taxid1282.fasta -dbtype prot
makeblastdb -in uniref50_taxid1282.fasta -dbtype prot
makeblastdb -in uniref100_taxid1282.fasta -dbtype prot

# Run blastp (further options kept for reference: -strand both -max_target_seqs 1 -taxidlist taxid_list.txt)
blastp -query 1585_CDS.fasta -db /home/jhuang/REFs/uniref100_taxid1282.fasta -out 1585_CDS_on_uniref100.blastp -num_threads 100 -outfmt 6 -evalue 1.0e-30

# Drop hits to UniRef100 entries that only have a UniParc identifier (UniRef100_UPI...)
grep -v "UniRef100_UPI" 1585_CDS_on_uniref100.blastp > 1585_CDS_on_uniref100_.blastp

# Merge RefSeq IDs, UniProt IDs, and translations into the GO annotation table
python3 add_RefSeq-ID.py 1585_genes_annotated_with_GO.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID.csv
python3 add_UniProt-ID.py    # input: 1585_CDS_on_uniref100_.blastp and 1585_genes_annotated_with_GO_RefSeq-ID.csv; output: 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv
python3 add_Translation.py 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv

# Check that the scripts ran correctly: the first column should stay identical
cut -f1 -d',' 1585_genes_annotated_with_GO_NCBI-ID.csv > temp1
cut -f1 -d',' 1585_genes_annotated_with_GO_NCBI-ID_UniProt-ID_Translation.csv > temp2
diff temp1 temp2
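since -max_target_seqs 1 is only mentioned in the commented-out options above, each CDS can keep several UniRef100 hits. If only the best hit per query is wanted before adding UniProt IDs, a helper along these lines could reduce the table first (a sketch; best_hits and the output file name 1585_CDS_on_uniref100_best.blastp are hypothetical, and it assumes the default -outfmt 6 column order with the bit score in column 12):

import csv
from collections import OrderedDict

def best_hits(blastp_file, out_file):
    # Keep only the highest-scoring hit per query from a tabular (-outfmt 6) blastp file
    best = OrderedDict()
    with open(blastp_file) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            query, bitscore = row[0], float(row[11])  # column 12 of -outfmt 6 is the bit score
            if query not in best or bitscore > best[query][1]:
                best[query] = (row, bitscore)
    with open(out_file, 'w', newline='') as handle:
        writer = csv.writer(handle, delimiter='\t')
        for row, _ in best.values():
            writer.writerow(row)

best_hits("1585_CDS_on_uniref100_.blastp", "1585_CDS_on_uniref100_best.blastp")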
- code of filter_uniref_by_taxid.py
import sys

def filter_uniref50(input_file, output_file):
    with open(input_file, 'r') as in_handle:
        with open(output_file, 'w') as out_handle:
            record_started = False
            for line in in_handle:
                if line.startswith(">"):
                    # Keep a record only if its header carries TaxID=1282
                    if "TaxID=1282 " in line:
                        record_started = True
                        out_handle.write(line)
                    else:
                        record_started = False
                elif record_started:
                    out_handle.write(line)

if __name__ == "__main__":
    # Check command-line arguments
    if len(sys.argv) != 3:
        print("Usage: python3 filter_uniref_by_taxid.py <input_file> <output_file>")
        sys.exit(1)

    # Get input and output file paths from command-line arguments
    input_file = sys.argv[1]
    output_file = sys.argv[2]

    # Filter the UniRef database based on TaxID=1282
    filter_uniref50(input_file, output_file)
    print("Filtered UniRef database saved to:", output_file)
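a quick check that the TaxID filtering kept a sensible number of sequences (a minimal sketch counting FASTA header lines; file names as used in the commands above):

def count_fasta_records(path):
    # Count sequences in a FASTA file by counting header lines
    with open(path) as handle:
        return sum(1 for line in handle if line.startswith(">"))

print("records in uniref100.fasta:  ", count_fasta_records("uniref100.fasta"))
print("records kept for TaxID=1282: ", count_fasta_records("uniref100_taxid1282.fasta"))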
- code of add_RefSeq-ID.py
import pandas as pd
import sys

def main(input_file1, input_file2, output_file):
    # Load the first table
    table1 = pd.read_csv(input_file1)

    # Load the second table
    table2 = pd.read_csv(input_file2)

    # Merge the tables on SeqName from table1 and Locus Tag from table2
    merged_table = pd.merge(table1, table2[['Locus Tag', 'inference']], left_on='SeqName', right_on='Locus Tag', how='left')

    # Drop the extra 'Locus Tag' column that is redundant after the merge
    merged_table.drop(columns=['Locus Tag'], inplace=True)

    # Save the merged table to a new CSV file
    merged_table.to_csv(output_file, index=False)

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python script.py <input_file1> <input_file2> <output_file>")
        # Example: python3 add_RefSeq-ID.py 1585_genes_annotated_with_GO.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID.csv
    else:
        input_file1 = sys.argv[1]
        input_file2 = sys.argv[2]
        output_file = sys.argv[3]
        main(input_file1, input_file2, output_file)
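after running it, a quick look at how many genes did not receive an 'inference' value can catch mismatched locus tags (a minimal sketch, using the output file name from the commands above):

import pandas as pd

# Check how many genes failed to pick up a RefSeq 'inference' entry in the merge
merged = pd.read_csv("1585_genes_annotated_with_GO_RefSeq-ID.csv")
missing = merged["inference"].isna().sum()
print(f"{missing} of {len(merged)} genes have no 'inference' entry after the merge")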
- code of add_UniProt-ID.py
import csv

# Read blastp results and create a dictionary mapping CDS IDs to lists of UniRef100 IDs
def read_blastp_results(blastp_file):
    cds_to_uniref100 = {}
    with open(blastp_file, 'r') as blastp_handle:
        blastp_reader = csv.reader(blastp_handle, delimiter='\t')
        for row in blastp_reader:
            cds_id, uniref100_id = row[0], row[1]
            if cds_id not in cds_to_uniref100:
                cds_to_uniref100[cds_id] = []
            cds_to_uniref100[cds_id].append(uniref100_id)
    return cds_to_uniref100

# Add UniRef100 IDs as the last column in the input CSV file
def add_uniref100_column(input_file, output_file, cds_to_uniref100):
    with open(input_file, 'r') as input_handle:
        with open(output_file, 'w', newline='') as output_handle:
            reader = csv.reader(input_handle)
            writer = csv.writer(output_handle)
            for row in reader:
                cds_id = row[0]
                if cds_id in cds_to_uniref100:
                    uniref100_ids = '|'.join(cds_to_uniref100[cds_id])
                    row.append(uniref100_ids)
                else:
                    row.append("")  # If no UniRef100 IDs found, add an empty string
                writer.writerow(row)

if __name__ == "__main__":
    blastp_file = "1585_CDS_on_uniref100_.blastp"  # Replace with your blastp results file
    input_file = "1585_genes_annotated_with_GO_NCBI-ID.csv"
    output_file = "1585_genes_annotated_with_GO_NCBI-ID_UniProt-ID.csv"

    # Read blastp results and create dictionary mapping CDS IDs to lists of UniRef100 IDs
    cds_to_uniref100 = read_blastp_results(blastp_file)

    # Add UniRef100 IDs as the last column in the input CSV file
    add_uniref100_column(input_file, output_file, cds_to_uniref100)

    print("UniRef100 IDs added to the input CSV file:", output_file)
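if plain UniProt accessions are preferred over the UniRef100 member IDs, the UniRef100_ prefix can be stripped from the appended column afterwards (a sketch; the output file name is hypothetical, and it assumes the column appended by add_UniProt-ID.py is the last one in the table):

import pandas as pd

# Strip the 'UniRef100_' prefix from the appended ID column to obtain plain UniProt accessions
df = pd.read_csv("1585_genes_annotated_with_GO_NCBI-ID_UniProt-ID.csv")
id_col = df.columns[-1]  # the UniRef100 ID column appended by add_UniProt-ID.py

df[id_col] = df[id_col].fillna("").apply(
    lambda ids: "|".join(x.replace("UniRef100_", "") for x in str(ids).split("|") if x)
)

# Hypothetical output file name for the cleaned table
df.to_csv("1585_genes_annotated_with_GO_NCBI-ID_UniProt-ID_accessions.csv", index=False)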
- code of add_Translation.py
import pandas as pd
import sys

def main(input_file1, input_file2, output_file):
    # Load the first table
    table1 = pd.read_csv(input_file1)

    # Load the second table
    table2 = pd.read_csv(input_file2)

    # Merge the tables on SeqName from table1 and Locus Tag from table2
    merged_table = pd.merge(table1, table2[['Locus Tag', 'Translation']], left_on='SeqName', right_on='Locus Tag', how='left')

    # Drop the extra 'Locus Tag' column that is redundant after the merge
    merged_table.drop(columns=['Locus Tag'], inplace=True)

    # Save the merged table to a new CSV file
    merged_table.to_csv(output_file, index=False)

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python script.py <input_file1> <input_file2> <output_file>")
        # Example: python3 add_Translation.py 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID.csv 1585.csv 1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv
    else:
        input_file1 = sys.argv[1]
        input_file2 = sys.argv[2]
        output_file = sys.argv[3]
        main(input_file1, input_file2, output_file)
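as a final check, complementing the cut/diff comparison above, the row counts and the completeness of the Translation column can be inspected in Python (a minimal sketch, using the file names from the commands above):

import pandas as pd

# The left merges should preserve every gene from the original GO table
go_table = pd.read_csv("1585_genes_annotated_with_GO.csv")
final = pd.read_csv("1585_genes_annotated_with_GO_RefSeq-ID_UniProt-ID_Translation.csv")

print("rows in the GO table:    ", len(go_table))
print("rows in the final table: ", len(final))
print("genes with a translation:", final["Translation"].notna().sum())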