#!/usr/bin/env python3
import os
import sys
import logging
import tempfile
import shutil
import datetime
from pathlib import Path
import psycopg2
from config import *
from lib.options import Options
from lib.wrap.trimmomatic import Trimmomatic
from lib.wrap.bowtie2 import Bowtie2
from lib.wrap.hmmsearch import HMMsearch
from lib.wrap.assembler import Assembler
from lib.wrap.trinity import Trinity
from lib.wrap.spades import SPAdes
from lib.wrap.idba import IDBA
from lib.wrap.no_assembler import No_Assembler
from lib.wrap.blast import Blast
from lib.wrap.blastn import Blastn
from lib.wrap.blastp import Blastp
from lib.usage import Usage
from lib.prepare_run import PrepareRun
from lib.contig_ops import ContigOps
from lib.ranking_ops import RankingOps
from lib.task_manager import TaskManager
from lib.taxonomy_ops import TaxonomyOps
from lib.check_dependencies import CheckDependencies
START_TIME = datetime.datetime.now()
options = Options.parse(sys.argv[1:])
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr)
logger.addHandler(handler)
ContigOps = ContigOps()
RankingOps = RankingOps()
TaskManager = TaskManager()
checkpoint_num = 0

def checkpoint(msg, error_code=None):
    # Minimal stand-in for the pipeline's own checkpoint routine: logs a numbered status message;
    # an error_code marks a failing checkpoint.
    global checkpoint_num
    checkpoint_num += 1
    if error_code is not None:
        logger.error(f"[checkpoint {checkpoint_num}] {msg} (code {error_code})")
    else:
        logger.info(f"[checkpoint {checkpoint_num}] {msg}")
trimmomatic = Trimmomatic(TRIMMOMATIC_PATH, TRIMMOMATIC_PARAMS, TRIMMOMATIC_PE_PARAMS,
TRIMMOMATIC_SE_PARAMS, options.threads, logger)
bowtie2 = Bowtie2(BOWTIE2_PATH, BOWTIE2_PARAMS, options.threads, logger)
assembler = None
if options.no_assembly:
    assembler = No_Assembler(None, None, options.threads, logger)
else:
    if ASSEMBLER == "trinity":
        assembler = Trinity(TRINITY_PATH, TRINITY_PARAMS, options.threads, logger)
    elif ASSEMBLER == "spades":
        assembler = SPAdes(SPADES_PATH, SPADES_PARAMS, options.threads, logger)
    elif ASSEMBLER == "idba":
        assembler = IDBA(IDBA_PATH, IDBA_PARAMS, options.threads, logger)
hmmsearch = HMMsearch(HMMSEARCH_PATH, f"{HMMSEARCH_PARAMS} --domE {HMMSEARCH_EVALUE}",
PFAM_PATH, options.threads, logger)
blastn = Blastn(BLASTN_PATH, BLASTN_PARAMS, BLASTN_NT_PATH, options.threads, logger)
blastp = Blastp(BLASTP_PATH, BLASTP_PARAMS, BLASTP_NR_PATH, options.threads, logger)
def finish(failure_msg=None):
    if options.clean:
        try:
            shutil.rmtree(options.derived["sample_folder"])
        except OSError as e:
            logger.error(f"{e}\nFailed to remove folder '{options.derived['sample_folder']}'. Please remove it manually.")
    if options.debug:
        logger.debug(options.derived)
    if failure_msg:
        checkpoint(failure_msg, 20)
        sys.exit(1)
    else:
        checkpoint('Analysis Complete')
        cmd = "INSERT INTO analysis (finished) VALUES ('now')"
        logger.debug(cmd)
        # assumes a psycopg2 connection opened earlier, e.g. conn = psycopg2.connect(...)
        conn.cursor().execute(cmd)
        conn.commit()
        sys.exit(0)
# ... (other parts of the script) ...
checkpoint('Generating summary report')
from lib.wrap.report import Report
report = Report(options, logger)
report.create()
# Upload results to the user-specified storage (e.g., AWS S3, Google Cloud Storage)
checkpoint('Uploading results')
if options.upload_results:
    from lib.wrap.upload_results import UploadResults
    uploader = UploadResults(options, logger)
    uploader.upload()
finish()
How to use Biopython to perform phylogenetic analysis?
Biopython is a powerful library for bioinformatics that provides tools for manipulating biological sequences, working with 3D structures, performing genome analysis, and more. Among its many features, it also has tools for phylogenetic analysis.
Here’s a simple example of how you can use Biopython to perform phylogenetic analysis using the following steps:
- Obtain sequence data (e.g., from a FASTA file or GenBank).
- Perform multiple sequence alignment (MSA) using a suitable algorithm.
- Construct a phylogenetic tree using a tree-building method.
- Visualize the phylogenetic tree.
Note that Biopython does not implement multiple sequence alignment itself, so you will need an external tool like MUSCLE or Clustal Omega for this step. Biopython can, however, parse the output of these tools.
Here’s an example workflow:
Step 1: Obtain sequence data
Let’s assume you have a FASTA file named sequences.fasta containing multiple sequences.
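Optionally, you can sanity-check the input before aligning; a minimal sketch using Biopython's SeqIO (assuming the sequences.fasta file above):
from Bio import SeqIO
# Print the ID and length of every input sequence
for record in SeqIO.parse("sequences.fasta", "fasta"):
    print(record.id, len(record.seq))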
Step 2: Perform multiple sequence alignment
First, install the necessary libraries:
pip install biopython
Now, align your sequences using an external tool like MUSCLE or Clustal Omega. You can also use Biopython’s AlignIO module to parse the output.
For example, if you’re using Clustal Omega, you can run:
clustalo -i sequences.fasta -o aligned_sequences.clustal --outfmt=clustal
Step 3: Construct a phylogenetic tree
We will use the Phylo module in Biopython to construct a tree using the neighbor-joining method.
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo
# Read the alignment file
alignment = AlignIO.read("aligned_sequences.clustal", "clustal")
# Calculate the distance matrix
calculator = DistanceCalculator("identity")
distance_matrix = calculator.get_distance(alignment)
# Construct the tree using the neighbor-joining method
constructor = DistanceTreeConstructor()
tree = constructor.nj(distance_matrix)
# Save the tree to a file in Newick format
Phylo.write(tree, "phylogenetic_tree.newick", "newick")
Step 4: Visualize the phylogenetic tree
You can visualize the tree using Biopython’s Phylo.draw() function or export it to a file and use external tools like FigTree, iTOL, or Dendroscope.
To visualize the tree using Biopython’s Phylo.draw() function:
import matplotlib.pyplot as plt
# Draw the tree without blocking, so the figure can still be saved afterwards
Phylo.draw(tree, do_show=False)
# Save the tree as an image
plt.savefig("phylogenetic_tree.png")
These steps provide a basic example of how to perform phylogenetic analysis using Biopython. You can further customize the analysis and visualization by exploring the various features and options available in the library.
Powerful oneliners for quick data processing in bioinformatics
There are many command-line tools and utilities that can be useful in bioinformatics for quick data processing, analysis, and manipulation. Some of these oneliner tools include:
- awk: A versatile text processing tool that can be used to filter, reformat, and transform data.
  awk '{print $1}' input.txt
- sed: A stream editor for filtering and transforming text.
  sed 's/A/T/g' input.txt > output.txt
- grep: A tool to search for patterns in text files.
  grep "ATG" input.fasta
- sort: Sorts lines in a text file.
  sort input.txt > sorted_input.txt
- uniq: Removes duplicate lines from a sorted file or displays the number of occurrences of each line.
  uniq -c input.txt > unique_counts.txt
- wc: Counts lines, words, and characters in a file.
  wc -l input.txt
- cut: Removes sections from each line of a file.
  cut -f1,3 input.txt > output.txt
- paste: Joins corresponding lines of multiple files.
  paste file1.txt file2.txt > combined.txt
- tr: Translates or deletes characters.
  tr 'atcg' 'TAGC' < input.txt > output.txt
- curl: Transfers data from or to a server.
  curl -O "https://example.com/file.fasta"
- bioawk: An extension of awk with built-in functions for biological data.
  bioawk -c fastx '{print $name, length($seq)}' input.fasta
- seqkit: A cross-platform toolkit for FASTA/Q file manipulation.
  seqkit stat input.fasta
Samtools is a widely-used suite of tools for manipulating and analyzing high-throughput sequencing (HTS) data in the SAM, BAM, and CRAM formats. Here are some examples of how you can use Samtools for various tasks:
- Convert SAM to BAM format: To convert a SAM (Sequence Alignment/Map) file to a compressed binary BAM (Binary Alignment/Map) file, you can use the samtools view command with the -bS option:
  samtools view -bS input.sam > output.bam
- Sort a BAM file: To sort a BAM file by genomic coordinates, you can use the samtools sort command:
  samtools sort input.bam -o sorted_output.bam
- Index a sorted BAM file: To create an index for a sorted BAM file, which allows you to quickly access alignments overlapping particular genomic regions, you can use the samtools index command:
  samtools index sorted_output.bam
- Generate an alignment statistics report: To create a summary report of alignment statistics, such as the number of mapped and unmapped reads, you can use the samtools flagstat command:
  samtools flagstat input.bam > report.txt
- Extract reads aligned to a specific region: To extract reads aligned to a specific genomic region (the BAM file must be coordinate-sorted and indexed first), you can use the samtools view command with the -b option and the region of interest:
  samtools view -b input.bam chr1:10000-20000 > region_output.bam
- Filter alignments: To filter alignments based on specific criteria, such as minimum mapping quality, you can use the samtools view command with the -q option (adding -b to keep BAM output):
  samtools view -b -q 20 input.bam > filtered_output.bam
- Generate a pileup: To create a pileup file from a BAM file, which displays the sequencing depth at each position of the reference genome, you can use the samtools mpileup command:
  samtools mpileup -f reference.fasta input.bam > output.pileup
Bcftools is a set of utilities for variant calling and manipulating VCF (Variant Call Format) and BCF (Binary Call Format) files. Here are some examples of how you can use Bcftools for various tasks:
- Call variants: To call variants from a BAM or CRAM file using the multiallelic caller (-m), you can use the bcftools mpileup command followed by bcftools call:
  bcftools mpileup -Ou -f reference.fasta input.bam | bcftools call -mv -Ov -o output.vcf
- Filter variants: To filter variants based on specific criteria, such as quality or depth, you can use the bcftools filter command:
  bcftools filter -i 'QUAL > 20 && DP > 10' input.vcf -o filtered_output.vcf
- View VCF/BCF file: To view the contents of a VCF or BCF file, you can use the bcftools view command:
  bcftools view input.vcf
- Sort a VCF file: To sort a VCF file by genomic coordinates, you can use the bcftools sort command:
  bcftools sort input.vcf -o sorted_output.vcf
- Index a VCF file: To create an index for a VCF or BCF file, which allows you to quickly access variants overlapping specific genomic regions, you can use the bcftools index command (a plain VCF must be bgzip-compressed first):
  bgzip sorted_output.vcf
  bcftools index sorted_output.vcf.gz
- Concatenate multiple VCF files: To concatenate multiple VCF files, ensuring that they have the same sample columns in the same order, you can use the bcftools concat command:
  bcftools concat input1.vcf input2.vcf -o combined_output.vcf
- Generate consensus sequences: To create consensus sequences by applying VCF variants to a reference genome, you can use the bcftools consensus command:
  bcftools consensus -f reference.fasta input.vcf.gz > consensus.fasta
- Normalize indels: To normalize indels in a VCF file (left-align and trim indels), you can use the bcftools norm command:
  bcftools norm -f reference.fasta input.vcf -o normalized_output.vcf
Bedtools is a powerful suite of tools for working with genomic intervals in various file formats, such as BED, GFF/GTF, and VCF. Here are some examples of how you can use Bedtools for various tasks:
- Intersect intervals: To find overlapping intervals between two files, you can use the bedtools intersect command:
  bedtools intersect -a input1.bed -b input2.bed > output.bed
- Merge intervals: To merge overlapping or adjacent intervals in a single file (the input must be position-sorted), you can use the bedtools merge command:
  bedtools merge -i input.bed > output.bed
- Subtract intervals: To subtract intervals in one file from another, you can use the bedtools subtract command:
  bedtools subtract -a input1.bed -b input2.bed > output.bed
- Get genomic sequences: To extract sequences from a reference genome corresponding to intervals in a BED file, you can use the bedtools getfasta command:
  bedtools getfasta -fi reference.fasta -bed input.bed -fo output.fasta
- Sort intervals: To sort genomic intervals by chromosome and start position, you can use the bedtools sort command:
  bedtools sort -i input.bed > sorted_output.bed
- Calculate coverage: To compute the depth at each position or the depth for each interval in a BED file, you can use the bedtools coverage command:
  bedtools coverage -a input1.bed -b input2.bed > output.bed
- Find closest features: To find the closest feature in another file for each feature in a BED file, you can use the bedtools closest command:
  bedtools closest -a input1.bed -b input2.bed > output.bed
- Compute statistics: To calculate various summary statistics for each feature in a BED file, such as the mean, median, or standard deviation of scores, you can use the bedtools groupby command:
  bedtools groupby -i input.bed -g 1 -c 5 -o mean,median,stdev > output.bed
You can often combine these tools using pipes (|) to create powerful oneliners for complex data processing tasks. For example, samtools view -F 4 input.bam | cut -f3 | sort | uniq -c | sort -rn tabulates the number of mapped reads per reference sequence.
Draw scatter_3d using plotly 4.10 (+ellipses?)
import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from scipy.linalg import eigh, sqrtm
# Read in data as a pandas dataframe
#df = pd.DataFrame({
# 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
# 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
# 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
# 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
# 'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_28PCs.csv', index_col=0, header=0)
df['condition'] = df['condition'].replace({
    "GFP+mCh d9/12": "ctrl LTtr+sT d9/12",
    "sT+LTtr d9/12": "LTtr+sT d9/12",
    "GFP d3": "ctrl LT/LTtr d3",
    "GFP d8": "ctrl LT/LTtr d8",
    "mCh d3": "ctrl sT d3",
    "mCh d8": "ctrl sT d8",
})
# Fit PCA model to reduce data dimensions to 3
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])
# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]
# Create PCA plot with 3D scatter
fig = go.Figure()
# Available marker symbols: 'circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x'
# Diamonds are drawn slightly smaller than circles below so that both appear visually equal in size.
# Colour scheme: three families, each running from light to deep (the first close to grey/black):
#   family 1: 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12'
#   family 2: 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
#   family 3: 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'
condition_color_map_untreated = {'untreated':'black'}
donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
#condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
condition_color_map = {
'ctrl LTtr+sT d9/12': 'black',
'LTtr+sT d9/12': '#a14a1a',
'ctrl LT/LTtr d3': '#D3D3D3',
'ctrl LT/LTtr d8': '#A9A9A9',
'LT d3': '#b2df8a',
'LT d8': '#33a02c',
'LTtr d3': '#a6cee3',
'LTtr d8': '#1f78b4',
'ctrl sT d3': '#696969',
'ctrl sT d8': '#2F4F4F',
'sT d3': '#fb9a99',
'sT d8': '#e31a1c',
'sT+LT d3': 'magenta'
}
donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}
for donor, donor_symbol in donor_symbol_map_untreated.items():
    for condition, condition_color in condition_color_map_untreated.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
for donor, donor_symbol in donor_symbol_map.items():
    for condition, condition_color in condition_color_map.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
# Invisible traces so that the donor symbols get their own legend entries
for donor, donor_symbol in donor_symbol_map.items():
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               name=donor,
                               legendgroup=f'{donor}',
                               showlegend=True,
                               marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
                               hoverinfo='none'))
'''
# Calculate mean and covariance for each subset of data for ellipsoid drawing
DI_mask = (df['donor'] == 'DI') & (~df['condition'].isin(['untreated']))
DII_mask = (df['donor'] == 'DII') & (~df['condition'].isin(['untreated']))
untreated_mask = (df['condition'] == 'untreated')
mean_DI = df.loc[DI_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_DI = df.loc[DI_mask, ['PC1', 'PC2', 'PC3']].cov().values
mean_DII = df.loc[DII_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_DII = df.loc[DII_mask, ['PC1', 'PC2', 'PC3']].cov().values
mean_untreated = df.loc[untreated_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_untreated = df.loc[untreated_mask, ['PC1', 'PC2', 'PC3']].cov().values
# Functions to create confidence ellipsoids
from scipy.stats import chi2

def create_ellipse(center, radii, rotation_matrix, resolution=100):
    # Parametrize the unit sphere, scale by the radii, then rotate and translate
    u = np.linspace(0, 2 * np.pi, resolution)
    v = np.linspace(0, np.pi, resolution)
    x = radii[0] * np.outer(np.cos(u), np.sin(v))
    y = radii[1] * np.outer(np.sin(u), np.sin(v))
    z = radii[2] * np.outer(np.ones_like(u), np.cos(v))
    for i in range(len(x)):
        for j in range(len(x)):
            [x[i, j], y[i, j], z[i, j]] = np.dot([x[i, j], y[i, j], z[i, j]], rotation_matrix) + center
    return x, y, z

def ellipsoid_params(mean, cov, conf_interval=0.9):
    # Eigendecompose the covariance and scale the axes to the requested confidence level
    vals, vecs = eigh(cov)
    vals = vals[::-1]
    vecs = vecs[:, ::-1]
    radii = np.sqrt(vals * chi2.ppf(conf_interval, df=3))
    rotation_matrix = vecs
    return mean, radii, rotation_matrix
center_DI, radii_DI, rotation_matrix_DI = ellipsoid_params(mean_DI, cov_DI)
center_DII, radii_DII, rotation_matrix_DII = ellipsoid_params(mean_DII, cov_DII)
#center_untreated, radii_untreated, rotation_matrix_untreated = ellipsoid_params(mean_untreated, cov_untreated)
# Generate ellipses using create_ellipse function
x_DI, y_DI, z_DI = create_ellipse(center_DI, radii_DI, rotation_matrix_DI)
x_DII, y_DII, z_DII = create_ellipse(center_DII, radii_DII, rotation_matrix_DII)
#x_untreated, y_untreated, z_untreated = create_ellipse(center_untreated, radii_untreated, rotation_matrix_untreated)
fig.add_trace(go.Mesh3d(x=x_DI.flatten(), y=y_DI.flatten(), z=z_DI.flatten(),
alphahull=5,
opacity=0.1,
color='blue',
name='DI ellipse',
showlegend=False))
fig.add_trace(go.Mesh3d(x=x_DII.flatten(), y=y_DII.flatten(), z=z_DII.flatten(),
alphahull=5,
opacity=0.1,
color='red',
name='DII ellipse',
showlegend=False))
'''
# Annotations for the legend blocks
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ],
    scene=dict(
        aspectmode='cube',
        xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'),
        yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 17% variance'),
        zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC3: 15% variance'),
        bgcolor='white'
    ),
    margin=dict(l=5, r=5, b=5, t=0)  # Adjust the margins to prevent clipping of axis titles
)
#fig.show()
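# Note: static image export via fig.write_image() requires an export backend such as the kaleido package (pip install kaleido)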
fig.write_image("fig4.svg")
Creating and Training a Toy Transformer Model with Python: A Step-by-Step Guide
GPT (short for “Generative Pre-trained Transformer”) is a type of transformer model, which is an advanced deep learning architecture. It is based on the Transformer architecture introduced by Vaswani et al. in the paper “Attention is All You Need” in 2017. GPT and its successors, such as GPT-2 and GPT-3, have been developed and released by OpenAI.
Yes, you can train a toy transformer model with Python. It will likely be smaller and less powerful than GPT models, but it can help you understand the core concepts and workings of the transformer architecture. There are several libraries and resources available in Python for creating and training transformer models, including TensorFlow and PyTorch.
Here is a high-level overview of the process:
- Install necessary libraries: Install TensorFlow or PyTorch, depending on which library you prefer for deep learning tasks.
- Prepare the dataset: Collect and preprocess the text data you will use to train your model. The data should be tokenized (converted into smaller units, such as words or subwords) and encoded into a suitable format for the model.
- Define the model architecture: Implement the transformer model architecture, including the attention mechanism, multi-head self-attention, positional encoding, feed-forward layers, and other necessary components.
- Train the model: Train your toy transformer model on the prepared dataset using an appropriate optimizer, learning rate schedule, and loss function (e.g., cross-entropy loss for language modeling tasks). Make sure to set aside a portion of the data for validation to monitor the model's performance during training and prevent overfitting.
- Evaluate the model: After training, evaluate your toy transformer model on a test dataset to measure its performance. Common evaluation metrics for language models include perplexity, BLEU score, and ROUGE score, depending on the specific task you're focusing on.
- Fine-tune and experiment: You can fine-tune your toy transformer model by adjusting hyperparameters, such as learning rate, batch size, and model size (e.g., the number of layers and attention heads). Experiment with different settings to improve the model's performance.
Keep in mind that training a transformer model, even a toy one, can be computationally expensive and may require a significant amount of time and resources. Using a GPU or cloud-based services, such as Google Colab, can help speed up the training process.
There are also many tutorials and code examples available online to help you get started with building and training your own toy transformer model in Python. Remember to consult the official documentation of the deep learning libraries (TensorFlow or PyTorch) and any other resources to ensure you’re using the most up-to-date practices and techniques.
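To make these steps concrete, here is a minimal PyTorch sketch of a toy character-level transformer language model. Everything in it (the repeated-sentence corpus, model size, and hyperparameters) is an illustrative assumption rather than a GPT recipe, and it leans on PyTorch's built-in nn.TransformerEncoder with a causal mask instead of implementing attention from scratch:
import torch
import torch.nn as nn

class ToyTransformerLM(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=4, num_layers=2, max_len=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = nn.Embedding(max_len, d_model)  # learned positional encoding
        layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=128, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.embed(x) + self.pos(positions)
        # Causal mask: position i may only attend to positions <= i
        mask = torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1)
        return self.lm_head(self.encoder(h, mask=mask))

# Toy character-level corpus; a real dataset would be far larger
text = "the quick brown fox jumps over the lazy dog " * 50
chars = sorted(set(text))
stoi = {c: i for i, c in enumerate(chars)}
data = torch.tensor([stoi[c] for c in text])

model = ToyTransformerLM(vocab_size=len(chars))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

block = 32
for step in range(200):
    # Sample a random block; the targets are the inputs shifted by one character
    i = torch.randint(0, len(data) - block - 1, (1,)).item()
    x = data[i:i + block].unsqueeze(0)
    y = data[i + 1:i + block + 1].unsqueeze(0)
    loss = loss_fn(model(x).reshape(-1, len(chars)), y.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        print(f"step {step}: loss {loss.item():.3f}")
On a CPU this trains in seconds, and a steadily dropping loss is usually enough to confirm that the masking and target shifting are wired correctly before scaling anything up.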
Downloading DNA and Protein Sequences from NCBI Using Biopython and Taxonomy ID
Biopython is a useful library for working with biological data in Python. You can use it to download sequences from GenBank by specifying a taxonomy ID. Here’s a code example to download sequences using Biopython’s Entrez module, which is an interface to the NCBI Entrez databases, including GenBank:
from Bio import Entrez
from Bio import SeqIO
# Set your email address (required by NCBI)
Entrez.email = "your_email@example.com"
# Specify the taxonomy ID
taxonomy_id = "your_taxonomy_id_here"
# Search for records in the nucleotide database using the taxonomy ID
search_query = f"txid{taxonomy_id}[Organism:exp]"
handle = Entrez.esearch(db="nucleotide", term=search_query)
# Parse the search results
record = Entrez.read(handle)
handle.close()
# Get the GenBank IDs of the records
genbank_ids = record["IdList"]
# Fetch the sequences using the GenBank IDs
sequences = []
for genbank_id in genbank_ids:
    handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
    seq_record = SeqIO.read(handle, "genbank")
    handle.close()
    sequences.append(seq_record)
# Print the fetched sequences
for seq in sequences:
    print(seq)
Replace “your_email@example.com” with your actual email address, and “your_taxonomy_id_here” with the specific taxonomy ID you’re interested in. This script will search the nucleotide database in NCBI GenBank using the specified taxonomy ID and download the corresponding sequences.
Here’s a brief explanation of the code:
- Import the necessary modules from Biopython: Entrez for accessing the NCBI databases and SeqIO for handling sequence records.
- Set your email address. This is required by NCBI to track the usage of their resources.
- Specify the taxonomy ID you want to search for.
- Create a search query using the taxonomy ID, and search the nucleotide database.
- Parse the search results to get a list of GenBank IDs.
- Use the GenBank IDs to fetch the sequences, and store them in a list.
- Print the fetched sequences in the console.
Please note that Entrez.esearch returns only the first 20 record IDs by default; pass a larger retmax value (e.g., Entrez.esearch(db="nucleotide", term=search_query, retmax=1000)) if you want more of the sequences associated with the specified taxonomy ID. If there are many sequences, it might take a while to download them all. Also, make sure you have installed Biopython in your Python environment before running the script. If you haven’t, you can install it using pip:
pip install biopython
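If you would rather save the downloaded records than print them, SeqIO.write can serialize the list directly; a small optional addition to the script above (the output file name is arbitrary):
# Write the fetched records to disk; use format "genbank" instead of "fasta" to keep annotations
SeqIO.write(sequences, "downloaded_sequences.fasta", "fasta")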
If you want to download protein sequences instead of nucleotide sequences, you can make a few modifications to the code I provided earlier. The main change is to search and fetch data from the “protein” database instead of the “nucleotide” database. Here’s the updated code:
from Bio import Entrez
from Bio import SeqIO
# Set your email address (required by NCBI)
Entrez.email = "your_email@example.com"
# Specify the taxonomy ID
taxonomy_id = "your_taxonomy_id_here"
# Search for records in the protein database using the taxonomy ID
search_query = f"txid{taxonomy_id}[Organism:exp]"
handle = Entrez.esearch(db="protein", term=search_query)
# Parse the search results
record = Entrez.read(handle)
handle.close()
# Get the protein IDs of the records
protein_ids = record["IdList"]
# Fetch the sequences using the protein IDs
sequences = []
for protein_id in protein_ids:
    handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
    seq_record = SeqIO.read(handle, "genbank")
    handle.close()
    sequences.append(seq_record)
# Print the fetched sequences
for seq in sequences:
    print(seq)
This code is almost identical to the previous one, but with two important changes:
- The Entrez.esearch() call has its db parameter set to "protein" instead of "nucleotide", so the search is performed in the protein database rather than the nucleotide database.
- The Entrez.efetch() call has its db parameter set to "protein" for the same reason, so the sequences are fetched from the protein database.
Preparing custom gtf and bed files for nextflow RNA-seq pipeline
-- 0, generate the ref.gtf and ref.bed from GenBank file: HD21-2_new.gtf and HD21-2_new.bed --
#[PREPARE gtf from gff] https://github.com/gpertea/gffread/issues/3
#DEL gb2gff.py HD21-2_chr.gbk > HD21-2.gtf
#DEL gb2gff.py HD21-2_plasmid_pHD21-2_1.gbk >> HD21-2.gtf
#DEL gb2gff.py HD21-2_plasmid_pHD21-2_2.gbk >> HD21-2.gtf
#-- STEP 0.0: download the gff3 files --
cat HD21-2_chr.gff3 HD21-2_plasmid1.gff3 HD21-2_plasmid2.gff3 > HD21-2.gff3
#-- STEP 0.1: rename CDS to exon! --
gffread -E -F -O -T HD21-2.gff3 -o HD21-2.gtf
sed -i -e "s/\tCDS\t/\texon\t/g" HD21-2.gtf
#Genbank_CDS --> Genbank_exon
#cmsearch_CDS --> cmsearch_exon
#tRNAscan-SE_CDS --> tRNAscan-SE_exon
#or xxx_CDS --> xxx_exon --> xxx[cmsearch, Genbank, tRNAscan] exon
#
#-- STEP 0.2: add protein_coding "xxxx" to type exon
#exon (former Genbank CDS): 2657 = (Genbank transcript + cmsearch transcript + tRNAscan-SE transcript) = (2586+61+10)
#Genbank region: 7
# transcript: 2649
#Genbank_exon == Genbank transcript 2586
#tRNAscan-SE_exon == tRNAscan-SE transcript 61
#cmsearch_exon == cmsearch transcript 10
------------------------------------------------------
#total 5313 (transcript 2649-->bed, exon 2657, region 7)
#in kate "\texon" --> "_exon"
grep "Genbank_exon" HD21-2.gtf > HD21-2_Genbank_exon.gtf #add gene_biotype "protein_coding"; at the end of line (2586)
grep "tRNAscan-SE_exon" HD21-2.gtf > HD21-2_tRNAscan-SE_exon.gtf #add gene_biotype "tRNA"; at the end of line (61)
grep "cmsearch_exon" HD21-2.gtf > HD21-2_cmsearch_exon.gtf #add gene_biotype "RNase_P_RNA";| gene_biotype "ncRNA";| gene_biotype "rRNA";| gene_biotype "tmRNA";| gene_biotype "SRP_RNA"; at the end (10)
grep -i -e "cmsearch" HD21-2.gtf > temp
grep -i -e "gene_biotype" temp > temp2
CP052994.1 cmsearch transcript 1286239 1286622 . - . transcript_id "rna-HKH70_06005"; gene_id "gene-HKH70_06005"; gene_name "rnpB"; Dbxref "RFAM:RF00011"; gbkey "ncRNA"; gene "rnpB"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_06005"; product "RNase P RNA component class B"; Name "rnpB"; gene_biotype "RNase_P_RNA";
CP052994.1 cmsearch transcript 1452715 1452905 . - . transcript_id "rna-HKH70_06865"; gene_id "gene-HKH70_06865"; gene_name "ssrS"; Dbxref "RFAM:RF00013"; gbkey "ncRNA"; gene "ssrS"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_06865"; product "6S RNA"; Name "ssrS"; gene_biotype "ncRNA";
CP052994.1 cmsearch transcript 1706296 1706410 . - . transcript_id "rna-HKH70_08190"; gene_id "gene-HKH70_08190"; gene_name "rrf"; Dbxref "RFAM:RF00001"; gbkey "rRNA"; gene "rrf"; inference "COORDINATES: profile:INFERNAL:1.1.1"; locus_tag "HKH70_08190"; product "5S ribosomal RNA"; Name "rrf"; gene_biotype "rRNA";
...
grep -v "_exon" HD21-2.gtf > HD21-2_non_exon.gtf #2656
cat HD21-2_non_exon.gtf HD21-2_Genbank_exon.gtf HD21-2_tRNAscan-SE_exon.gtf HD21-2_cmsearch_exon.gtf > HD21-2_new.gtf #5313
#in kate "exon" --> "\texon"
#[Eventually] sort the HD21-2_new.gtf using libreoffice
#[Eventually] replace "" with "; "transcript_id with transcript_id; "\n with \n in kate
#-- STEP 0.3: PREPARE bed from gtf
#[Eventually] replace '\t0\t' with '\t.\t' in the gff-file in the kate-editor
gffread -E -F --bed HD21-2_new.gtf -o HD21-2_new.bed # .. loaded 2649 genomic features from HD21-2_new.gtf
#[IMPORTANT] delete [*.1] in gtf, bed. In fa it could be e.g. ">CP052994 Staphylococcus epidermidis strain HD21-2 chromosome, complete genome"
#GTF: CP052994.1 Genbank exon 1 1356 . + . transcript_id "gene-HKH70_00005"; gene_id "gene-HKH70_00005"; gene_name "dnaA"; Dbxref "NCBI_GP:QRJ41002.1"; Name "QRJ41002.1"; gbkey "CDS"; gene "dnaA"; inference "COORDINATES: similar to AA sequence:RefSeq:WP_017723675.1"; locus_tag "HKH70_00005"; product "chromosomal replication initiator protein DnaA"; protein_id "QRJ41002.1"; transl_table "11"; gene_biotype "protein_coding";
#BED: CP052994.1 0 1356 gene-HKH70_00005 100 + 0 1356 0,0,0 1 1356, 0, CDS=0:1356;CDSphase=0;geneID=gene-HKH70_00005;gene_name=dnaA;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=HKH70_00005
-------
-- 2 --
conda activate rnaseq
# --> !!!! SUCCESSFUL !!!!
nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/raw_data/*_R{1,2}.fq.gz' --fasta /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2.fa --gtf /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2_new.gtf --bed12 /home/jhuang/DATA/Data_Anna_HAPDICS_RNASeq/HD21-2_new.bed -profile standard --aligner hisat2 --fcGroupFeatures gene_id --fcGroupFeaturesType gene_biotype --saveReference --skip_genebody_coverage --skip_dupradar --skip_preseq --skip_edger -resume
#default fcGroupFeatures="gene_biotype" (or gbkey)
--skip_qc Skip all QC steps apart from MultiQC
--skip_fastqc Skip FastQC
--skip_rseqc Skip RSeQC
--skip_genebody_coverage Skip calculating genebody coverage
--skip_preseq Skip Preseq
--skip_dupradar Skip dupRadar (and Picard MarkDups)
--skip_edger Skip edgeR MDS plot and heatmap
--skip_multiqc Skip MultiQC
47 popular functions in Biopython
Here are 47 popular functions in Biopython:
- Seq: Represents a biological sequence, such as DNA or protein.
- SeqRecord: Represents a sequence with metadata, including ID and description.
- SeqIO: Handles reading and writing sequence files in various formats.
- AlignIO: Handles reading and writing multiple sequence alignment files in various formats.
- Bio.Alphabet: Defines sequence alphabets, such as IUPAC codes for DNA and protein (removed in Biopython 1.78; sequences are now handled as plain strings).
- Bio.SeqUtils: Provides various utilities for working with sequences, such as computing molecular weight and codon usage.
- Bio.SeqIO.QualityIO: Handles reading and writing FASTQ files.
- Bio.SeqIO.PhdIO: Handles reading and writing PHD files.
- Bio.SeqIO.FastaIO: Handles reading and writing FASTA files.
- Bio.AlignIO.ClustalIO: Handles reading and writing Clustal alignment files.
- Bio.AlignIO.EmbossIO: Handles reading and writing EMBOSS alignment files.
- Bio.SeqIO.InsdcIO: Handles reading and writing GenBank and EMBL files.
- Bio.SeqIO.SwissIO: Handles reading and writing Swiss-Prot files.
- Bio.SeqIO.TabIO: Handles reading and writing tab-separated files.
- Bio.SeqIO.NexusIO: Handles reading and writing NEXUS files.
- Bio.SearchIO: Parses search results from various tools, such as BLAST and HMMER.
- Bio.Blast: Runs BLAST searches against various databases.
- Bio.KEGG: Provides access to the KEGG database.
- Bio.PDB: Handles reading and writing PDB files, as well as structure manipulation and analysis.
- Bio.Align: Represents a multiple sequence alignment, with associated metadata and annotations.
- Bio.Align.Applications: Wrappers for running multiple sequence alignment tools, such as ClustalW and Muscle.
- Bio.Restriction: Provides tools for simulating and analyzing DNA restriction digests.
- Bio.SeqFeature: Represents a feature or annotation on a sequence, such as a gene or a mutation.
- Bio.Graphics.GenomeDiagram: Generates graphical representations of genome sequences, with features and annotations.
- Bio.Graphics.BasicChromosome: Generates basic chromosome diagrams.
- Bio.Emboss.Applications: Wrappers for running EMBOSS tools, such as Needle and Water.
- Bio.SeqIO.index: Provides dictionary-like, on-demand random access to records in large sequence files.
- Bio.AlignIO.convert: Converts alignment files between formats in a single call.
- Bio.Align.AlignInfo: Computes various summary statistics and information content for a multiple sequence alignment.
- Bio.Align.PairwiseAligner: Computes pairwise alignments between two sequences.
- Bio.motifs (formerly Bio.Motif): Represents sequence motifs, such as transcription factor binding sites.
- Bio.motifs.matrix.PositionWeightMatrix: Represents a position weight matrix, commonly used to represent sequence motifs.
- Bio.Alphabet.IUPAC: Defines IUPAC codes for various sequence alphabets, such as DNA and protein.
- Bio.Alphabet.Gapped: Defines a gapped sequence alphabet, used for representing alignments.
- Bio.SeqIO.AceIO: Handles reading and writing ACE files, commonly used for sequencing data.
- Bio.SeqIO.GameXMLIO: Handles reading and writing GAME XML files, used for gene annotation.
- Bio.SeqIO.SffIO: Handles reading and writing SFF (Standard Flowgram Format) files, the native format for Roche 454 sequencing data.
- Bio.SeqUtils.molecular_weight(seq): Calculates the molecular weight of a sequence.
- Bio.Phylo.PAML.codeml: A module for running the PAML codeml program for estimating rates of evolution and selection.
- Bio.AlignIO.parse(handle, format): Parses multiple sequence alignments from a file.
- Bio.Alphabet.IUPAC.unambiguous_dna: Defines the unambiguous DNA alphabet (pre-1.78 Biopython).
- Bio.Seq.reverse_complement(seq): Returns the reverse complement of a sequence.
- Bio.Seq.translate(seq): Translates a nucleotide sequence into the corresponding amino acid sequence.
- Bio.SeqUtils.seq1(seq): Converts a protein sequence from three-letter code to one-letter code.
- Bio.SeqIO.write(records, handle, format): Writes a sequence record or a list of sequence records to a file.
- Bio.PDB.PDBList(): A class for downloading PDB files from the RCSB PDB server.
- Bio.Blast.NCBIWWW.qblast(program, database, sequence): Sends a sequence to the NCBI BLAST server for homology search.
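As a quick illustration, here are a few of these functions in action (the sequence is an arbitrary example):
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight

s = Seq("ATGGTGCATTAA")
print(s.reverse_complement())               # TTAATGCACCAT
print(s.translate())                        # MVH* (standard codon table)
print(molecular_weight(s, seq_type="DNA"))  # molecular weight of the single-stranded DNA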
GAMOLA2 pre- and postprocessing
#-- view process.py --
with open("MT880872_CDS.fasta", "r") as f:
    for line in f:
        if line.startswith(">"):
            # Use the locus_tag value from the header as the new, simplified header
            header_words = line.strip().split("locus_tag=")
            third_word = header_words[1]
            third_word_ = third_word.strip().split("]")
            third_word__ = third_word_[0]
            print(">" + third_word__)
        else:
            print(line.strip())
#input_file: MT880870_.fasta as pan_genome_reference_.fa; MT880870_.fasta is modified from MT880870.fasta by simplifying the FASTA headers.
echo ">MT880870_genes" > MT880870_genes.fa;
merge_seq.py MT880870_.fasta >> MT880870_genes.fa;
samtools faidx MT880870_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880870_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880870_genes.fa.combined;
cp MT880870_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
python process.py > MT880871_.fasta
echo ">MT880871_genes" > MT880871_genes.fa;
merge_seq.py MT880871_.fasta >> MT880871_genes.fa;
samtools faidx MT880871_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880871_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880871_genes.fa.combined;
cp MT880871_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
python process.py > MT880872_.fasta
echo ">MT880872_genes" > MT880872_genes.fa;
merge_seq.py MT880872_.fasta >> MT880872_genes.fa;
samtools faidx MT880872_genes.fa
bioawk -c fastx '{ print $name, length($seq) }' < MT880872_.fasta > length.txt
generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880872_genes.fa.combined;
cp MT880872_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
#---- 4 ----
cd /media/jhuang/Titisee/GAMOLA2
#WARNING: swissprot is the default database; manually choosing nr as Blast_db is too slow, better to stay with swissprot, which is quick!
./Gamola.pl #No Glimmer model and Critica database due to self-extracted ORF; choosing nr.pal or swissprot.pal as Blast_db; COG2014 as COG_db; Pfam-A.hmm as Pfam_db; No Rfam_db; TIGRFAMS_15.0_HMM.LIB as TIGRfam_db
/media/jhuang/Titisee/GAMOLA2/Results/COG_results
#grep "Class: ," roary_182s_95.fa_COG_*
for sample in 10601 10710 11187 11377 12474 12504 1520 1551 2092 2160 2467 289 3455 4694 5862 5863 6166 6464 7618 8007 8406 8784 9157 9373 953; do
sed -i -e 's/Class: ,/Class: None, None/g' roary_182s_95.fa_COG_${sample}
done
./Gamola.pl
#---- 5 ---- update locustag
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880870_genes.fa/MT880870_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880870_genes.fa.gb
sed -i 's/</ /g' file2
sed -i 's/^ //g' file2
cat file1 file2 > MT880870_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880870_genes_.fa.gb
#89600270 Jan 26 13:44 roary_182s_95__.fa.gb
#-- if ran [3.2 selected]--
python ~/Scripts/update_locustag.py MT880870_genes_.fa.gb MT880870_.fasta.fai > MT880870_genes__.fa.gb
> MT880870_genes__.fa.gb #clean old *__.fa.gb
#---- 6.6 ----
cut -d$'\t' -f1 MT880870_.fasta.fai > MT880870_f1
process_gamola_gb_plus.py MT880870_f1 MT880870_genes__.fa.gb > annotated_MT880870.txt #!!
#process_gamola_gb_plus.py may emit Biopython parser warnings such as:
#  /usr/local/lib/python2.7/dist-packages/Bio/GenBank/__init__.py:1047: BiopythonParserWarning: Ignoring invalid location: '1186620..1185596' (some fields may be wrong)
#fix the inverted coordinates and rerun:
sed -i -e 's/1186620\.\.1185596/1185596\.\.1186620/g' roary__.fa.gb
#DEBUG "group_12676" -> "group_12667"
#---- ADD DNA sequences at the end of the table as the last column ----
cut -d',' -f1-1 annotated_MT880870.txt > get_seq_ORF.sh
#extend the file get_seq_ORF.sh to extract all DNA-sequences
#mv the bash file under DIR of MT880870_.fasta.fai
#samtools faidx MT880870_.fasta "PLKLOBMN_00001" > seq_ORF.fasta
#samtools faidx MT880870_.fasta "PLKLOBMN_00002" >> seq_ORF.fasta
#or in the case simply "cp MT880870_.fasta seq_ORF.fasta"
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF.fasta > seq_ORF_.fasta
grep -v ">" seq_ORF_.fasta > seq_ORF__.txt
#add "Sequence" to the first line of seq_ORF__.txt
paste -d',' annotated_MT880870.txt seq_ORF__.txt > annotated_MT880870_.txt
#---- repeat 5-->6.6 for MT880871 and MT880872 --------------
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880871_genes.fa/MT880871_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880871_genes.fa.gb
sed -i 's/</ /g' file2
sed -i 's/^ //g' file2
cat file1 file2 > MT880871_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880871_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880871_genes_.fa.gb MT880871_.fasta.fai > MT880871_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880871_.fasta.fai > MT880871_f1
process_gamola_gb_plus.py MT880871_f1 MT880871_genes__.fa.gb > annotated_MT880871.txt
cp MT880871_.fasta seq_ORF_71.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_71.fasta > seq_ORF_71_.fasta
grep -v ">" seq_ORF_71_.fasta > seq_ORF_71__.txt
#add 'Sequence' in the first line in seq_ORF_71__.txt
paste -d',' annotated_MT880871.txt seq_ORF_71__.txt > annotated_MT880871_.txt
#--
cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880872_genes.fa/MT880872_genes.fa.gb ./
awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880872_genes.fa.gb
sed -i 's/</ /g' file2
sed -i 's/^ //g' file2
cat file1 file2 > MT880872_genes_.fa.gb
iconv -t UTF-8 -f Windows-1252 MT880872_genes_.fa.gb
python ~/Scripts/update_locustag.py MT880872_genes_.fa.gb MT880872_.fasta.fai > MT880872_genes__.fa.gb
#remove the last line 'ORIGIN'
cut -d$'\t' -f1 MT880872_.fasta.fai > MT880872_f1
process_gamola_gb_plus.py MT880872_f1 MT880872_genes__.fa.gb > annotated_MT880872.txt
cp MT880872_.fasta seq_ORF_72.fasta
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_72.fasta > seq_ORF_72_.fasta
grep -v ">" seq_ORF_72_.fasta > seq_ORF_72__.txt
#add 'Sequence' in the first line in seq_ORF_72__.txt
paste -d',' annotated_MT880872.txt seq_ORF_72__.txt > annotated_MT880872_.txt
mv annotated_MT880870_.txt annotated__MT880870_.txt
#~/Tools/csv2xls-0.4/csv_to_xls.py annotated__MT880870_.txt annotated_MT880871_.txt annotated_MT880872_.txt -d',' -o annotated_MT880870-MT880872.xls
A script that takes three file names as command-line arguments: it merges file1 and file2 on matching first (comma-separated) fields, appends the matching file2 content to each file1 line, drops every line whose key does not occur in file2, and writes the merged result to the output file.
import sys
#"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","Gene","Product","PFAM Label","PFAM Match", "PFAM Interpro", "PFAM Product", "TIGR Label", "TIGR Product","COG Label", "COG Code","COG Product","Translation" #delete "Locus Tag",
#python3 merge2.py hS_vs_TSB_output/background annotated_CP052994_2996.txt hS_vs_TSB_output/background_
#python3 merge2.py hS_vs_TSB_output/downregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/downregulated_filtered_
#python3 merge2.py hS_vs_TSB_output/upregulated_filtered annotated_CP052994_2996.txt hS_vs_TSB_output/upregulated_filtered_
#python3 merge2.py infection_vs_nose_output/background annotated_CP052994_2996.txt infection_vs_nose_output/background_
#python3 merge2.py infection_vs_nose_output/downregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/downregulated_filtered_
#python3 merge2.py infection_vs_nose_output/upregulated_filtered annotated_CP052994_2996.txt infection_vs_nose_output/upregulated_filtered_
#sed -i -e 's/HKH70_09175/HKH70_09175 (PLKLOBMN_00173)/g' background_
#sed -i -e 's/HKH70_05165/HKH70_05165 (PLKLOBMN_00016)/g' background_
#sed -i -e 's/HKH70_05170/HKH70_05170 (PLKLOBMN_00017)/g' background_
#sed -i -e 's/HKH70_05175/HKH70_05175 (PLKLOBMN_00018)/g' background_
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_hS_vs_TSB.xls
#python3 ~/Tools/csv2xls-0.4/csv_to_xls.py background_ upregulated_filtered_ downregulated_filtered_ -d',' -o degenes_infection_vs_nose.xls
# Check that all three file names were provided as command-line arguments
if len(sys.argv) != 4:
    print("Usage: python merge2.py file1.txt file2.txt output_file.txt")
    sys.exit(1)

# Open file1 for reading and file2 for editing
with open(sys.argv[1], 'r') as f1, open(sys.argv[2], 'r+') as f2:
    # Read file2 into a dictionary keyed on the first comma-separated field
    f2_dict = {}
    for line in f2:
        key, content = line.strip().split(',', 1)
        f2_dict[key] = content
    # Merge file1 and file2: keep only file1 lines whose key occurs in file2
    merged_lines = []
    for line in f1:
        key = line.split(',')[0]
        if key in f2_dict:
            content = f2_dict[key]
            merged_lines.append(line.strip() + ',' + content)
    ## Move the file pointer to the beginning of file2
    #f2.seek(0)
    ## Truncate file2 to delete all its contents
    #f2.truncate()
    ## Rewrite file2 with the updated content
    #for key, content in f2_dict.items():
    #    f2.write(content + '\n')

# Write the merged content to the output file
with open(sys.argv[3], 'w') as f3:
    f3.write('\n'.join(merged_lines))
Retrieving KEGG Genes Using Bioservices in Python
Biopython's built-in KEGG support (Bio.KEGG) is fairly minimal, so the bioservices library is a convenient way to retrieve and interact with KEGG data. To fetch all available genes in the KEGG database, you would need to iterate through each organism and collect all their genes. Note that this process might take a long time and may not be efficient, as there are thousands of organisms and millions of genes in the KEGG database.
Use the bioservices library to fetch the list of available organisms and retrieve genes for the first few organisms:
from bioservices import KEGG
# Initialize KEGG API
kegg_api = KEGG()
# Get the list of organisms
organisms_raw = kegg_api.list("organism")
organisms = [entry.split("\t")[1] for entry in organisms_raw.split("\n") if entry]
#['hsa', 'ptr', 'pps', 'ggo', 'pon', 'nle', 'hmh', 'mcc', 'mcf', 'mthb', 'mni', 'csab', 'caty', 'panu', 'tge', 'mleu', 'rro', 'rbb', 'tfn', 'pteh', 'cang', 'cjc', 'sbq', 'cimi', 'csyr', 'mmur', 'lcat', 'pcoq', 'oga', 'mmu', 'mcal', ... , 'loki', 'psyt', 'agw', 'arg']
# Limit the number of organisms and genes for demonstration purposes
organism_limit = 3
gene_limit = 10
# Iterate through the organisms
for organism in organisms[:organism_limit]:
    print(f"Organism: {organism}")
    # Get the list of genes for the current organism
    genes = kegg_api.list(f"{organism}").split("\n")[:gene_limit]
    # Iterate through the genes and print gene identifiers
    for gene_entry in genes:
        gene_id = gene_entry.split("\t")[0]
        print(f"Gene ID: {gene_id}")
    print("\n")
print("\n")
#Organism: hsa
#Gene ID: hsa:102466751
#Gene ID: hsa:100302278
#Gene ID: hsa:79501
#Gene ID: hsa:112268260
#Gene ID: hsa:729759
#Gene ID: hsa:124904706
#Gene ID: hsa:105378947
#Gene ID: hsa:113219467
#Gene ID: hsa:81399
#Gene ID: hsa:148398
#...
This code will print the gene identifiers of the first 10 genes for the first 3 organisms in the KEGG database. You can modify the organism_limit and gene_limit variables to change the number of organisms and genes processed.
Remember that fetching all genes from the KEGG database might take a significant amount of time and may not be efficient. It’s usually more practical to focus on specific organisms or pathways of interest.
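In practice, querying a specific entry directly is much faster than crawling; for example, bioservices' get method returns the KEGG flat-file record for a single gene (hsa:7157, human TP53, is just an illustrative identifier):
# Fetch one KEGG entry instead of iterating over everything
entry = kegg_api.get("hsa:7157")  # hsa:7157 = human TP53
print(entry[:500])                # first part of the flat-file record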