Author Archives: gene_x

Add two legends for two dimensions in scatter_3d() in plotly 4.10

add two legends for two dimensions in plotly 4.10 for the functio scatter_3d(), i.e. one legend for the color, one for the symbol

In Plotly 4.10, you can add two legends for two dimensions in scatter_3d() by creating a custom trace for each legend and adding them to the layout. Here’s an example code

import plotly.graph_objects as go

# Sample data
x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]
z = [1, 2, 3, 4, 5]
color = [1, 1, 2, 2, 3]
symbol = [1, 2, 1, 2, 1]

# Create custom trace for color legend
color_legend_trace = go.Scatter3d(
    x=[],
    y=[],
    z=[],
    mode='markers',
    name='Color',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=[],
        showscale=True
    )
)

# Create custom trace for symbol legend
symbol_legend_trace = go.Scatter3d(
    x=[],
    y=[],
    z=[],
    mode='markers',
    name='Symbol',
    marker=dict(
        size=5,
        opacity=0.7,
        symbol=[],
        showscale=True
    )
)

# Populate custom traces with data
for i in range(len(x)):
    color_legend_trace['x'] += [x[i]]
    color_legend_trace['y'] += [y[i]]
    color_legend_trace['z'] += [z[i]]
    color_legend_trace['marker']['color'] += [color[i]]

    symbol_legend_trace['x'] += [x[i]]
    symbol_legend_trace['y'] += [y[i]]
    symbol_legend_trace['z'] += [z[i]]
    symbol_legend_trace['marker']['symbol'] += [symbol[i]]

# Create main scatter plot trace
scatter_trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=color,
        symbol=symbol,
        showscale=False
    )
)

# Create layout with two legends
layout = go.Layout(
    scene=dict(
        aspectmode='data',
        legend=dict(
            itemsizing='constant'
        )
    ),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    ),
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

# Add custom traces and main scatter plot trace to figure
fig = go.Figure(data=[color_legend_trace, symbol_legend_trace, scatter_trace], layout=layout)

# Show figure
fig.show()

This code will create a 3D scatter plot with two legends: one for the color and one for the symbol. The color_legend_trace is a custom trace that shows the color legend, while the symbol_legend_trace is a custom trace that shows the symbol legend. The scatter_trace is the main scatter plot trace that contains the data points. In the layout, we set the legend property of the scene to None to remove the default legend, and add two custom legends using the legend property of the layout. Finally, we create a Figure object with the custom traces and the main scatter plot trace, and show it.


DEBUGGED!

import plotly.graph_objects as go

# Sample data
x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]
z = [1, 2, 3, 4, 5]
color = ['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656']
symbol = ['circle', 'circle-open', 'square', 'square-open', 'diamond']

# Create custom trace for color legend
color_legend_trace = go.Scatter3d(
    x=[1, 2, 3, 4, 5],
    y=[5, 4, 3, 2, 1],
    z=[1, 2, 3, 4, 5],
    mode='markers',
    name='Color',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656'],
        showscale=True
    )
)

# Create custom trace for symbol legend
symbol_legend_trace = go.Scatter3d(
    x=[1, 2, 3, 4, 5],
    y=[5, 4, 3, 2, 1],
    z=[1, 2, 3, 4, 5],
    mode='markers',
    name='Symbol',
    marker=dict(
        size=5,
        opacity=0.7,
        symbol=['circle', 'circle-open', 'square', 'square-open', 'diamond'],
        showscale=True
    )
)

#Error due to the tuple type.
## Populate custom traces with data
#for i in range(len(x)):
#    color_legend_trace['x'] += [x[i]]
#    color_legend_trace['y'] += [y[i]]
#    color_legend_trace['z'] += [z[i]]
#    color_legend_trace['marker']['color'] += [color[i]]
#    
#    symbol_legend_trace['x'] += [x[i]]
#    symbol_legend_trace['y'] += [y[i]]
#    symbol_legend_trace['z'] += [z[i]]
#    symbol_legend_trace['marker']['symbol'] += [symbol[i]]

# Create main scatter plot trace
scatter_trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=color,
        symbol=symbol,
        showscale=False
    )
)

#scene = go.Scene(
#    xaxis=dict(title='X Axis'),
#    yaxis=dict(title='Y Axis'),
#    zaxis=dict(title='Z Axis'),
#    # Remove the 'legend' property from the Scene object
#)

# Create layout with two legends
layout = go.Layout(
    #scene=dict(
    #    aspectmode='data'
    #),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    ),
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

# Add custom traces and main scatter plot trace to figure
fig = go.Figure(data=[color_legend_trace, symbol_legend_trace, scatter_trace], layout=layout)
#fig = go.Figure(data=[scatter], layout=go.Layout(
#    title='My 3D Scatter Plot',
#    legend=dict(
#        title='Legend Title',
#        font=dict(size=12)
#    )
#))

# Show figure
#fig.show()
fig.write_image("fig2.svg")

TODO: categorical legends!

Add two legends for two dimensions in plotly, i.e. one legend for the color, one for the symbol in plotly

import plotly.graph_objects as go
import pandas as pd

# Create a sample dataframe
df = pd.DataFrame({'category_1': ['A', 'B', 'A', 'B', 'A', 'B'],
                   'category_2': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
                   'value_1': [1, 2, 3, 4, 5, 6],
                   'value_2': [7, 8, 9, 10, 11, 12]})

fig = go.Figure()

# Create scatter plot for category_1
for cat in df['category_1'].unique():
    fig.add_trace(go.Scatter(x=df.loc[df['category_1'] == cat, 'value_1'],
                             y=df.loc[df['category_1'] == cat, 'value_2'],
                             mode='markers',
                             name=cat,
                             legendgroup='category_1',
                             marker=dict(symbol='circle')))

# Create scatter plot for category_2
symbols = {'X': 'star', 'Y': 'diamond'}
for cat in df['category_2'].unique():
    fig.add_trace(go.Scatter(x=df.loc[df['category_2'] == cat, 'value_1'],
                             y=df.loc[df['category_2'] == cat, 'value_2'],
                             mode='markers',
                             name=cat,
                             legendgroup='category_2',
                             marker=dict(symbol=symbols[cat], color='black'),
                             showlegend=True,
                             visible='legendonly'))

# Update legend properties
fig.update_layout(legend=dict(tracegroupgap=30))

# Add custom legend group names using annotations
fig.update_layout(
    annotations=[
        go.layout.Annotation(
            text="Group 1",
            x=1.05,
            y=1,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=14, color='black'),
            bgcolor='rgba(255, 255, 255, 0.8)',
            bordercolor='black',
            borderwidth=1,
            xanchor='left',
            yanchor='top',
            valign='top'
        ),
        go.layout.Annotation(
            text="Group 2",
            x=1.05,
            y=0.65,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=14, color='black'),
            bgcolor='rgba(255, 255, 255, 0.8)',
            bordercolor='black',
            borderwidth=1,
            xanchor='left',
            yanchor='top',
            valign='top'
        )
    ]
)

fig.show()

Draw scatter_3d using plotly 4.10

Simplied but not working version.

import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# Provided dataframe
df = pd.DataFrame({
    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
})

# Create PCA plot with 3D scatter
fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3', color='condition', symbol='donor',
                    title='PCA with 3 dimensions')

# Custom legend for condition
conditions = df['condition'].unique()
colors = px.colors.qualitative.Plotly[:len(conditions)]

for i, cond in enumerate(conditions):
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               marker=dict(size=6, color=colors[i]),
                               showlegend=True, name=cond))

# Custom legend for donor
donors = df['donor'].unique()
symbols = ['circle', 'diamond']

for i, donor in enumerate(donors):
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               marker=dict(size=6, color='black', symbol=symbols[i]),
                               showlegend=True, name=donor))

# Annotations for the legend blocks
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ]
)

fig.show()

Draw scatter_3d using plotly 4.10 (SUCCESSFUL)

import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# Read in data as a pandas dataframe
#df = pd.DataFrame({
#    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
#    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
#    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
#    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
#    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_28PCs.csv', index_col=0, header=0)

# Fit PCA model to reduce data dimensions to 3
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])

# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]

# Create PCA plot with 3D scatter
fig = go.Figure()

#['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
condition_color_map = {'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
donor_symbol_map = {'DI': 'circle', 'DII': 'cross'}

for donor, donor_symbol in donor_symbol_map.items():
    for condition, condition_color in condition_color_map.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=10, opacity=0.8, color=condition_color, symbol=donor_symbol)))

for donor, donor_symbol in donor_symbol_map.items():
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               name=donor,
                               showlegend=True,
                               marker=dict(size=10, opacity=1, color='black', symbol=donor_symbol),
                               hoverinfo='none'))

# Annotations for the legend blocks
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ],
    scene=dict(
        aspectmode='cube',
        xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 36% v.'),
        yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 17% v.'),
        zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 15% variance'),
        bgcolor='white'
    ),
    margin=dict(l=5, r=5, b=5, t=5)  # Adjust the margins to prevent clipping of axis titles
)

#fig.show()
fig.write_image("fig1.svg")

DAMIAN: A Comprehensive Online Platform for the Detection and Analysis of Viral and Microbial Infectious Agents using NGS Data

Abstract:

Next-generation sequencing (NGS) has revolutionized the field of genomics, allowing for rapid detection and analysis of infectious agents. However, the complexity of NGS data and the lack of user-friendly tools have limited the widespread adoption of these techniques. Here, we present DAMIAN (Detection & Analysis of viral and Microbial Infectious Agents by NGS), a comprehensive online platform designed to facilitate the identification and characterization of viral and microbial pathogens from NGS data. DAMIAN streamlines the process of data input, analysis, and visualization, making it accessible to researchers with varying levels of computational expertise.

Introduction:

1.1 Background and motivation

Discuss the challenges faced by researchers in the detection and analysis of viral and microbial infectious agents using NGS data.
Explain the need for a user-friendly, comprehensive online platform to address these challenges.

1.2 Overview of DAMIAN

Introduce DAMIAN as a solution to these challenges.
Briefly describe the features and functionality of DAMIAN.

Methods:

2.1 Data input and preprocessing

Explain how users can upload their NGS data to the platform.
Describe the preprocessing steps performed by DAMIAN, including quality control and adapter trimming.

2.2 Identification of infectious agents

Detail the algorithms and databases employed by DAMIAN to identify viral and microbial pathogens in the input data.

2.3 Data analysis and visualization

Discuss the various analysis and visualization tools available within DAMIAN, including phylogenetic analysis, genome assembly, and functional annotation.

Results:

3.1 Case studies

Present case studies showcasing the effectiveness and utility of DAMIAN in identifying and analyzing infectious agents from NGS data.

3.2 Comparison with existing tools

Compare the performance of DAMIAN with other available tools and platforms.

Discussion:

4.1 Advantages of DAMIAN

Highlight the benefits of using DAMIAN, such as ease of use, comprehensiveness, and accessibility.

4.2 Future developments and improvements

Discuss potential future enhancements to the platform, including the incorporation of new algorithms, databases, and analysis tools.

Conclusion:

Summarize the main points of the manuscript and reiterate the value of DAMIAN as a comprehensive online platform for the detection and analysis of viral and microbial infectious agents using NGS data.

A simple machine learning example using Python using scikit-learn

A simple machine learning example using Python and the scikit-learn library for the classification of the Iris dataset. The Iris dataset is a classic dataset containing measurements of iris flowers and their species. We will use a Decision Tree classifier to classify the species based on the measurements.

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the dataset into train and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Decision Tree classifier and train it on the train set
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the Decision Tree classifier: {accuracy:.2f}")

This code will output the accuracy of the Decision Tree classifier on the Iris dataset. We can experiment with other classifiers and their parameters to see how the results change.

damian.py

#!/usr/bin/env python3

import os
import sys
import logging
import tempfile
import shutil
import datetime
from pathlib import Path
import psycopg2

from config import *

from lib.options import Options
from lib.wrap.trimmomatic import Trimmomatic
from lib.wrap.bowtie2 import Bowtie2
from lib.wrap.hmmsearch import HMMsearch
from lib.wrap.assembler import Assembler
from lib.wrap.trinity import Trinity
from lib.wrap.spades import SPAdes
from lib.wrap.idba import IDBA
from lib.wrap.no_assembler import No_Assembler
from lib.wrap.blast import Blast
from lib.wrap.blastn import Blastn
from lib.wrap.blastp import Blastp
from lib.usage import Usage
from lib.prepare_run import PrepareRun
from lib.contig_ops import ContigOps
from lib.ranking_ops import RankingOps
from lib.task_manager import TaskManager
from lib.taxonomy_ops import TaxonomyOps
from lib.check_dependencies import CheckDependencies

START_TIME = datetime.datetime.now()

options = Options.parse(sys.argv[1:])
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr)
logger.addHandler(handler)

ContigOps = ContigOps()
RankingOps = RankingOps()
TaskManager = TaskManager()

checkpoint = 0
trimmomatic = Trimmomatic(TRIMMOMATIC_PATH, TRIMMOMATIC_PARAMS, TRIMMOMATIC_PE_PARAMS,
                          TRIMMOMATIC_SE_PARAMS, options.threads, logger)
bowtie2 = Bowtie2(BOWTIE2_PATH, BOWTIE2_PARAMS, options.threads, logger)

assembler = None
if options.no_assembly:
    assembler = No_Assembler(None, None, options.threads, logger)
else:
    if ASSEMBLER == "trinity":
        assembler = Trinity(TRINITY_PATH, TRINITY_PARAMS, options.threads, logger)
    elif ASSEMBLER == "spades":
        assembler = SPAdes(SPADES_PATH, SPADES_PARAMS, options.threads, logger)
    elif ASSEMBLER == "idba":
        assembler = IDBA_ud(IDBA_PATH, IDBA_PARAMS, options.threads, logger)

hmmsearch = HMMsearch(HMMSEARCH_PATH, f"{HMMSEARCH_PARAMS} --domE {HMMSEARCH_EVALUE}",
                      PFAM_PATH, options.threads, logger)
blastn = Blastn(BLASTN_PATH, BLASTN_PARAMS, BLASTN_NT_PATH, options.threads, logger)
blastp = Blastp(BLASTP_PATH, BLASTP_PARAMS, BLASTP_NR_PATH, options.threads, logger)

def finish(failure_msg = nil)
  if @options.clean
    begin
      FileUtils.rm_rf(@options.derived[:sample_folder])
    rescue => e
      @logger.error "#{e}\nFailed to remove folder '#{@options.derived[:sample_folder]}'. Please remove it manually."
    end
  end

  pp @options.derived if @options.debug

  if failure_msg
    checkpoint(failure_msg, 20)
    exit(false)
  else
    checkpoint('Analysis Complete')
    cmd = "INSERT INTO analysis (finished) VALUES ('now')"
    @logger.debug(cmd)
    @conn.exec(cmd)
    exit(true)
  end
end

# ... (other parts of the script) ...

checkpoint('Generating summary report')
require_relative File.join('lib', 'wrap', 'report')
report = Report.new(@options, @logger)
report.create

# Upload results to the user-specified storage (e.g., AWS S3, Google Cloud Storage)
checkpoint('Uploading results')
if @options.upload_results
  require_relative File.join('lib', 'wrap', 'upload_results')
  uploader = UploadResults.new(@options, @logger)
  uploader.upload
end

finish()

How to use Biopython to perform phylogenetic analysis?

Biopython is a powerful library for bioinformatics that provides tools for manipulating biological sequences, working with 3D structures, performing genome analysis, and more. Among its many features, it also has tools for phylogenetic analysis.

Here’s a simple example of how you can use Biopython to perform phylogenetic analysis using the following steps:

  • Obtain sequence data (e.g., from a FASTA file or GenBank).
  • Perform multiple sequence alignment (MSA) using a suitable algorithm.
  • Construct a phylogenetic tree using a tree-building method.
  • Visualize the phylogenetic tree.

Note that Biopython does not have built-in tools for performing multiple sequence alignment, so you can use an external tool like MUSCLE or Clustal Omega for this step. However, Biopython can parse the output of these tools.

Here’s an example workflow:

Step 1: Obtain sequence data

Let’s assume you have a FASTA file named sequences.fasta containing multiple sequences.

Step 2: Perform multiple sequence alignment

First, install the necessary libraries:

pip install biopython

Now, align your sequences using an external tool like MUSCLE or Clustal Omega. You can also use Biopython’s AlignIO module to parse the output.

For example, if you’re using Clustal Omega, you can run:

clustalo -i sequences.fasta -o aligned_sequences.clustal --outfmt=clustal
Step 3: Construct a phylogenetic tree

We will use the Phylo module in Biopython to construct a tree using the neighbor-joining method.

from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo

# Read the alignment file
alignment = AlignIO.read("aligned_sequences.clustal", "clustal")

# Calculate the distance matrix
calculator = DistanceCalculator("identity")
distance_matrix = calculator.get_distance(alignment)

# Construct the tree using the neighbor-joining method
constructor = DistanceTreeConstructor()
tree = constructor.nj(distance_matrix)

# Save the tree to a file in Newick format
Phylo.write(tree, "phylogenetic_tree.newick", "newick")
Step 4: Visualize the phylogenetic tree

You can visualize the tree using Biopython’s Phylo.draw() function or export it to a file and use external tools like FigTree, iTOL, or Dendroscope.

To visualize the tree using Biopython’s Phylo.draw() function:

import matplotlib.pyplot as plt

# Draw the tree
Phylo.draw(tree)

# Save the tree as an image
plt.savefig("phylogenetic_tree.png")

These steps provide a basic example of how to perform phylogenetic analysis using Biopython. You can further customize the analysis and visualization by exploring the various features and options available in the library.

Powerful oneliners for quick data processing in bioinformatics

There are many command-line tools and utilities that can be useful in bioinformatics for quick data processing, analysis, and manipulation. Some of these oneliner tools include:

  • awk: A versatile text processing tool that can be used to filter, reformat, and transform data.

    awk '{print $1}' input.txt
  • sed: A stream editor for filtering and transforming text.

    sed 's/A/T/g' input.txt > output.txt
  • grep: A tool to search for patterns in text files.

    grep “ATG” input.fasta

  • sort: Sorts lines in a text file.

    sort input.txt > sorted_input.txt

  • uniq: Removes duplicate lines from a sorted file or displays the number of occurrences of each line.

    uniq -c input.txt > unique_counts.txt

  • wc: Counts lines, words, and characters in a file.

    wc -l input.txt

  • cut: Removes sections from each line of a file.

    cut -f1,3 input.txt > output.txt

  • paste: Joins corresponding lines of multiple files.

    paste file1.txt file2.txt > combined.txt

  • tr: Translates or deletes characters.

    tr ‘atcg’ ‘TAGC’ < input.txt > output.txt

  • curl: Transfers data from or to a server.

    curl -O “https://example.com/file.fasta

  • bioawk: An extension of awk with built-in functions for biological data.

    bioawk -c fastx ‘{print $name, length($seq)}’ input.fasta

  • seqkit: A cross-platform toolkit for FASTA/Q file manipulation.

    seqkit stat input.fasta

  • Samtools is a widely-used suite of tools for manipulating and analyzing high-throughput sequencing (HTS) data in the SAM, BAM, and CRAM formats. Here are some examples of how you can use Samtools for various tasks:

    • Convert SAM to BAM format: To convert a SAM (Sequence Alignment/Map) file to a compressed binary BAM (Binary Alignment/Map) file, you can use the samtools view command with the -bS option: samtools view -bS input.sam > output.bam

    • Sort a BAM file: To sort a BAM file by genomic coordinates, you can use the samtools sort command: samtools sort input.bam -o sorted_output.bam

    • Index a sorted BAM file: To create an index for a sorted BAM file, which allows you to quickly access alignments overlapping particular genomic regions, you can use the samtools index command:

      samtools index sorted_output.bam

    • Generate an alignment statistics report: To create a summary report of alignment statistics, such as the number of mapped and unmapped reads, you can use the samtools flagstat command:

      samtools flagstat input.bam > report.txt

    • Extract reads aligned to a specific region: To extract reads aligned to a specific genomic region, you can use the samtools view command with the -h option and the region of interest:

      samtools view -h input.bam chr1:10000-20000 > region_output.bam

    • Filter alignments: To filter alignments based on specific criteria, such as minimum mapping quality, you can use the samtools view command with the -q option:

      samtools view -q 20 input.bam > filtered_output.bam

    • Generate a pileup: To create a pileup file from a BAM file, which displays the sequencing depth at each position of the reference genome, you can use the samtools mpileup command:

      samtools mpileup -f reference.fasta input.bam > output.pileup

  • Bcftools is a set of utilities for variant calling and manipulating VCF (Variant Call Format) and BCF (Binary Call Format) files. Here are some examples of how you can use Bcftools for various tasks:

    • Call variants: To call variants from a BAM or CRAM file using the consensus caller, you can use the bcftools mpileup command followed by bcftools call:

      bcftools mpileup -Ou -f reference.fasta input.bam | bcftools call -mv -Ov -o output.vcf

    • Filter variants: To filter variants based on specific criteria, such as quality or depth, you can use the bcftools filter command:

      bcftools filter -i ‘QUAL > 20 && DP > 10’ input.vcf -o filtered_output.vcf

    • View VCF/BCF file: To view the contents of a VCF or BCF file, you can use the bcftools view command:

      bcftools view input.vcf

    • Sort a VCF file: To sort a VCF file by genomic coordinates, you can use the bcftools sort command:

      bcftools sort input.vcf -o sorted_output.vcf

    • Index a VCF file: To create an index for a VCF or BCF file, which allows you to quickly access variants overlapping specific genomic regions, you can use the bcftools index command:

      bcftools index sorted_output.vcf

    • Concatenate multiple VCF files: To concatenate multiple VCF files, ensuring that they have the same sample columns in the same order, you can use the bcftools concat command:

      bcftools concat input1.vcf input2.vcf -o combined_output.vcf

    • Generate consensus sequences: To create consensus sequences by applying VCF variants to a reference genome, you can use the bcftools consensus command:

      bcftools consensus -f reference.fasta input.vcf.gz > consensus.fasta

    • Normalize indels: To normalize indels in a VCF file (left-align and trim indels), you can use the bcftools norm command:

      bcftools norm -f reference.fasta input.vcf -o normalized_output.vcf

  • Bedtools is a powerful suite of tools for working with genomic intervals in various file formats, such as BED, GFF/GTF, and VCF. Here are some examples of how you can use Bedtools for various tasks:

    • Intersect intervals: To find overlapping intervals between two files, you can use the bedtools intersect command:

      bedtools intersect -a input1.bed -b input2.bed > output.bed

    • Merge intervals: To merge overlapping or adjacent intervals in a single file, you can use the bedtools merge command:

      bedtools merge -i input.bed > output.bed

    • Subtract intervals: To subtract intervals in one file from another, you can use the bedtools subtract command:

      bedtools subtract -a input1.bed -b input2.bed > output.bed

    • Get genomic sequences: To extract sequences from a reference genome corresponding to intervals in a BED file, you can use the bedtools getfasta command:

      bedtools getfasta -fi reference.fasta -bed input.bed -fo output.fasta

    • Sort intervals: To sort genomic intervals by chromosome and start position, you can use the bedtools sort command:

      bedtools sort -i input.bed > sorted_output.bed

    • Calculate coverage: To compute the depth at each position or the depth for each interval in a BED file, you can use the bedtools coverage command:

    bedtools coverage -a input1.bed -b input2.bed > output.bed

    • Find closest features: To find the closest feature in another file for each feature in a BED file, you can use the bedtools closest command:

      bedtools closest -a input1.bed -b input2.bed > output.bed

    • Compute statistics: To calculate various summary statistics for each feature in a BED file, such as the mean, median, or standard deviation of scores, you can use the bedtools groupby command:

      bedtools groupby -i input.bed -g 1 -c 5 -o mean,median,stdev > output.bed

You can often combine these tools using pipes (|) to create powerful oneliners for complex data processing tasks.

END

Draw scatter_3d using plotly 4.10 (+ellipses?)

import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from scipy.linalg import eigh, sqrtm

# Read in data as a pandas dataframe
#df = pd.DataFrame({
#    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
#    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
#    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
#    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
#    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_28PCs.csv', index_col=0, header=0)
df['condition'] = df['condition'].replace("GFP+mCh d9/12", "ctrl LTtr+sT d9/12")
df['condition'] = df['condition'].replace("sT+LTtr d9/12", "LTtr+sT d9/12")
df['condition'] = df['condition'].replace("GFP d3", "ctrl LT/LTtr d3")
df['condition'] = df['condition'].replace("GFP d8", "ctrl LT/LTtr d8")
df['condition'] = df['condition'].replace("mCh d3", "ctrl sT d3")
df['condition'] = df['condition'].replace("mCh d8", "ctrl sT d8")

# Fit PCA model to reduce data dimensions to 3
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])

# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]

# Create PCA plot with 3D scatter
fig = go.Figure()

#['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
# if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol)

#decrease diamond size to 6 while keep the circle as size 10 in the following code:
#'rgb(128, 150, 128)'
#I need three families of colors, always from light to deep, the first one should close to grey.
#the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' 
#the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
#the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'

condition_color_map_untreated = {'untreated':'black'}
donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
#condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
condition_color_map = {
    'ctrl LTtr+sT d9/12': 'black',
    'LTtr+sT d9/12': '#a14a1a',

    'ctrl LT/LTtr d3': '#D3D3D3',
    'ctrl LT/LTtr d8': '#A9A9A9',
    'LT d3': '#b2df8a',
    'LT d8': '#33a02c',
    'LTtr d3': '#a6cee3',
    'LTtr d8': '#1f78b4',

    'ctrl sT d3': '#696969',
    'ctrl sT d8': '#2F4F4F',
    'sT d3': '#fb9a99',
    'sT d8': '#e31a1c',
    'sT+LT d3': 'magenta'
}
donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}

for donor, donor_symbol in donor_symbol_map_untreated.items():
    for condition, condition_color in condition_color_map_untreated.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))

for donor, donor_symbol in donor_symbol_map.items():
    for condition, condition_color in condition_color_map.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))

for donor, donor_symbol in donor_symbol_map.items():
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               name=donor,
                               legendgroup=f'{donor}',
                               showlegend=True,
                               marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
                               hoverinfo='none'))

'''
# Calculate mean and covariance for each subset of data for eclipse drawing
DI_mask = (df['donor'] == 'DI') & (~df['condition'].isin(['untreated']))
DII_mask = (df['donor'] == 'DII') & (~df['condition'].isin(['untreated']))
untreated_mask = (df['condition'] == 'untreated')

mean_DI = df.loc[DI_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_DI = df.loc[DI_mask, ['PC1', 'PC2', 'PC3']].cov().values

mean_DII = df.loc[DII_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_DII = df.loc[DII_mask, ['PC1', 'PC2', 'PC3']].cov().values

mean_untreated = df.loc[untreated_mask, ['PC1', 'PC2', 'PC3']].mean().values
cov_untreated = df.loc[untreated_mask, ['PC1', 'PC2', 'PC3']].cov().values

# Function to create ellipses
def create_ellipse(center, radii, rotation_matrix, resolution=100):
    u = np.linspace(0, 2 * np.pi, resolution)
    v = np.linspace(0, np.pi, resolution)
    x = radii[0] * np.outer(np.cos(u), np.sin(v))
    y = radii[1] * np.outer(np.sin(u), np.sin(v))
    z = radii[2] * np.outer(np.ones_like(u), np.cos(v))
    for i in range(len(x)):
        for j in range(len(x)):
            [x[i, j], y[i, j], z[i, j]] = np.dot([x[i, j], y[i, j], z[i, j]], rotation_matrix) + center
    return x, y, z
def ellipsoid_params(mean, cov, conf_interval=0.9):
    vals, vecs = eigh(cov)
    vals = vals[::-1]
    vecs = vecs[:, ::-1]
    radii = np.sqrt(vals * chi2.ppf(conf_interval, df=3))
    rotation_matrix = vecs
    return mean, radii, rotation_matrix

from scipy.stats import chi2
center_DI, radii_DI, rotation_matrix_DI = ellipsoid_params(mean_DI, cov_DI)
center_DII, radii_DII, rotation_matrix_DII = ellipsoid_params(mean_DII, cov_DII)
#center_untreated, radii_untreated, rotation_matrix_untreated = ellipsoid_params(mean_untreated, cov_untreated)

# Generate ellipses using create_ellipse function
x_DI, y_DI, z_DI = create_ellipse(center_DI, radii_DI, rotation_matrix_DI)
x_DII, y_DII, z_DII = create_ellipse(center_DII, radii_DII, rotation_matrix_DII)
#x_untreated, y_untreated, z_untreated = create_ellipse(center_untreated, radii_untreated, rotation_matrix_untreated)

fig.add_trace(go.Mesh3d(x=x_DI.flatten(), y=y_DI.flatten(), z=z_DI.flatten(),
                        alphahull=5,
                        opacity=0.1,
                        color='blue',
                        name='DI ellipse',
                        showlegend=False))
fig.add_trace(go.Mesh3d(x=x_DII.flatten(), y=y_DII.flatten(), z=z_DII.flatten(),
                        alphahull=5,
                        opacity=0.1,
                        color='red',
                        name='DII ellipse',
                        showlegend=False))
'''

# Annotations for the legend blocks
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ],
    scene=dict(
        aspectmode='cube',
        xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% v.'),
        yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 17% v.'),
        zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black',backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC3: 15% variance'),
        bgcolor='white'
    ),
    margin=dict(l=5, r=5, b=5, t=0)  # Adjust the margins to prevent clipping of axis titles
)

#fig.show()
fig.write_image("fig4.svg")