Category Archives: Articles

draw pca plots with ggplot2 (2D) and plotly (3D)

#TODOs: next week
#- try install ggplot2_3d
#- try install kaleido

TODO: using python to generate the 3D plot! https://pypi.org/project/plotly/

# -- before pca --
# Render the default DESeq2 PCA plot (PC1 vs PC2) of the transformed counts
# `rld` into SVG files, one per grouping variable.
# NOTE(review): assumes `rld` is a DESeqTransform (rlog/vst) defined earlier
# in the session — confirm before sourcing this chunk standalone.
#png("pca.png", 1200, 800)
svg("pca.svg")
plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, "batch")
dev.off()

#TODO:adding label in the figure, change Donor I in blue and donor II in orange
#https://loading.io/color/feature/Paired-12/
# Same PCA, colored by donor instead of replicate.
svg("pca2.svg")
#plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
plotPCA(rld, intgroup = c("donor"))
#plotPCA(rld, "batch")
dev.off()
#TODO: adding label in the figure
# Third variant: colored by the "replicates2" grouping.
svg("pca3.svg")
plotPCA(rld, intgroup=c("replicates2"))
dev.off()

#https://loading.io/color/feature/Paired-12/
#https://support.bioconductor.org/p/66404/
# -- calculate PC3 from rld --
library(genefilter)
# Recompute the PCA by hand (mirroring DESeq2::plotPCA internals) so that
# PC3 is available: keep the `ntop` most variable genes, transpose so rows
# are samples, then run prcomp.
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
# First three principal coordinates, one row per sample.
pc$x[,1:3]

#To my case:
# Variant using ALL genes (no ntop filter); note this overwrites `mat`/`pc`
# above, so PC values will differ slightly from the filtered version.
mat <- t( assay(rld) )
pc <- prcomp(mat)
#pca <- yourFavoritePCA( mat )

pc$x[,1:3]
#                          PC1         PC2        PC3
#untreated DI      -27.5379705   1.4478299  -6.389731
#untreated DII     -28.3320463   0.6794066   2.073768
#mCh d3 DII          2.8988953  -6.4372647  10.252829
#sT d3 DII           5.1869876   2.6116282  13.816117
#mCh d8 DII        -20.8047275   1.0708861   3.394721
#sT d8 DII          -4.5144119  19.6230473   8.357902
#mCh d3 DI          -4.5690693  -8.8938297  -7.391567
#sT d3 DI           -7.6326832   5.3781061   2.214181
#mCh d8 DI           0.8536828  -5.0593045 -13.325567
#sT d8 DI            1.9232111  24.8795741  -4.162946
#GFP d3 DII        -12.5042914  -3.3424106  15.207755
#LTtr d3 DII         5.2309178  -9.6124712   8.328132
#GFP d8 DII        -13.0652347  -8.2058086  15.078469
#LTtr d8 DII        13.0678654  -2.0677676   9.188943
#GFP d3 DI         -13.9999251  -1.4988226  -3.335085
#LTtr d3 DI          2.6090782  -9.5753559 -10.022324
#GFP d8 DI         -12.4430571  -6.0670545 -14.725450
#LTtr d8 DI          8.9794396   3.4918629 -14.410118
#LT d8 DII          18.8388058   0.2459081   2.334700
#LT d8 DI           15.2986278  -0.6055500 -11.034778
#GFP+mCh d9/12 DI  -17.3162152   3.2939931  -6.917358
#sT+LTtr d9/12 DI    6.8517730  17.9282911  -6.209778
#GFP+mCh d9/12 DII   2.0874834  -6.7379107   8.810602
#sT+LTtr d9/12 DII  19.3883422  19.6033774   4.314808
#LT d3 DI            6.5376031  -8.5766236  -6.500155
#LT d3 DII          17.8400725 -11.7362896   1.117396
#sT+LT d3 DI        16.6029944  -7.7951798  -5.593658
#sT+LT d3 DII       18.5238521  -4.0422674   5.528193

# vs.
#data
#                     PC1        PC2         group condition donor          name
#untreated DI  -27.537970  1.4478299  untreated:DI untreated    DI  untreated DI
#untreated DII -28.332046  0.6794066 untreated:DII untreated   DII untreated DII
#mCh d3 DII      2.898895 -6.4372647    mCh d3:DII    mCh d3   DII    mCh d3 DII

# -- construct a data structure (merged_df) as above with data and pc --
library(ggplot2)
# Get plotPCA's own data frame (PC1/PC2 + group/condition/donor/name) so its
# metadata columns can be combined with the manually computed PC1-PC3.
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
df_pc <- data.frame(pc$x[,1:3])

# Sanity check: both data frames are keyed by the same sample names.
identical(rownames(data), rownames(df_pc)) #-->TRUE
## define the desired order of row names
#desired_order <- rownames(data)
## sort the data frame by the desired order of row names
#df <- df[match(desired_order, rownames(df_pc)), ]

# Drop plotPCA's PC1/PC2 (ntop-filtered values are recomputed in df_pc).
data$PC1 <- NULL
data$PC2 <- NULL
# NOTE(review): merge(..., by = "row.names") sorts rows by row name by
# default (sort = TRUE), so merged_df row order may differ from `data`.
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # remove the "name" column
merged_df$name <- NULL 
# Final column order: three PCs followed by the sample metadata.
merged_df <- merged_df[, c("PC1","PC2","PC3","group","condition","donor")]

# -- draw 3D with merged_df using plot3D --
#https://stackoverflow.com/questions/45052188/how-to-plot-3d-scatter-diagram-using-ggplot
# Experiments with 3-D plotting packages (gg3D, plot3D, scatterplot3d, rgl,
# plotly). Kept as a scratch log of what was tried.
devtools::install_github("AckerDWM/gg3D")
library("gg3D")
# NOTE(review): this png device is opened but never closed before the next
# png() below — the dev.off() at the end of this chunk closes the SECOND
# device, leaving this one dangling. Confirm intent.
png("pca10.png",800,800)
#svg("pca10.svg",10,10)
#methods(class = "prcomp")
#summary(pc) #--> Proportion of Variance  0.3647 0.1731 0.1515
#percentVar <- round(100 * attr(data, "percentVar"))
# Hard-coded % variance for PC1-PC3 (from summary(pc) above).
percentVar <- c(36,17,15)
#scatterplot3d

#Unfortunately, ggplot does not support 3D plotting. It is designed for creating 2D plots and visualizations in R. However, there are other packages available in R for creating 3D plots, such as plot3D, scatterplot3d, and rgl. These packages can be used to create 3D scatter plots, surface plots, and more complex 3D visualizations. You can install and load these packages in R using the following commands:
install.packages("plot3D")
library(plot3D)
install.packages("scatterplot3d")
library(scatterplot3d)
install.packages("rgl")
library(rgl)
#Once you have loaded these packages, you can create 3D plots using their respective functions. For example, you can create a 3D scatter plot using the plot3D package with the following code:

#https://plotly.com/r/3d-scatter-plots/
library(plotly)
data(mtcars)
# NOTE(review): plot_ly() returns an htmlwidget; it is not drawn into the
# png device, so xxx.png will likely be empty — confirm.
png("xxx.png", 1200, 800)
plot_ly(mtcars, x = ~mpg, y = ~wt, z = ~qsec, type = "scatter3d", mode = "markers")
dev.off()

#  zlab(paste0("PC3: ",percentVar[3],"% variance")) + 
#scatterplot3d(merged_df[,c("PC1","PC2","PC3")], pch=16, color="blue", main="3D Scatter Plot")
# labs(x = "PC1", y = "PC2", z = "PC3") +
#axes_3D() + stat_3D() +
# NOTE(review): the next line is a leftover argument fragment — it is a
# syntax error if this file is source()'d. Kept for reference only.
colors = "Set1",
#marker = list(symbol = ~shapes[group])
#using the corresponding keywords ("square", "triangle-up", "diamond", etc.). 

# Axis labels carrying the % variance of each PC (percentVar defined above).
labs <- list(x = paste0("PC1: ",percentVar[1],"% variance"), y = paste0("PC2: ",percentVar[2],"% variance"), z = paste0("PC3: ",percentVar[3],"% variance"))
#ggplot(merged_df, aes(x=PC1, y=PC2, z=PC3, color=condition, shape=donor)) +

#https://stackoverflow.com/questions/75452609/update-color-in-different-marker-in-plotly-r
# Toy data for the plotly legend experiments: iris plus two categorical
# columns; the 2- and 3-value vectors recycle evenly over 150 rows.
dt <- iris
dt$shape_1 <- c("Yes","No")
dt$color_1 <- c("Medium","Large","Small")

library(plotly)
# NOTE(review): there is no CRAN package "kaleido", so library(kaleido)
# failed. In the R plotly package, static image export is done with
# plotly::save_image(), which drives the Python "kaleido" library through
# reticulate (install once with reticulate::py_install("kaleido")).

# Scatter of Sepal.Length by row index: colour encodes Species, marker
# symbol encodes the synthetic Yes/No column shape_1, fixed marker size.
fig <- plot_ly(dt,
        x=1:nrow(iris),
        y=~Sepal.Length,
        type="scatter",
        mode='markers',
        color=~Species,
        colors = c("#4477AA","#DDCC77","#CC6677"),
        symbol = ~shape_1,
        symbols = c("triangle-up", "circle"),
        size = 20)
# add_surface() removed: it requires a numeric z matrix and is invalid on a
# 2-D scatter trace.

# Save the chart as an SVG file (800x600, 2x scale); the format is inferred
# from the file extension.
save_image(fig, file = "chart.svg", width = 800, height = 600, scale = 2)

#        inherit = F,
#        size = ~Sepal.Width, 
#        sizes = c(10, 100) * 10)

# NOTE(review): this chunk was pasted from an interactive session, complete
# with "+" continuation prompts, a stray extra ")" and a "%>%" stranded on
# its own line — it could not be source()'d. Rewritten as valid R below;
# the calls and arguments are unchanged.
# Scatter of Sepal.Length (size = Sepal.Width, symbol = shape_1), with a
# second text trace printing each Sepal.Width value above its point.
plot_ly(dt,
        x = 1:nrow(iris),
        y = ~Sepal.Length,
        type = "scatter",
        mode = 'markers',
        color = ~Species,
        colors = c("#4477AA","#DDCC77","#CC6677"),
        size = ~Sepal.Width, 
        symbol = ~shape_1,
        symbols = c("triangle-up", "circle"),
        inherit = FALSE,
        sizes = c(10, 100) * 10) %>%
  add_trace(type = "scatter",
            mode = "text",
            text = ~Sepal.Width,
            textposition = "top right",
            color = ~color_1,
            colors = c("black","green","blue"),
            textfont = list(size = 10))

# `factors()` is not an R function — the original line errored. To list the
# distinct condition labels (output shown below) use levels() on a factor.
levels(factor(merged_df$condition))

# "GFP d3"        "GFP d8"        "GFP+mCh d9/12" "LT d3"        
#  [5] "LT d8"         "LTtr d3"       "LTtr d8"       "mCh d3"       
#  [9] "mCh d8"        "sT d3"         "sT d8"         "sT+LT d3"     
# [13] "sT+LTtr d9/12" "untreated"

# Fix the display order of the conditions (untreated first, then by
# construct/day), and make donor a plain character vector so plotly maps it
# onto marker symbols.
merged_df$condition <- factor(merged_df$condition, levels=c("untreated","mCh d3","mCh d8","GFP+mCh d9/12","GFP d3","GFP d8","sT d3","sT d8","LT d3","LT d8","LTtr d3","LTtr d8","sT+LT d3","sT+LTtr d9/12"))
merged_df$donor <- as.character(merged_df$donor)
# Define a list of shapes for each group
shapes <- list("circle", "triangle-up")
# 3-D PCA scatter: colour = condition (Paired-12-derived palette in the same
# order as the factor levels above), symbol = donor.
plot_ly(merged_df, x=~PC1, y=~PC2, z=~PC3, type = "scatter3d", mode = "markers",   color=~condition, colors = c("grey","#a6cee3","#1f78b4","cyan","#b2df8a","#33a02c","#fb9a99","#e31a1c","#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#a14a1a"), symbol=~donor, symbols = c("triangle-up", "circle"))
# NOTE(review): the ggplot2 chain that followed the plot_ly() call was a
# syntax error — plot_ly output is not composed with `+`, and
# geom_point_3d()/scale_z_continuous() do not exist in ggplot2. Kept below,
# commented out, because the colour map and theme settings may be reused.
#  geom_point_3d() +
#  scale_x_continuous(name = labs$x) +
#  scale_y_continuous(name = labs$y) +
#  scale_z_continuous(name = labs$z) +
#  geom_point(size=8) + 
#  scale_color_manual(values = c("untreated" = "grey",
#                                "mCh d3"="#a6cee3",
#                                "mCh d8"="#1f78b4",
#                                "GFP+mCh d9/12"="cyan",
#                                "GFP d3"="#b2df8a",
#                                "GFP d8"="#33a02c",
#                                "sT d3"="#fb9a99",
#                                "sT d8"="#e31a1c",
#                                "LT d3"="#fdbf6f",
#                                "LT d8"="#ff7f00",
#                                "LTtr d3"="#cab2d6",
#                                "LTtr d8"="#6a3d9a",
#                                "sT+LT d3"="#ffff99",
#                                "sT+LTtr d9/12"="#a14a1a")) +
#                                theme(axis.text = element_text(face="bold",size = 21), axis.title = element_text(face="bold",size = 21)) + theme(legend.text = element_text(size = 20)) + theme(legend.title = element_text(size = 22)) + guides(color = guide_legend(override.aes = list(size = 10)), shape = guide_legend(override.aes = list(size = 10)), alpha = guide_legend(override.aes = list(size = 10)))

#axis.title = element_text(face="bold",size = 20)
#p + theme(axis.text.x = element_text(face="bold",size=14), axis.text.y = element_text(face="bold",size=14))
#+ coord_fixed()
#+ theme(
#    # Set the width to 6 inches
#    fig.width = 6,
#    # Set the height to 4 inches
#    fig.height = 4
#  )

# PCA of `rld` coloured by the "days" covariate, written to pca4.svg.
svg("pca4.svg")
plotPCA(rld, intgroup=c("days"))
dev.off()

scatter plot with categorical data using ggplot2

load packages

# Packages for the categorical-scatter example below: tidyverse (ggplot2 and
# friends), palmerpenguins (example data), ggbeeswarm/ggforce (beeswarm and
# sina layouts for categorical axes).
library(tidyverse)
library(palmerpenguins)
library(ggbeeswarm)
library(ggforce)

#remotes::install_github("allisonhorst/palmerpenguins")
# peek at penguins data
#glimpse(penguins)

# Build a toy dataset: three categories ("A", "B", "C") of 100 points each,
# with standard-normal y values.
x <- rep(c("A", "B", "C"), each = 100)
y <- rnorm(300)

# Assemble the plotting data frame; rows 5, 10 and 15 are flagged as
# "Highlighted", everything else as "Normal".
df <- data.frame(
  x = x,
  y = y,
  color = ifelse(seq_len(300) %in% c(5, 10, 15), "Highlighted", "Normal")
)

# Beeswarm plot: one swarm per category, highlighted points drawn in red on
# a classic (no-grid) theme.
ggplot(df, aes(x = x, y = y, color = color)) +
  scale_color_manual(values = c("Normal" = "black", "Highlighted" = "red")) +
  geom_beeswarm(cex = 1.5) +
  theme_classic()

 #In this script, we use ggplot2 to create a scatter plot with categorical data and highlight some points. We start by creating an example dataset with a categorical variable x and a continuous variable y. We then create a data frame df with x, y, and color columns. The color column is set to "Highlighted" for the points we want to highlight, and "Normal" for the rest.

 #We then use ggplot2 to plot the data. We set x, y, and color to the corresponding columns in df using the aes() function. We use geom_beeswarm() to plot the points, and set the cex argument to control the spacing between points in each swarm. We use scale_color_manual() to set the colors for the "Normal" and "Highlighted" categories. Finally, we use theme_classic() to set the theme of the plot to a classic theme.

Install top 24 Python Libraries for Data Science with pip

There are many packages that can be installed using pip, the Python package manager. Some of the commonly used packages that can be installed with pip include:

  • TensorFlow: an open-source machine learning framework developed by Google for building and training machine learning models.
  • NumPy: a library for scientific computing with Python, providing efficient numerical operations for multi-dimensional arrays and matrices.
  • SciPy: a collection of libraries for scientific and technical computing with Python, including tools for optimization, linear algebra, signal processing, and more.
  • Pandas: a library for data manipulation and analysis in Python, providing tools for reading, writing, and manipulating tabular data.
  • Matplotlib: a library for creating visualizations and plots in Python, providing tools for creating various types of charts and graphs.
  • Keras: an open-source neural network library written in Python, designed to enable fast experimentation with deep neural networks.
  • SciKit-Learn: a library for machine learning in Python, providing tools for data preprocessing, feature extraction, supervised and unsupervised learning, and model evaluation.
  • PyTorch: an open-source machine learning framework developed by Facebook for building and training machine learning models.
  • Scrapy: a framework for web scraping and crawling in Python, providing tools for extracting data from websites and APIs.
  • BeautifulSoup: a library for parsing HTML and XML documents in Python, providing tools for extracting and manipulating data from web pages.
  • LightGBM: a gradient boosting framework that uses tree-based learning algorithms, designed to be efficient and scalable for large-scale machine learning tasks.
  • ELI5: a library for explaining and visualizing machine learning models in Python, providing tools for feature importances, model weights, and more.
  • Theano: a library for numerical computation in Python, designed to allow developers to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays.
  • NuPIC: a machine intelligence platform for building intelligent applications, based on the principles of neuroscience and machine learning.
  • Ramp: a library for building predictive models in Python, designed to simplify the process of building and evaluating machine learning models.
  • Pipenv: a tool for managing Python dependencies and virtual environments, designed to make it easier to manage packages and versions.
  • Bob: a toolbox for machine learning in Python, providing tools for face recognition, speaker recognition, and more.
  • PyBrain: a library for building and training neural networks in Python, designed to be modular and flexible for a wide range of tasks.
  • Caffe2: a deep learning framework developed by Facebook for building and training neural networks, designed to be efficient and scalable for large-scale tasks.
  • Chainer: a Python-based deep learning framework for building and training neural networks, designed to be flexible and scalable for a wide range of tasks.
  • Django is a high-level web framework that provides a structured and scalable way to build web applications in Python. It includes built-in tools for handling tasks such as authentication, URL routing, and database schema migrations.
  • Flask is a lightweight web framework that provides flexibility and simplicity to developers. It allows you to build web applications and APIs in Python with minimal boilerplate code and provides support for extensions to add functionality.
  • Bottle is another lightweight web framework that allows you to build web applications and APIs in Python. It is designed to be simple and easy to use, with minimal dependencies.
  • Requests is a package that provides a simple and easy-to-use interface for sending HTTP requests in Python. It supports various HTTP methods such as GET, POST, PUT, DELETE, etc. and also allows you to customize headers, cookies, and other request parameters.

For example, to install the NumPy package, you can use the following command:

pip install numpy
pip install plotly==4.10.0

You can use the following command to check which packages are currently installed in your Python environment using pip:

pip list

This command will display a list of all the packages that have been installed using pip, along with their version numbers. If you want to check the version number of a specific package, you can use the following command:

pip show plotly
pip list | grep plotly

How to correct indent errors in Python?

There are several tools and editors that can help you correct indent errors in Python. Here are a few:

  • Integrated Development Environments (IDEs): Popular IDEs such as PyCharm, Visual Studio Code, and Spyder have built-in features to help you identify and correct indentation errors.

  • Text Editors: Text editors such as Sublime Text and Notepad++ can help you identify and correct indentation errors as well. However, you may need to install third-party plugins or packages to get this functionality.

  • Linters: Linters are tools that can check your code for syntax and formatting errors. Some popular Python linters include Flake8, Pylint, and Pyflakes. These tools can help you identify indentation errors as well as other common mistakes.

  • Online Tools: There are also several online tools available that can help you check your Python code for indentation errors. For example, you can use the Python Indentation Validator at https://python-indentation-validator.com/ to check your code.

Remember that in Python, indentation is important and errors can lead to unexpected results or even cause your code to fail. It’s important to always pay attention to indentation when writing Python code.

Add two legends for two dimensions in scatter_3d() in plotly 4.10

add two legends for two dimensions in plotly 4.10 for the function scatter_3d(), i.e. one legend for the color, one for the symbol

In Plotly 4.10, you can add two legends for two dimensions in scatter_3d() by creating a custom trace for each legend and adding them to the layout. Here’s an example code

# First attempt at a 3-D scatter with separate colour and symbol legends.
# NOTE(review): this version is the one later marked as broken — on current
# plotly versions graph-object coordinate attributes are stored as tuples,
# so the `trace['x'] += [...]` population loop below raises; a corrected
# version appears later in this document.
import plotly.graph_objects as go

# Sample data
x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]
z = [1, 2, 3, 4, 5]
color = [1, 1, 2, 2, 3]
symbol = [1, 2, 1, 2, 1]

# Create custom trace for color legend
color_legend_trace = go.Scatter3d(
    x=[],
    y=[],
    z=[],
    mode='markers',
    name='Color',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=[],
        showscale=True
    )
)

# Create custom trace for symbol legend
# NOTE(review): integer symbol codes are presumably meant here; Scatter3d
# markers normally take string symbol names ("circle", "cross", ...) —
# confirm against the plotly version in use.
symbol_legend_trace = go.Scatter3d(
    x=[],
    y=[],
    z=[],
    mode='markers',
    name='Symbol',
    marker=dict(
        size=5,
        opacity=0.7,
        symbol=[],
        showscale=True
    )
)

# Populate custom traces with data
for i in range(len(x)):
    color_legend_trace['x'] += [x[i]]
    color_legend_trace['y'] += [y[i]]
    color_legend_trace['z'] += [z[i]]
    color_legend_trace['marker']['color'] += [color[i]]

    symbol_legend_trace['x'] += [x[i]]
    symbol_legend_trace['y'] += [y[i]]
    symbol_legend_trace['z'] += [z[i]]
    symbol_legend_trace['marker']['symbol'] += [symbol[i]]

# Create main scatter plot trace
scatter_trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=color,
        symbol=symbol,
        showscale=False
    )
)

# Create layout with two legends
layout = go.Layout(
    scene=dict(
        aspectmode='data',
        legend=dict(
            itemsizing='constant'
        )
    ),
    # Single horizontal legend strip placed just above the plot area.
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    ),
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

# Add custom traces and main scatter plot trace to figure
fig = go.Figure(data=[color_legend_trace, symbol_legend_trace, scatter_trace], layout=layout)

# Show figure
fig.show()

This code will create a 3D scatter plot with two legends: one for the color and one for the symbol. The color_legend_trace is a custom trace that shows the color legend, while the symbol_legend_trace is a custom trace that shows the symbol legend. The scatter_trace is the main scatter plot trace that contains the data points. In the layout, we configure the 3D scene and place a single horizontal legend strip above the plot area using the legend property of the layout. Finally, we create a Figure object with the custom traces and the main scatter plot trace, and show it.


DEBUGGED!

# Debugged version of the two-legend 3-D scatter: the data is passed
# directly to the trace constructors instead of being appended afterwards
# (appending failed because plotly stores trace attributes as tuples).
import plotly.graph_objects as go

# Sample data
x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]
z = [1, 2, 3, 4, 5]
color = ['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656']
symbol = ['circle', 'circle-open', 'square', 'square-open', 'diamond']

# Create custom trace for color legend
color_legend_trace = go.Scatter3d(
    x=[1, 2, 3, 4, 5],
    y=[5, 4, 3, 2, 1],
    z=[1, 2, 3, 4, 5],
    mode='markers',
    name='Color',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656'],
        showscale=True
    )
)

# Create custom trace for symbol legend
symbol_legend_trace = go.Scatter3d(
    x=[1, 2, 3, 4, 5],
    y=[5, 4, 3, 2, 1],
    z=[1, 2, 3, 4, 5],
    mode='markers',
    name='Symbol',
    marker=dict(
        size=5,
        opacity=0.7,
        symbol=['circle', 'circle-open', 'square', 'square-open', 'diamond'],
        showscale=True
    )
)

#Error due to the tuple type.
## Populate custom traces with data
#for i in range(len(x)):
#    color_legend_trace['x'] += [x[i]]
#    color_legend_trace['y'] += [y[i]]
#    color_legend_trace['z'] += [z[i]]
#    color_legend_trace['marker']['color'] += [color[i]]
#    
#    symbol_legend_trace['x'] += [x[i]]
#    symbol_legend_trace['y'] += [y[i]]
#    symbol_legend_trace['z'] += [z[i]]
#    symbol_legend_trace['marker']['symbol'] += [symbol[i]]

# Create main scatter plot trace
scatter_trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=5,
        opacity=0.7,
        colorscale='Viridis',
        color=color,
        symbol=symbol,
        showscale=False
    )
)

#scene = go.Scene(
#    xaxis=dict(title='X Axis'),
#    yaxis=dict(title='Y Axis'),
#    zaxis=dict(title='Z Axis'),
#    # Remove the 'legend' property from the Scene object
#)

# Create layout with two legends
layout = go.Layout(
    #scene=dict(
    #    aspectmode='data'
    #),
    # Single horizontal legend strip placed just above the plot area.
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    ),
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

# Add custom traces and main scatter plot trace to figure
fig = go.Figure(data=[color_legend_trace, symbol_legend_trace, scatter_trace], layout=layout)
#fig = go.Figure(data=[scatter], layout=go.Layout(
#    title='My 3D Scatter Plot',
#    legend=dict(
#        title='Legend Title',
#        font=dict(size=12)
#    )
#))

# Show figure
#fig.show()
# NOTE(review): write_image() requires the Python "kaleido" package at
# runtime — confirm it is installed.
fig.write_image("fig2.svg")

TODO: categorical legends!

Add two legends for two dimensions in plotly, i.e. one legend for the color, one for the symbol in plotly

# 2-D scatter with two visually separated legend blocks: one trace group per
# value of category_1 (colour) and one per value of category_2 (symbol),
# separated with `tracegroupgap` and labelled via paper-anchored annotations.
import plotly.graph_objects as go
import pandas as pd

# Create a sample dataframe
df = pd.DataFrame({'category_1': ['A', 'B', 'A', 'B', 'A', 'B'],
                   'category_2': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
                   'value_1': [1, 2, 3, 4, 5, 6],
                   'value_2': [7, 8, 9, 10, 11, 12]})

fig = go.Figure()

# Create scatter plot for category_1
# One trace per category_1 value; grouping them under the same legendgroup
# keeps their legend entries together.
for cat in df['category_1'].unique():
    fig.add_trace(go.Scatter(x=df.loc[df['category_1'] == cat, 'value_1'],
                             y=df.loc[df['category_1'] == cat, 'value_2'],
                             mode='markers',
                             name=cat,
                             legendgroup='category_1',
                             marker=dict(symbol='circle')))

# Create scatter plot for category_2
# These traces exist only to populate the second legend block:
# visible='legendonly' hides them from the plot until clicked.
symbols = {'X': 'star', 'Y': 'diamond'}
for cat in df['category_2'].unique():
    fig.add_trace(go.Scatter(x=df.loc[df['category_2'] == cat, 'value_1'],
                             y=df.loc[df['category_2'] == cat, 'value_2'],
                             mode='markers',
                             name=cat,
                             legendgroup='category_2',
                             marker=dict(symbol=symbols[cat], color='black'),
                             showlegend=True,
                             visible='legendonly'))

# Update legend properties
# Vertical gap (px) between the two legend groups.
fig.update_layout(legend=dict(tracegroupgap=30))

# Add custom legend group names using annotations
fig.update_layout(
    annotations=[
        go.layout.Annotation(
            text="Group 1",
            x=1.05,
            y=1,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=14, color='black'),
            bgcolor='rgba(255, 255, 255, 0.8)',
            bordercolor='black',
            borderwidth=1,
            xanchor='left',
            yanchor='top',
            valign='top'
        ),
        go.layout.Annotation(
            text="Group 2",
            x=1.05,
            y=0.65,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=14, color='black'),
            bgcolor='rgba(255, 255, 255, 0.8)',
            bordercolor='black',
            borderwidth=1,
            xanchor='left',
            yanchor='top',
            valign='top'
        )
    ]
)

fig.show()

Draw scatter_3d using plotly 4.10

Simplified but not working version.

# Simplified 3-D PCA scatter built with plotly express plus dummy traces for
# the two legend blocks (condition colours, donor symbols).
import plotly.graph_objects as go
import plotly.express as px  # was missing: px.scatter_3d / px.colors are used below
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# Provided dataframe
df = pd.DataFrame({
    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
})

# Create PCA plot with 3D scatter
fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3', color='condition', symbol='donor',
                    title='PCA with 3 dimensions')

# Custom legend for condition: one data-less trace per condition so each
# condition gets its own legend entry in the matching colour.
conditions = df['condition'].unique()
colors = px.colors.qualitative.Plotly[:len(conditions)]

for i, cond in enumerate(conditions):
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               marker=dict(size=6, color=colors[i]),
                               showlegend=True, name=cond))

# Custom legend for donor: black markers distinguished only by symbol.
donors = df['donor'].unique()
symbols = ['circle', 'diamond']

for i, donor in enumerate(donors):
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               marker=dict(size=6, color='black', symbol=symbols[i]),
                               showlegend=True, name=donor))

# Annotations for the legend blocks (paper coordinates, right of the plot)
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ]
)

fig.show()

Draw scatter_3d using plotly 4.10 (SUCCESSFUL)

# Working version: 3-D PCA scatter with a colour legend (condition) and a
# symbol legend (donor), built entirely from graph-object traces.
import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# Read in data as a pandas dataframe
#df = pd.DataFrame({
#    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
#    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
#    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
#    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
#    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_28PCs.csv', index_col=0, header=0)

# Fit PCA model to reduce data dimensions to 3
# NOTE(review): iloc[:, :-3] assumes the LAST THREE columns of the CSV are
# non-numeric metadata (e.g. group/condition/donor) — confirm the file layout.
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])

# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]

# Create PCA plot with 3D scatter
fig = go.Figure()

#['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
# NOTE(review): only four conditions are mapped here; rows with other
# condition values are silently dropped by the mask below — confirm intent.
condition_color_map = {'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
donor_symbol_map = {'DI': 'circle', 'DII': 'cross'}

# One trace per (donor, condition) pair; the legend entry is emitted only
# for the DI donor so each condition appears once in the legend.
for donor, donor_symbol in donor_symbol_map.items():
    for condition, condition_color in condition_color_map.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                   mode='markers',
                                   name=f'{condition}' if donor == 'DI' else None,
                                   legendgroup=f'{condition}',
                                   showlegend=True if donor == 'DI' else False,
                                   marker=dict(size=10, opacity=0.8, color=condition_color, symbol=donor_symbol)))

# Data-less black traces providing the donor (symbol) legend entries.
for donor, donor_symbol in donor_symbol_map.items():
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                               mode='markers',
                               name=donor,
                               showlegend=True,
                               marker=dict(size=10, opacity=1, color='black', symbol=donor_symbol),
                               hoverinfo='none'))

# Annotations for the legend blocks
# NOTE(review): the axis-title percentages (36/17/15%) are hard-coded; they
# come from an earlier summary(pc) run, not from pca.explained_variance_ratio_.
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
             text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
             text='Donor', font=dict(size=15))
    ],
    scene=dict(
        aspectmode='cube',
        xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 36% v.'),
        yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 17% v.'),
        zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 15% variance'),
        bgcolor='white'
    ),
    margin=dict(l=5, r=5, b=5, t=5)  # Adjust the margins to prevent clipping of axis titles
)

#fig.show()
fig.write_image("fig1.svg")

DAMIAN: A Comprehensive Online Platform for the Detection and Analysis of Viral and Microbial Infectious Agents using NGS Data

Abstract:

Next-generation sequencing (NGS) has revolutionized the field of genomics, allowing for rapid detection and analysis of infectious agents. However, the complexity of NGS data and the lack of user-friendly tools have limited the widespread adoption of these techniques. Here, we present DAMIAN (Detection & Analysis of viral and Microbial Infectious Agents by NGS), a comprehensive online platform designed to facilitate the identification and characterization of viral and microbial pathogens from NGS data. DAMIAN streamlines the process of data input, analysis, and visualization, making it accessible to researchers with varying levels of computational expertise.

Introduction:

1.1 Background and motivation

Discuss the challenges faced by researchers in the detection and analysis of viral and microbial infectious agents using NGS data.
Explain the need for a user-friendly, comprehensive online platform to address these challenges.

1.2 Overview of DAMIAN

Introduce DAMIAN as a solution to these challenges.
Briefly describe the features and functionality of DAMIAN.

Methods:

2.1 Data input and preprocessing

Explain how users can upload their NGS data to the platform.
Describe the preprocessing steps performed by DAMIAN, including quality control and adapter trimming.

2.2 Identification of infectious agents

Detail the algorithms and databases employed by DAMIAN to identify viral and microbial pathogens in the input data.

2.3 Data analysis and visualization

Discuss the various analysis and visualization tools available within DAMIAN, including phylogenetic analysis, genome assembly, and functional annotation.

Results:

3.1 Case studies

Present case studies showcasing the effectiveness and utility of DAMIAN in identifying and analyzing infectious agents from NGS data.

3.2 Comparison with existing tools

Compare the performance of DAMIAN with other available tools and platforms.

Discussion:

4.1 Advantages of DAMIAN

Highlight the benefits of using DAMIAN, such as ease of use, comprehensiveness, and accessibility.

4.2 Future developments and improvements

Discuss potential future enhancements to the platform, including the incorporation of new algorithms, databases, and analysis tools.

Conclusion:

Summarize the main points of the manuscript and reiterate the value of DAMIAN as a comprehensive online platform for the detection and analysis of viral and microbial infectious agents using NGS data.

A simple machine learning example using Python using scikit-learn

A simple machine learning example using Python and the scikit-learn library for the classification of the Iris dataset. The Iris dataset is a classic dataset containing measurements of iris flowers and their species. We will use a Decision Tree classifier to classify the species based on the measurements.

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the classic Iris measurements (features X) and species labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a decision tree on the training split.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict the held-out split and measure the fraction classified correctly.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Accuracy of the Decision Tree classifier: {accuracy:.2f}")

This code will output the accuracy of the Decision Tree classifier on the Iris dataset. We can experiment with other classifiers and their parameters to see how the results change.