Phyloseq + MicrobiotaProcess + PICRUSt2

gene_x 0 like s 1339 view s

Tags: R, metagenomics, 16S, tool, pipeline

Phyloseq.Rmd

author: ""
date: '`r format(Sys.time(), "%d %m %Y")`'
header-includes:
  - \usepackage{color, fancyvrb}
output:
  rmdformats::readthedown:
    highlight: kate
    number_sections : yes    
  pdf_document: 
    toc: yes
    toc_depth: 2
    number_sections : yes
---

```{r, echo=FALSE, warning=FALSE}
## Global options
# TODO: reproduce the html with the additional figure/SVN-files for editing.
# IMPORTANT NOTE: needs before "mkdir figures"
#rmarkdown::render('Phyloseq.Rmd',output_file='Phyloseq.html')
```

```{r load-packages, include=FALSE}
library(knitr)
library(rmdformats)
library(readxl)
library(dplyr)
library(kableExtra)

options(max.print="75")
knitr::opts_chunk$set(fig.width=8, 
                      fig.height=6, 
                      eval=TRUE, 
                      cache=TRUE,
                      echo=TRUE,
                      prompt=FALSE,
                      tidy=TRUE,
                      comment=NA,
                      message=FALSE,
                      warning=FALSE)
opts_knit$set(width=85)
# Phyloseq R library
#* Phyloseq web site : https://joey711.github.io/phyloseq/index.html
#* See in particular tutorials for
#    - importing data: https://joey711.github.io/phyloseq/import-data.html
#    - heat maps: https://joey711.github.io/phyloseq/plot_heatmap-examples.html
```

# Data

Import raw data and assign sample key:

```{r, echo=TRUE, warning=FALSE}
#extend map_corrected.txt with Diet and Flora
#setwd("~/DATA/Data_Laura_16S_2/core_diversity_e4753")
map_corrected <- read.csv("map_corrected.txt", sep="\t", row.names=1)
knitr::kable(map_corrected) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
```

# Prerequisites to be installed

* R : https://pbil.univ-lyon1.fr/CRAN/
* R studio : https://www.rstudio.com/products/rstudio/download/#download

```R
install.packages("dplyr")     # To manipulate dataframes
install.packages("readxl")    # To read Excel files into R
install.packages("ggplot2")   # for high quality graphics
install.packages("heatmaply")
source("https://bioconductor.org/biocLite.R")
biocLite("phyloseq")
```

```{r libraries, echo=TRUE, message=FALSE}
library("readxl") # necessary to import the data from Excel file
library("ggplot2") # graphics
library("picante")
library("microbiome") # data analysis and visualisation
library("phyloseq") # also the basis of data object. Data analysis and visualisation
library("ggpubr") # publication quality figures, based on ggplot2
library("dplyr") # data handling, filter and reformat data frames
library("RColorBrewer") # nice color options
library("heatmaply")
library(vegan)
library(gplots)
```

# Read the data and create phyloseq objects

Three tables are needed

* OTU
* Taxonomy
* Samples

```{r, echo=TRUE, warning=FALSE}
    #Change your working directory to where the files are located
    ps.ng.tax <- import_biom("./table_even42369.biom", "./clustering/rep_set.tre")
    sample <- read.csv("./map_corrected.txt", sep="\t", row.names=1)
    SAM = sample_data(sample, errorIfNULL = T)
    rownames(SAM) <-
    c("1","2","3","5","6","7","8","9","10","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","46","47","48","49","50","51","52","53","55")
    ps.ng.tax <- merge_phyloseq(ps.ng.tax, SAM)
    print(ps.ng.tax) 
    colnames(tax_table(ps.ng.tax)) <- c("Domain","Phylum","Class","Order","Family","Genus","Species")
    saveRDS(ps.ng.tax, "./ps.ng.tax.rds")
```

Visualize data
```{r, echo=TRUE, warning=FALSE}
  sample_names(ps.ng.tax)
  rank_names(ps.ng.tax)
  sample_variables(ps.ng.tax)
```

Normalize number of reads in each sample using median sequencing depth.
```{r, echo=TRUE, warning=FALSE}
# RAREFACTION
#set.seed(9242)  # This will help in reproducing the filtering and nomalisation. 
#ps.ng.tax <- rarefy_even_depth(ps.ng.tax, sample.size = 42369)
#total <- 42369

# NORMALIZE number of reads in each sample using median sequencing depth.
total = median(sample_sums(ps.ng.tax))
#> total
#[1] 42369
standf = function(x, t=total) round(t * (x / sum(x)))
ps.ng.tax = transform_sample_counts(ps.ng.tax, standf)
ps.ng.tax_rel <- microbiome::transform(ps.ng.tax, "compositional")

saveRDS(ps.ng.tax, "./ps.ng.tax.rds")
hmp.meta <- meta(ps.ng.tax)
hmp.meta$sam_name <- rownames(hmp.meta)
```

# Heatmaps

```{r, echo=TRUE, warning=FALSE}
#MOVE_FROM_ABOVE: The number of reads used for normalization is **`r sprintf("%.0f", total)`**. 
#A basic heatmap using the default parameters.
#  plot_heatmap(ps.ng.tax, method = "NMDS", distance = "bray")
#NOTE that giving the correct OTU numbers in the text (1%, 0.5%, ...)!!!
```

We consider the most abundant OTUs for heatmaps. For example one can only take OTUs that represent at least 1% of reads in at least one sample. Remember we normalized all the sampples to median number of reads (total).  We are left with only 168 OTUS which makes the reading much more easy.
```{r, echo=TRUE, warning=FALSE}

# Custom function to plot a heatmap with the specified sample order
#plot_heatmap_custom <- function(ps, sample_order, method = "NMDS", distance = "bray") {
ps.ng.tax_abund <- phyloseq::filter_taxa(ps.ng.tax, function(x) sum(x > total*0.01) > 0, TRUE)
kable(otu_table(ps.ng.tax_abund)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
# Calculate the relative abundance for each sample
ps.ng.tax_abund_rel <- transform_sample_counts(ps.ng.tax_abund, function(x) x / sum(x))

datamat_ = as.data.frame(otu_table(ps.ng.tax_abund))
#datamat <- datamat_[c("1","2","5","6","7",  "8","9","10","12","13","14",    "15","16","17","18","19","20",  "21","22","23","24","25","26","27","28",    "29","30","31","32",  "33","34","35","36","37","38","39","51",    "40","41","42","43","44","46",  "47","48","49","50","52","53","55")]
datamat <- datamat_[c("8","9","10","12","13","14",    "21","22","23","24","25","26","27","28",    "33","34","35","36","37","38","39","51",    "47","48","49","50","52","53","55")]
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
sampleCols <- rep('GREY',ncol(datamat))
#names(sampleCols) <- c("Group1", "Group1", "Group1", "Group1", "Group1",   "Group2", "Group2", "Group2", "Group2", "Group2","Group2",    "Group3", "Group3", "Group3", "Group3",   "Group1", "Group1", "Group1", "Group1", "Group1",   "Group1", "Group1", "Group1", "Group1", "Group1",  "Group1", "Group1", "Group1", "Group1", "Group1",  "Group1", "Group1", "Group1", "Group1", "Group1",  "Group1", "Group1", "Group1", "Group1", "Group1",    "Group1", "Group1", "Group1", "Group1", "Group1",  "Group1", "Group1", "Group1", "Group1", "Group1")

#sampleCols[colnames(datamat)=='1'] <- '#a6cee3'
#sampleCols[colnames(datamat)=='2'] <- '#a6cee3'
#sampleCols[colnames(datamat)=='5'] <- '#a6cee3'
#sampleCols[colnames(datamat)=='6'] <- '#a6cee3'
#sampleCols[colnames(datamat)=='7'] <- '#a6cee3'

sampleCols[colnames(datamat)=='8'] <- '#1f78b4'
sampleCols[colnames(datamat)=='9'] <- '#1f78b4'
sampleCols[colnames(datamat)=='10'] <- '#1f78b4'
sampleCols[colnames(datamat)=='12'] <- '#1f78b4'
sampleCols[colnames(datamat)=='13'] <- '#1f78b4'
sampleCols[colnames(datamat)=='14'] <- '#1f78b4'

#sampleCols[colnames(datamat)=='15'] <- '#b2df8a'
#sampleCols[colnames(datamat)=='16'] <- '#b2df8a'
#sampleCols[colnames(datamat)=='17'] <- '#b2df8a'
#sampleCols[colnames(datamat)=='18'] <- '#b2df8a'
#sampleCols[colnames(datamat)=='19'] <- '#b2df8a'
#sampleCols[colnames(datamat)=='20'] <- '#b2df8a'

sampleCols[colnames(datamat)=='21'] <- '#33a02c'
sampleCols[colnames(datamat)=='22'] <- '#33a02c'
sampleCols[colnames(datamat)=='23'] <- '#33a02c'
sampleCols[colnames(datamat)=='24'] <- '#33a02c'
sampleCols[colnames(datamat)=='25'] <- '#33a02c'
sampleCols[colnames(datamat)=='26'] <- '#33a02c'
sampleCols[colnames(datamat)=='27'] <- '#33a02c'
sampleCols[colnames(datamat)=='28'] <- '#33a02c'

#sampleCols[colnames(datamat)=='29'] <- '#fb9a99'
#sampleCols[colnames(datamat)=='30'] <- '#fb9a99'
#sampleCols[colnames(datamat)=='31'] <- '#fb9a99'
#sampleCols[colnames(datamat)=='32'] <- '#fb9a99'

sampleCols[colnames(datamat)=='33'] <- '#e31a1c'
sampleCols[colnames(datamat)=='34'] <- '#e31a1c'
sampleCols[colnames(datamat)=='35'] <- '#e31a1c'
sampleCols[colnames(datamat)=='36'] <- '#e31a1c'
sampleCols[colnames(datamat)=='37'] <- '#e31a1c'
sampleCols[colnames(datamat)=='38'] <- '#e31a1c'
sampleCols[colnames(datamat)=='39'] <- '#e31a1c'
sampleCols[colnames(datamat)=='51'] <- '#e31a1c'

#sampleCols[colnames(datamat)=='40'] <- '#cab2d6'
#sampleCols[colnames(datamat)=='41'] <- '#cab2d6'
#sampleCols[colnames(datamat)=='42'] <- '#cab2d6'
#sampleCols[colnames(datamat)=='43'] <- '#cab2d6'
#sampleCols[colnames(datamat)=='44'] <- '#cab2d6'
#sampleCols[colnames(datamat)=='46'] <- '#cab2d6'

sampleCols[colnames(datamat)=='47'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='48'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='49'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='50'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='52'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='53'] <- '#6a3d9a'
sampleCols[colnames(datamat)=='55'] <- '#6a3d9a'
#bluered(75)
#color_pattern <- colorRampPalette(c("blue", "white", "red"))(100)
library(RColorBrewer)
custom_palette <- colorRampPalette(brewer.pal(9, "Blues"))
heatmap_colors <- custom_palette(100)
#colors <- heatmap_color_default(100)
png("figures/heatmap.png", width=1200, height=2400)
#par(mar=c(2, 2, 2, 2))  , lwid=1    lhei=c(0.7, 10)) # Adjust height of color keys   keysize=0.3, 
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=heatmap_colors, cexRow=1.2, cexCol=1.5,
            RowSideColors = mycol, ColSideColors = sampleCols, srtCol=15, labRow=row.names(datamat), key=TRUE, margins=c(10, 15), lhei=c(0.7, 15), lwid=c(1,8))
dev.off()
```
```{r, echo=TRUE, warning=FALSE, fig.cap="Heatmap", out.width = '100%', fig.align= "center"}
knitr::include_graphics("./figures/heatmap.png")
```

```{r, echo=FALSE, warning=FALSE}
  #It is possible to use different distances and different multivaraite methods. Many different built-in distances can be used.
  #dist_methods <- unlist(distanceMethodList)
  #print(dist_methods)
```

\pagebreak

# Taxonomic summary

## Bar plots in phylum level

```{r, echo=FALSE, warning=FALSE}
#Make the bargraph nicer by removing OTUs boundaries. This is done by adding ggplot2 modifier.
# 1: uniform color. Color is for the border, fill is for the inside
#ggplot(mtcars, aes(x=as.factor(cyl) )) +
#  geom_bar(color="blue", fill=rgb(0.1,0.4,0.5,0.7) )
# 2: Using Hue
#ggplot(mtcars, aes(x=as.factor(cyl), fill=as.factor(cyl) )) + 
#  geom_bar( ) +
#  scale_fill_hue(c = 40) +
#  theme(legend.position="none")
# 3: Using RColorBrewer
#ggplot(mtcars, aes(x=as.factor(cyl), fill=as.factor(cyl) )) + 
#  geom_bar( ) +
#  scale_fill_brewer(palette = "Set1") +
#  theme(legend.position="none")
# 4: Using greyscale:
#ggplot(mtcars, aes(x=as.factor(cyl), fill=as.factor(cyl) )) + 
#  geom_bar( ) +
#  scale_fill_grey(start = 0.25, end = 0.75) +
#  theme(legend.position="none")
# 5: Set manualy
#ggplot(mtcars, aes(x=as.factor(cyl), fill=as.factor(cyl) )) +  
#  geom_bar( ) +
#  scale_fill_manual(values = c("red", "green", "blue") ) +
#  theme(legend.position="none")
#NOT SUCCESSFUL!
#allGroupsColors<- c(
#  "grey0", "grey50", "dodgerblu", "deepskyblue",
#  "red", "darkred", "green", "green4")
#  plot_bar(ps.ng.tax_rel, fill="Phylum") + 
#  geom_bar(stat="identity", position="stack") + scale_color_manual(values = allGroupsColors) #, fill=Phylum   + scale_fill_brewer(palette = "Set1")
  # ##### Keep only the most abundant phyla and 
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Actinobacteria")) #1.57
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Bacteroidetes"))  #27.27436
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Cyanobacteria"))  #0.02244249
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Epsilonbacteraeota"))  #0.01309145
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Euryarchaeota"))  #0.1210024
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Firmicutes"))     #32.50589
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Lentisphaerae"))  #0.0001870208
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Patescibacteria")) #0.008789976
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Planctomycetes")) #0.01365252
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Proteobacteria")) #6.769216
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Synergistetes"))  #0.005049561 
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Tenericutes"))    #0.0005610623
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Verrucomicrobia"))  #2.076304
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c(NA))  #sum(otu_table(ps.ng.tax_most)) = 2.619413
```
```{r, echo=TRUE, warning=FALSE}
  library(ggplot2)
  geom.text.size = 6
  theme.size = 8 #(14/5) * geom.text.size
  #ps.ng.tax_most <- subset_taxa(ps.ng.tax_rel, Phylum %in% c("D_1__Actinobacteria", "D_1__Bacteroidetes", "D_1__Firmicutes", "D_1__Proteobacteria", "D_1__Verrucomicrobia", NA))
  ps.ng.tax_most = phyloseq::filter_taxa(ps.ng.tax_rel, function(x) mean(x) > 0.001, TRUE)
  #CONSOLE(OPTIONAL): for sampleid in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73; do
  #echo "otu_table(ps.ng.tax_most)[,${sampleid}]=otu_table(ps.ng.tax_most)[,${sampleid}]/sum(otu_table(ps.ng.tax_most)[,${sampleid}])" done
  #OR
  ps.ng.tax_most_ = transform_sample_counts(ps.ng.tax_most, function(x) x / sum(x))
```

```{r, echo=FALSE, warning=FALSE}
  ##--Creating 100% stacked bar plots with less abundant taxa in a sub-category #901--
  ##https://github.com/joey711/phyloseq/issues/901
  ##ps.ng.tax_most_df <- psmelt(ps.ng.tax_most_)  #5986x19
  #glom <- tax_glom(ps.ng.tax_most_, taxrank = 'Phylum')
  #tax_table(glom) # should list # taxa as # phyla
  #data <- psmelt(glom) # create dataframe from phyloseq object
  #data$Phylum <- as.character(data$Phylum) #convert to character
  ##simple way to rename phyla with < 1% abundance
  #data$Phylum[data$Abundance < 0.001] <- "< 0.1% abund."
  #
  #library(plyr)
  #medians <- ddply(data, ~Phylum, function(x) c(median=median(x$Abundance)))
  #remainder <- medians[medians$median <= 0.001,]$Phylum
  #data[data$Phylum %in% remainder,]$Phylum <- "Phyla < 0.1% abund."
  #data$Phylum[data$Abundance < 0.001] <- "Phyla < 0.1% abund."
  ##--> data are not used!
  #
  ##in class level
  #glom <- tax_glom(ps.ng.tax_most_, taxrank = 'Class')
  #tax_table(glom) # should list # taxa as # phyla
  #data <- psmelt(glom) # create dataframe from phyloseq object
  #data$Class <- as.character(data$Class) #convert to character
  #
  ##simple way to rename phyla with < 1% abundance
  #data$Class[data$Abundance < 0.001] <- "< 0.1% abund."
  #Count = length(unique(data$Class))
  # 
  ##unique(data$Class)
  ##data$Class <- factor(data$Class, levels = c("Bacilli", "Bacteroidia", "Verrucomicrobiae", "Clostridia", "Gammaproteobacteria", "Alphaproteobacteria", "Actinobacteria", "Negativicutes", "Erysipelotrichia", "Methanobacteria", "< 0.1% abund."))
  ##------- Creating 100% stacked bar plots END --------

  library(stringr)
#FITTING1: 
# tax_table(ps.ng.tax_most_)[1,"Domain"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Domain"], "__")[[1]][2]
# ... ...
# tax_table(ps.ng.tax_most_)[167,"Species"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Species"], "__")[[1]][2]
#ps.ng.tax_most_
#in total [ 89 taxa and 55 samples ]
#otu_table()   OTU Table:         [ 166 taxa and 54 samples ]
#otu_table()   OTU Table:         [ 168 taxa and 50 samples ]
#for id in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166; do   
#echo "tax_table(ps.ng.tax_most_)[${id},\"Domain\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Domain\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Phylum\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Phylum\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Class\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Class\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Order\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Order\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Family\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Family\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Genus\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Genus\"], \"__\")[[1]][2]"
#echo "tax_table(ps.ng.tax_most_)[${id},\"Species\"] <- str_split(tax_table(ps.ng.tax_most_)[${id},\"Species\"], \"__\")[[1]][2]"
#done

tax_table(ps.ng.tax_most_)[1,"Domain"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Domain"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Phylum"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Phylum"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Class"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Class"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Order"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Order"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Family"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Family"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Genus"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Genus"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[1,"Species"] <- str_split(tax_table(ps.ng.tax_most_)[1,"Species"], "__")[[1]][2]
#... ...
tax_table(ps.ng.tax_most_)[167,"Domain"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Domain"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Phylum"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Phylum"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Class"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Class"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Order"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Order"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Family"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Family"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Genus"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Genus"], "__")[[1]][2]
tax_table(ps.ng.tax_most_)[167,"Species"] <- str_split(tax_table(ps.ng.tax_most_)[167,"Species"], "__")[[1]][2]
```

```{r, echo=TRUE, warning=FALSE}
  #aes(color="Phylum", fill="Phylum") --> aes()
  #ggplot(data=data, aes(x=Sample, y=Abundance, fill=Phylum)) 
  plot_bar(ps.ng.tax_most_, fill="Phylum") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=2))                                  #6 instead of theme.size
```
```{r, echo=FALSE, warning=FALSE}
  #png("abc.png")
  #knitr::include_graphics("./Phyloseq_files/figure-html/unnamed-chunk-7-1.png")
  #dev.off()
```

\pagebreak

Regroup together pre vs post stroke samples and normalize number of reads in each group using median sequencing depth.

```{r, echo=TRUE, warning=FALSE}
  ps.ng.tax_most_pre_post_stroke <- merge_samples(ps.ng.tax_most_, "pre_post_stroke")
  ps.ng.tax_most_pre_post_stroke_ = transform_sample_counts(ps.ng.tax_most_pre_post_stroke, function(x) x / sum(x))
  #plot_bar(ps.ng.tax_most_SampleType_, fill = "Phylum") + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack")
  plot_bar(ps.ng.tax_most_pre_post_stroke_, fill="Phylum") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = theme.size, colour="black"))
```

\pagebreak

Use color according to phylum. Do separate panels Stroke and Sex_age.
```{r, echo=TRUE, warning=FALSE}  
  ps.ng.tax_most_copied <- data.table::copy(ps.ng.tax_most_)
  #FITTING6: regulate the bar height if it has replicates: 5+6+6+8+4+8+6+7=25+25=50
  otu_table(ps.ng.tax_most_)[,c("1")] <- otu_table(ps.ng.tax_most_)[,c("1")]/5
  otu_table(ps.ng.tax_most_)[,c("2")] <- otu_table(ps.ng.tax_most_)[,c("2")]/5
  otu_table(ps.ng.tax_most_)[,c("5")] <- otu_table(ps.ng.tax_most_)[,c("5")]/5
  otu_table(ps.ng.tax_most_)[,c("6")] <- otu_table(ps.ng.tax_most_)[,c("6")]/5
  otu_table(ps.ng.tax_most_)[,c("7")] <- otu_table(ps.ng.tax_most_)[,c("7")]/5

  otu_table(ps.ng.tax_most_)[,c("8")] <- otu_table(ps.ng.tax_most_)[,c("8")]/6
  otu_table(ps.ng.tax_most_)[,c("9")] <- otu_table(ps.ng.tax_most_)[,c("9")]/6
  otu_table(ps.ng.tax_most_)[,c("10")] <- otu_table(ps.ng.tax_most_)[,c("10")]/6
  otu_table(ps.ng.tax_most_)[,c("12")] <- otu_table(ps.ng.tax_most_)[,c("12")]/6
  otu_table(ps.ng.tax_most_)[,c("13")] <- otu_table(ps.ng.tax_most_)[,c("13")]/6
  otu_table(ps.ng.tax_most_)[,c("14")] <- otu_table(ps.ng.tax_most_)[,c("14")]/6

  otu_table(ps.ng.tax_most_)[,c("15")] <- otu_table(ps.ng.tax_most_)[,c("15")]/6
  otu_table(ps.ng.tax_most_)[,c("16")] <- otu_table(ps.ng.tax_most_)[,c("16")]/6
  otu_table(ps.ng.tax_most_)[,c("17")] <- otu_table(ps.ng.tax_most_)[,c("17")]/6
  otu_table(ps.ng.tax_most_)[,c("18")] <- otu_table(ps.ng.tax_most_)[,c("18")]/6
  otu_table(ps.ng.tax_most_)[,c("19")] <- otu_table(ps.ng.tax_most_)[,c("19")]/6
  otu_table(ps.ng.tax_most_)[,c("20")] <- otu_table(ps.ng.tax_most_)[,c("20")]/6

  otu_table(ps.ng.tax_most_)[,c("21")] <- otu_table(ps.ng.tax_most_)[,c("21")]/8
  otu_table(ps.ng.tax_most_)[,c("22")] <- otu_table(ps.ng.tax_most_)[,c("22")]/8
  otu_table(ps.ng.tax_most_)[,c("23")] <- otu_table(ps.ng.tax_most_)[,c("23")]/8
  otu_table(ps.ng.tax_most_)[,c("24")] <- otu_table(ps.ng.tax_most_)[,c("24")]/8
  otu_table(ps.ng.tax_most_)[,c("25")] <- otu_table(ps.ng.tax_most_)[,c("25")]/8
  otu_table(ps.ng.tax_most_)[,c("26")] <- otu_table(ps.ng.tax_most_)[,c("26")]/8
  otu_table(ps.ng.tax_most_)[,c("27")] <- otu_table(ps.ng.tax_most_)[,c("27")]/8
  otu_table(ps.ng.tax_most_)[,c("28")] <- otu_table(ps.ng.tax_most_)[,c("28")]/8

  otu_table(ps.ng.tax_most_)[,c("29")] <- otu_table(ps.ng.tax_most_)[,c("29")]/4
  otu_table(ps.ng.tax_most_)[,c("30")] <- otu_table(ps.ng.tax_most_)[,c("30")]/4
  otu_table(ps.ng.tax_most_)[,c("31")] <- otu_table(ps.ng.tax_most_)[,c("31")]/4
  otu_table(ps.ng.tax_most_)[,c("32")] <- otu_table(ps.ng.tax_most_)[,c("32")]/4

  otu_table(ps.ng.tax_most_)[,c("33")] <- otu_table(ps.ng.tax_most_)[,c("33")]/8
  otu_table(ps.ng.tax_most_)[,c("34")] <- otu_table(ps.ng.tax_most_)[,c("34")]/8
  otu_table(ps.ng.tax_most_)[,c("35")] <- otu_table(ps.ng.tax_most_)[,c("35")]/8
  otu_table(ps.ng.tax_most_)[,c("36")] <- otu_table(ps.ng.tax_most_)[,c("36")]/8
  otu_table(ps.ng.tax_most_)[,c("37")] <- otu_table(ps.ng.tax_most_)[,c("37")]/8
  otu_table(ps.ng.tax_most_)[,c("38")] <- otu_table(ps.ng.tax_most_)[,c("38")]/8
  otu_table(ps.ng.tax_most_)[,c("39")] <- otu_table(ps.ng.tax_most_)[,c("39")]/8
  otu_table(ps.ng.tax_most_)[,c("51")] <- otu_table(ps.ng.tax_most_)[,c("51")]/8

  otu_table(ps.ng.tax_most_)[,c("40")] <- otu_table(ps.ng.tax_most_)[,c("40")]/6
  otu_table(ps.ng.tax_most_)[,c("41")] <- otu_table(ps.ng.tax_most_)[,c("41")]/6
  otu_table(ps.ng.tax_most_)[,c("42")] <- otu_table(ps.ng.tax_most_)[,c("42")]/6
  otu_table(ps.ng.tax_most_)[,c("43")] <- otu_table(ps.ng.tax_most_)[,c("43")]/6
  otu_table(ps.ng.tax_most_)[,c("44")] <- otu_table(ps.ng.tax_most_)[,c("44")]/6
  otu_table(ps.ng.tax_most_)[,c("46")] <- otu_table(ps.ng.tax_most_)[,c("46")]/6

  otu_table(ps.ng.tax_most_)[,c("47")] <- otu_table(ps.ng.tax_most_)[,c("47")]/7
  otu_table(ps.ng.tax_most_)[,c("48")] <- otu_table(ps.ng.tax_most_)[,c("48")]/7
  otu_table(ps.ng.tax_most_)[,c("49")] <- otu_table(ps.ng.tax_most_)[,c("49")]/7
  otu_table(ps.ng.tax_most_)[,c("50")] <- otu_table(ps.ng.tax_most_)[,c("50")]/7
  otu_table(ps.ng.tax_most_)[,c("52")] <- otu_table(ps.ng.tax_most_)[,c("52")]/7
  otu_table(ps.ng.tax_most_)[,c("53")] <- otu_table(ps.ng.tax_most_)[,c("53")]/7
  otu_table(ps.ng.tax_most_)[,c("55")] <- otu_table(ps.ng.tax_most_)[,c("55")]/7

  #plot_bar(ps.ng.tax_most_swab_, x="Phylum", fill = "Phylum", facet_grid = Patient~RoundDay) + geom_bar(aes(color=Phylum, fill=Phylum), stat="identity", position="stack") + theme(axis.text = element_text(size = theme.size, colour="black"))
  plot_bar(ps.ng.tax_most_, x="Phylum", fill="Phylum", facet_grid = pre_post_stroke~Sex_age) + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black"), axis.text.x=element_blank(), axis.ticks=element_blank()) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=2))
```
```{r, echo=FALSE, warning=FALSE}  
  #knitr::include_graphics("./Phyloseq_files/figure-html/unnamed-chunk-10-1.png")
  #> tax_table(carbom)
  #Taxonomy Table:     [205 taxa by 7 taxonomic ranks]:
  #     Domain      Supergroup       Division      Class                 
  #Otu001 "Eukaryota" "Archaeplastida" "Chlorophyta" "Mamiellophyceae"    
  #       Order                      Family                 Genus     
  #Otu001 "Mamiellales"              "Bathycoccaceae"       "Ostreococcus"
  #sample_data(ps.ng.tax)
```

## Bar plots in class level
```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_copied, fill="Class") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=3))
```

Regroup together pre vs post stroke samples and normalize number of reads in each group using median sequencing depth.
```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_pre_post_stroke_, fill="Class") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = theme.size, colour="black"))
```
\pagebreak

Use color according to class. Do separate panels Stroke and Sex_age.
```{r, echo=TRUE, warning=FALSE}
  #-- If existing replicates, to be processed as follows --
  plot_bar(ps.ng.tax_most_, x="Class", fill="Class", facet_grid = pre_post_stroke~Sex_age) + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black"), axis.text.x=element_blank(), axis.ticks=element_blank()) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=3)) 
```

## Bar plots in order level

```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_copied, fill="Order") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=4))                                 
```

Regroup together pre vs post stroke and normalize number of reads in each group using median sequencing depth.
```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_pre_post_stroke_, fill="Order") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = theme.size, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=4))
```
\pagebreak

Use color according to order. Do separate panels Stroke and Sex_age.
```{r, echo=TRUE, warning=FALSE}
  #FITTING7: regulate the bar height if it has replicates
  plot_bar(ps.ng.tax_most_, x="Order", fill="Order", facet_grid = pre_post_stroke~Sex_age) + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1", "royalblue4", "darksalmon", "darkblue","royalblue4", "dodgerblue3", "steelblue1", "lightskyblue", "darkseagreen", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "brown1", "darkorange1", "cyan1", "darkgrey")) + theme(axis.text = element_text(size = 5, colour="black"), axis.text.x=element_blank(), axis.ticks=element_blank()) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=4))
```

## Bar plots in family level

```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_copied, fill="Family") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#FFFFFF", "#FFFF00", "#00FFFF", "#FFA500", "#00FF00", "#808080", "#FF00FF", "#800080", "#FDD017", "#0000A0", "#3BB9FF", "#008000", "#800000", "#ADD8E6", "#F778A1", "#800517", "#736F6E", "#F52887", "#C11B17", "#5CB3FF", "#A52A2A", "#FF8040", "#2B60DE", "#736AFF", "#1589FF", "#98AFC7", "#8D38C9", "#307D7E", "#F6358A", "#151B54", "#6D7B8D", "#FDEEF4", "#FF0080", "#F88017", "#2554C7", "#FFF8C6", "#D4A017", "#306EFF", "#151B8D", "#9E7BFF", "#EAC117", "#E0FFFF", "#15317E", "#6C2DC7", "#FBB917", "#FCDFFF", "#15317E", "#254117", "#FAAFBE", "#357EC7")) + theme(axis.text = element_text(size = 5, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=8))
```

Regroup together pre vs post stroke samples and normalize number of reads in each group using median sequencing depth.
```{r, echo=TRUE, warning=FALSE}
  plot_bar(ps.ng.tax_most_pre_post_stroke_, fill="Family") + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#FFFFFF", "#FFFF00", "#00FFFF", "#FFA500", "#00FF00", "#808080", "#FF00FF", "#800080", "#FDD017", "#0000A0", "#3BB9FF", "#008000", "#800000", "#ADD8E6", "#F778A1", "#800517", "#736F6E", "#F52887", "#C11B17", "#5CB3FF", "#A52A2A", "#FF8040", "#2B60DE", "#736AFF", "#1589FF", "#98AFC7", "#8D38C9", "#307D7E", "#F6358A", "#151B54", "#6D7B8D", "#FDEEF4", "#FF0080", "#F88017", "#2554C7", "#FFF8C6", "#D4A017", "#306EFF", "#151B8D", "#9E7BFF", "#EAC117", "#E0FFFF", "#15317E", "#6C2DC7", "#FBB917", "#FCDFFF", "#15317E", "#254117", "#FAAFBE", "#357EC7")) + theme(axis.text = element_text(size = theme.size, colour="black")) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=8))
```
\pagebreak

Use color according to family. Do separate panels Stroke and Sex_age.
```{r, echo=TRUE, warning=FALSE}
  #-- If existing replicates, to be processed as follows --
  plot_bar(ps.ng.tax_most_, x="Family", fill="Family", facet_grid = pre_post_stroke~Sex_age) + geom_bar(aes(), stat="identity", position="stack") +
  scale_fill_manual(values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#FFFFFF", "#FFFF00", "#00FFFF", "#FFA500", "#00FF00", "#808080", "#FF00FF", "#800080", "#FDD017", "#0000A0", "#3BB9FF", "#008000", "#800000", "#ADD8E6", "#F778A1", "#800517", "#736F6E", "#F52887", "#C11B17", "#5CB3FF", "#A52A2A", "#FF8040", "#2B60DE", "#736AFF", "#1589FF", "#98AFC7", "#8D38C9", "#307D7E", "#F6358A", "#151B54", "#6D7B8D", "#FDEEF4", "#FF0080", "#F88017", "#2554C7", "#FFF8C6", "#D4A017", "#306EFF", "#151B8D", "#9E7BFF", "#EAC117", "#E0FFFF", "#15317E", "#6C2DC7", "#FBB917", "#FCDFFF", "#15317E", "#254117", "#FAAFBE", "#357EC7")) + theme(axis.text = element_text(size = 5, colour="black"), axis.text.x=element_blank(), axis.ticks=element_blank()) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=8)) 
```

```{r, echo=FALSE, warning=FALSE}
  #MOVE_FROM_ABOVE: ## Bar plots in genus level
  #MOVE_FROM_ABOVE: Regroup together pre vs post stroke samples and normalize number of reads in each group using median sequencing depth.
  #plot_bar(ps.ng.tax_most_pre_post_stroke_, fill="Genus") + geom_bar(aes(), stat="identity", position="stack") +
  #scale_fill_manual(values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#FFFFFF", "#FFFF00", "#00FFFF", "#FFA500", "#00FF00", "#808080", "#FF00FF", "#800080", "#FDD017", "#0000A0", "#3BB9FF", "#008000", "#800000", "#ADD8E6", "#F778A1", "#800517", "#736F6E", "#F52887", "#C11B17", "#5CB3FF", "#A52A2A", "#FF8040", "#2B60DE", "#736AFF", "#1589FF", "#98AFC7", "#8D38C9", "#307D7E", "#F6358A", "#151B54", "#6D7B8D", "#FDEEF4", "#FF0080", "#F88017", "#2554C7", "#FFF8C6", "#D4A017", "#306EFF", "#151B8D", "#9E7BFF", "#EAC117", "#E0FFFF", "#15317E", "#6C2DC7", "#FBB917", "#FCDFFF", "#15317E", "#254117", "#FAAFBE", "#357EC7")) + theme(axis.text = element_text(size = theme.size, colour="black")) + theme(legend.position="bottom")
```
\pagebreak

```{r, echo=FALSE, warning=FALSE}
  #MOVE_FROM_ABOVE: Use color according to genus. Do separate panels Stroke and Sex_age.
  ##-- If existing replicates, to be processed as follows --
  #plot_bar(ps.ng.tax_most_, x="Genus", fill="Genus", facet_grid = pre_post_stroke~Sex_age) + geom_bar(aes(), stat="identity", position="stack") +
  #scale_fill_manual(values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#FFFFFF", "#FFFF00", "#00FFFF", "#FFA500", "#00FF00", "#808080", "#FF00FF", "#800080", "#FDD017", "#0000A0", "#3BB9FF", "#008000", "#800000", "#ADD8E6", "#F778A1", "#800517", "#736F6E", "#F52887", "#C11B17", "#5CB3FF", "#A52A2A", "#FF8040", "#2B60DE", "#736AFF", "#1589FF", "#98AFC7", "#8D38C9", "#307D7E", "#F6358A", "#151B54", "#6D7B8D", "#FDEEF4", "#FF0080", "#F88017", "#2554C7", "#FFF8C6", "#D4A017", "#306EFF", "#151B8D", "#9E7BFF", "#EAC117", "#E0FFFF", "#15317E", "#6C2DC7", "#FBB917", "#FCDFFF", "#15317E", "#254117", "#FAAFBE", "#357EC7")) + theme(axis.text = element_text(size = 6, colour="black"), axis.text.x=element_blank(), axis.ticks=element_blank()) + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=18)) 
```

\pagebreak

# Alpha diversity
Plot Chao1 richness estimator, Observed OTUs, Shannon index, and Phylogenetic diversity. 
Regroup together samples from the same group.
```{r, echo=FALSE, warning=FALSE}
# using rarefied data
#FITTING2: CONSOLE: 
#gunzip table_even4753.biom.gz
#alpha_diversity.py -i table_even42369.biom --metrics chao1,observed_otus,shannon,PD_whole_tree -o adiv_even.txt -t ../clustering/rep_set.tre
#gunzip table_even4753.biom.gz
#alpha_diversity.py -i table_even4753.biom --metrics chao1,observed_otus,shannon,PD_whole_tree -o adiv_even.txt -t ../clustering_stool/rep_set.tre
#gunzip table_even4753.biom.gz
#alpha_diversity.py -i table_even4753.biom --metrics chao1,observed_otus,shannon,PD_whole_tree -o adiv_even.txt -t ../clustering_swab/rep_set.tre
```

```{r, echo=TRUE, warning=FALSE}
hmp.div_qiime <- read.csv("adiv_even.txt", sep="\t") 
colnames(hmp.div_qiime) <- c("sam_name", "chao1", "observed_otus", "shannon", "PD_whole_tree")
row.names(hmp.div_qiime) <- hmp.div_qiime$sam_name
div.df <- merge(hmp.div_qiime, hmp.meta, by = "sam_name")
div.df2 <- div.df[, c("Group", "chao1", "shannon", "observed_otus", "PD_whole_tree")]
colnames(div.df2) <- c("Group", "Chao-1", "Shannon", "OTU", "Phylogenetic Diversity")
#colnames(div.df2)
options(max.print=999999)
#27     H47 830.5000 5.008482 319               10.60177
#FITTING4: if occuring "Computation failed in `stat_signif()`:not enough 'y' observations"
#means: the patient H47 contains only one sample, it should be removed for the statistical p-values calculations. 
#delete H47(1)
#div.df2 <- div.df2[-c(3), ] 
#div.df2 <- div.df2[-c(55,54, 45,40,39,27,26,25,1), ] 
knitr::kable(div.df2) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

#https://uc-r.github.io/t_test
#We can perform the test with t.test and transform our data and we can also perform the nonparametric test with the wilcox.test function.
stat.test.Shannon <- compare_means(
Shannon ~ Group, data = div.df2,
method = "t.test"
)
knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

div_df_melt <- reshape2::melt(div.df2)
#head(div_df_melt)

#https://plot.ly/r/box-plots/#horizontal-boxplot
#http://www.sthda.com/english/wiki/print.php?id=177
#https://rpkgs.datanovia.com/ggpubr/reference/as_ggplot.html
#http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/82-ggplot2-easy-way-to-change-graphical-parameters/
#https://plot.ly/r/box-plots/#horizontal-boxplot
#library("gridExtra")
#par(mfrow=c(4,1))
p <- ggboxplot(div_df_melt, x = "Group", y = "value",
              facet.by = "variable", 
              scales = "free",
              width = 0.5,
              fill = "gray", legend= "right")
#ggpar(p, xlab = FALSE, ylab = FALSE)
lev <- levels(factor(div_df_melt$Group)) # get the variables
#FITTING4: delete H47(1) in lev
#lev <- lev[-c(3)]
# make a pairwise list that we want to compare.
#my_stat_compare_means
#https://stackoverflow.com/questions/47839988/indicating-significance-with-ggplot2-in-a-boxplot-with-multiple-groups
L.pairs <- combn(seq_along(lev), 2, simplify = FALSE, FUN = function(i) lev[i]) #%>% filter(p.signif != "ns")
my_stat_compare_means  <- function (mapping = NULL, data = NULL, method = NULL, paired = FALSE, 
    method.args = list(), ref.group = NULL, comparisons = NULL, 
    hide.ns = FALSE, label.sep = ", ", label = NULL, label.x.npc = "left", 
    label.y.npc = "top", label.x = NULL, label.y = NULL, tip.length = 0.03, 
    symnum.args = list(), geom = "text", position = "identity", 
    na.rm = FALSE, show.legend = NA, inherit.aes = TRUE, ...) 
{
    if (!is.null(comparisons)) {
        method.info <- ggpubr:::.method_info(method)
        method <- method.info$method
        method.args <- ggpubr:::.add_item(method.args, paired = paired)
        if (method == "wilcox.test") 
            method.args$exact <- FALSE
        pms <- list(...)
        size <- ifelse(is.null(pms$size), 0.3, pms$size)
        color <- ifelse(is.null(pms$color), "black", pms$color)
        map_signif_level <- FALSE
        if (is.null(label)) 
            label <- "p.format"
        if (ggpubr:::.is_p.signif_in_mapping(mapping) | (label %in% "p.signif")) {
            if (ggpubr:::.is_empty(symnum.args)) {
                map_signif_level <- c(`****` = 1e-04, `***` = 0.001, 
                  `**` = 0.01, `*` = 0.05, ns = 1)
            } else {
              map_signif_level <- symnum.args
            } 
            if (hide.ns) 
                names(map_signif_level)[5] <- " "
        }
        step_increase <- ifelse(is.null(label.y), 0.12, 0)
        ggsignif::geom_signif(comparisons = comparisons, y_position = label.y, 
            test = method, test.args = method.args, step_increase = step_increase, 
            size = size, color = color, map_signif_level = map_signif_level, 
            tip_length = tip.length, data = data)
    } else {
        mapping <- ggpubr:::.update_mapping(mapping, label)
        layer(stat = StatCompareMeans, data = data, mapping = mapping, 
            geom = geom, position = position, show.legend = show.legend, 
            inherit.aes = inherit.aes, params = list(label.x.npc = label.x.npc, 
                label.y.npc = label.y.npc, label.x = label.x, 
                label.y = label.y, label.sep = label.sep, method = method, 
                method.args = method.args, paired = paired, ref.group = ref.group, 
                symnum.args = symnum.args, hide.ns = hide.ns, 
                na.rm = na.rm, ...))
    }
}

p2 <- p + 
stat_compare_means(
  method="t.test",
  comparisons = list(c("Group1", "Group2"), c("Group1", "Group3"), c("Group1", "Group4"), c("Group1", "Group6"), c("Group1", "Group8"), c("Group2", "Group5"),c("Group4", "Group5"),c("Group4", "Group6"),c("Group4", "Group7"),c("Group6", "Group7")), 
  label = "p.signif",
  symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns"))
)
#comparisons = L.pairs,
#symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05), symbols = c("****", "***", "**", "*")),
#stat_pvalue_manual
#print(p2)
#https://stackoverflow.com/questions/20500706/saving-multiple-ggplots-from-ls-into-one-and-separate-files-in-r
#FITTING3: mkdir figures
ggsave("./figures/alpha_diversity_Group.png", device="png", height = 10, width = 12)
ggsave("./figures/alpha_diversity_Group.svg", device="svg", height = 10, width = 12)

p3 <- p + 
stat_compare_means(
  method="t.test",
  comparisons = list(c("Group2", "Group4"), c("Group2", "Group6"), c("Group4", "Group8"), c("Group6", "Group8")), 
  label = "p.signif",
  symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),
)
#symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05), symbols = c("****", "***", "**", "*")),
#stat_pvalue_manual
#print(p2)
#https://stackoverflow.com/questions/20500706/saving-multiple-ggplots-from-ls-into-one-and-separate-files-in-r
#FITTING3: mkdir figures
ggsave("./figures/alpha_diversity_Group2.png", device="png", height = 10, width = 12)
ggsave("./figures/alpha_diversity_Group2.svg", device="svg", height = 10, width = 12)
```

# Selected alpha diversity
```{r, echo=TRUE, warning=FALSE, fig.cap="Alpha diversity", out.width = '100%', fig.align= "center"}
knitr::include_graphics("./figures/alpha_diversity_Group2.png")

selected_alpha_diversities<-read.csv("selected_alpha_diversities.txt",sep="\t")
knitr::kable(selected_alpha_diversities) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
```

# Beta diversity
```{r, echo=TRUE, warning=FALSE, fig.cap="Beta diversity", out.width = '100%', fig.align= "center"}
#file:///home/jhuang/DATA/Data_Marius_16S/core_diversity_e42369/bdiv_even42369_Group/unweighted_unifrac_boxplots/Group_Stats.txt
beta_diversity_group_stats<-read.csv("unweighted_unifrac_boxplots_Group_Stats.txt",sep="\t")
knitr::kable(beta_diversity_group_stats) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
#NOTE: Run this Phyloseq0.Rmd, then run the code of MicrobiotaProcess.R to manually generate PCoA.png, then run this Phyloseq.Rmd!
#NOTE: AT_FIRST_DEACTIVATE_THIS_LINE: knitr::include_graphics("./figures/PCoA.png")
```

# Differential abundance analysis

Differential abundance analysis aims to find the differences in the abundance of each taxa between two groups of samples, assigning a significance value to each comparison.

## Group2 vs Group4

```{r, echo=TRUE, warning=FALSE}
library("DESeq2")
#ALTERNATIVE using ps.ng.tax_most_copied: ps.ng.tax (40594) vs. ps.ng.tax_most_copied (166)
ps.ng.tax_sel <- ps.ng.tax
#FITTING5: correct the id of the group members, see FITTING6
otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("8","9","10","12","13","14",  "21","22","23","24","25","26","27","28")]
diagdds = phyloseq_to_deseq2(ps.ng.tax_sel, ~Group)
diagdds$Group <- relevel(diagdds$Group, "Group4")
diagdds = DESeq(diagdds, test="Wald", fitType="parametric")
resultsNames(diagdds)

res = results(diagdds, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps.ng.tax_sel)[rownames(sigtab), ], "matrix"))
sigtab <- sigtab[rownames(sigtab) %in% rownames(tax_table(ps.ng.tax_most_copied)), ]
kable(sigtab) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
#rownames(sigtab) %in% rownames(tax_table(ps.ng.tax_most_copied))
#subv %in% v
### returns a vector TRUE FALSE
#is.element(subv, v)
### returns a vector TRUE FALSE

library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab$log2FoldChange, sigtab$Order, function(x) max(x))
x = sort(x)
sigtab$Order = factor(as.character(sigtab$Order), levels=names(x))
x = tapply(sigtab$log2FoldChange, sigtab$Family, function(x) max(x))
x = sort(x)
sigtab$Family = factor(as.character(sigtab$Family), levels=names(x))
ggplot(sigtab, aes(x=log2FoldChange, y=Family, color=Order)) + geom_point(aes(size=padj)) + scale_size_continuous(name="padj",range=c(8,4))+
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust=0.5))

#Error in checkForExperimentalReplicates(object, modelMatrix) : 
#  The design matrix has the same number of samples and coefficients to fit,
#  so estimation of dispersion is not possible. Treating samples
#  as replicates was deprecated in v1.20 and no longer supported since v1.22.
```

## Group2 vs Group6

```{r, echo=TRUE, warning=FALSE}
ps.ng.tax_sel <- ps.ng.tax
otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("8","9","10","12","13","14",    "33","34","35","36","37","38","39","51")]
diagdds = phyloseq_to_deseq2(ps.ng.tax_sel, ~Group)
diagdds$Group <- relevel(diagdds$Group, "Group6")
diagdds = DESeq(diagdds, test="Wald", fitType="parametric")
resultsNames(diagdds)

res = results(diagdds, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps.ng.tax_sel)[rownames(sigtab), ], "matrix"))
sigtab <- sigtab[rownames(sigtab) %in% rownames(tax_table(ps.ng.tax_most_copied)), ]
kable(sigtab) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab$log2FoldChange, sigtab$Order, function(x) max(x))
x = sort(x)
sigtab$Order = factor(as.character(sigtab$Order), levels=names(x))
x = tapply(sigtab$log2FoldChange, sigtab$Family, function(x) max(x))
x = sort(x)
sigtab$Family = factor(as.character(sigtab$Family), levels=names(x))
ggplot(sigtab, aes(x=log2FoldChange, y=Family, color=Order)) + geom_point(aes(size=padj)) + scale_size_continuous(name="padj",range=c(8,4))+
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust=0.5))
```

## Group4 vs Group8

```{r, echo=TRUE, warning=FALSE}
ps.ng.tax_sel <- ps.ng.tax
otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("21","22","23","24","25","26","27","28",    "47","48","49","50","52","53","55")]
diagdds = phyloseq_to_deseq2(ps.ng.tax_sel, ~Group)
diagdds$Group <- relevel(diagdds$Group, "Group8")
diagdds = DESeq(diagdds, test="Wald", fitType="parametric")
resultsNames(diagdds)

res = results(diagdds, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps.ng.tax_sel)[rownames(sigtab), ], "matrix"))
sigtab <- sigtab[rownames(sigtab) %in% rownames(tax_table(ps.ng.tax_most_copied)), ]
kable(sigtab) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab$log2FoldChange, sigtab$Order, function(x) max(x))
x = sort(x)
sigtab$Order = factor(as.character(sigtab$Order), levels=names(x))
x = tapply(sigtab$log2FoldChange, sigtab$Family, function(x) max(x))
x = sort(x)
sigtab$Family = factor(as.character(sigtab$Family), levels=names(x))
ggplot(sigtab, aes(x=log2FoldChange, y=Family, color=Order)) + geom_point(aes(size=padj)) + scale_size_continuous(name="padj",range=c(8,4))+
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust=0.5))
```

## Group6 vs Group8

```{r, echo=TRUE, warning=FALSE}
ps.ng.tax_sel <- ps.ng.tax
otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("33","34","35","36","37","38","39","51",    "47","48","49","50","52","53","55")]
diagdds = phyloseq_to_deseq2(ps.ng.tax_sel, ~Group)
diagdds$Group <- relevel(diagdds$Group, "Group8")
diagdds = DESeq(diagdds, test="Wald", fitType="parametric")
resultsNames(diagdds)

res = results(diagdds, cooksCutoff = FALSE)
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps.ng.tax_sel)[rownames(sigtab), ], "matrix"))
sigtab <- sigtab[rownames(sigtab) %in% rownames(tax_table(ps.ng.tax_most_copied)), ]
kable(sigtab) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab$log2FoldChange, sigtab$Order, function(x) max(x))
x = sort(x)
sigtab$Order = factor(as.character(sigtab$Order), levels=names(x))
x = tapply(sigtab$log2FoldChange, sigtab$Family, function(x) max(x))
x = sort(x)
sigtab$Family = factor(as.character(sigtab$Family), levels=names(x))
ggplot(sigtab, aes(x=log2FoldChange, y=Family, color=Order)) + geom_point(aes(size=padj)) + scale_size_continuous(name="padj",range=c(8,4))+
  theme(axis.text.x = element_text(angle = -25, hjust = 0, vjust=0.5))
```

MicrobiotaProcess.R

# -----------------------------------
# ---- prepare the R environment ----
#Rscript MicrobiotaProcess.R
#NOTE: exit R script, then login again R-environment; rm -rf Phyloseq*_cache
rmarkdown::render('Phyloseq.Rmd',output_file='Phyloseq.html')

# -----------------------------
# ---- 3.1. bridges other tools
##https://github.com/YuLab-SMU/MicrobiotaProcess
##https://www.bioconductor.org/packages/release/bioc/vignettes/MicrobiotaProcess/inst/doc/MicrobiotaProcess.html
##https://chiliubio.github.io/microeco_tutorial/intro.html#framework
##https://yiluheihei.github.io/microbiomeMarker/reference/plot_cladogram.html
#BiocManager::install("MicrobiotaProcess")
#install.packages("microeco")
#install.packages("ggalluvial")
#install.packages("ggh4x")

library(MicrobiotaProcess)    
library(microeco)
library(ggalluvial)
library(ggh4x)
library(gghalves)

## Convert the phyloseq object to a MicrobiotaProcess object
#mp <- as.MicrobiotaProcess(ps.ng.tax)

#mt <- phyloseq2microeco(ps.ng.tax) #--> ERROR
#abundance_table <- mt$abun_table
#taxonomy_table <- mt$tax_table

#ps.ng.tax_abund <- phyloseq::filter_taxa(ps.ng.tax, function(x) sum(x > total*0.01) > 0, TRUE)
#ps.ng.tax_most = phyloseq::filter_taxa(ps.ng.tax_rel, function(x) mean(x) > 0.001, TRUE)

##OPTION1: take all samples, prepare mpse_abund!
##mpse <- ps.ng.tax %>% as.MPSE()
#mpse_abund <- ps.ng.tax_abund %>% as.MPSE()

##OPTION2: take partial samples, prepare mpse_abund
ps.ng.tax_sel <- ps.ng.tax  #IMPORTANT
##otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("1","2","5","6","7",  "15","16","17","18","19","20",  "29","30","31","32",  "40","41","42","43","44","46")]
##NOTE: Only choose Group2, Group4, Group6, Group8
#> ps.ng.tax_sel
#otu_table()   OTU Table:         [ 37465 taxa and 29 samples ]
#sample_data() Sample Data:       [ 29 samples by 10 sample variables ]
#tax_table()   Taxonomy Table:    [ 37465 taxa by 7 taxonomic ranks ]
#phy_tree()    Phylogenetic Tree: [ 37465 tips and 37461 internal nodes ]
otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax)[,c("8","9","10","12","13","14",  "21","22","23","24","25","26","27","28",  "33","34","35","36","37","38","39","51",  "47","48","49","50","52","53","55")]
#For quick calculation
#otu_table(ps.ng.tax_sel) <- otu_table(ps.ng.tax_abund)[,c("8","9","10","12","13","14",  "21","22","23","24","25","26","27","28",  "33","34","35","36","37","38","39","51",  "47","48","49","50","52","53","55")]
mpse_abund <- ps.ng.tax_sel %>% as.MPSE()

# -----------------------------------
# ---- 3.2. alpha diversity analysis 
# Rarefied species richness + RareAbundance
mpse_abund %<>% mp_rrarefy()
# 'chunks' represent the split number of each sample to calculate alpha
# diversity, default is 400. e.g. If a sample has total 40000
# reads, if chunks is 400, it will be split to 100 sub-samples
# (100, 200, 300,..., 40000), then alpha diversity index was
# calculated based on the sub-samples. 
# '.abundance' the column name of abundance, if the '.abundance' is not be 
# rarefied calculate rarecurve, user can specific 'force=TRUE'.

# + RareAbundance
mpse_abund %<>% 
    mp_cal_rarecurve(
        .abundance = RareAbundance,
        chunks = 400
    )
# The RareAbundanceRarecurve column will be added the colData slot 
# automatically (default action="add")
mpse_abund %>% print(width=180, n=100)

# default will display the confidence interval around smooth.
# se=TRUE
p1 <- mpse_abund %>% 
      mp_plot_rarecurve(
        .rare = RareAbundanceRarecurve, 
        .alpha = Observe,
      )

p2 <- mpse_abund %>% 
      mp_plot_rarecurve(
        .rare = RareAbundanceRarecurve, 
        .alpha = Observe, 
        .group = pre_post_stroke
      ) +
      scale_color_manual(values=c("#00A087FF", "#3C5488FF")) +
      scale_fill_manual(values=c("#00A087FF", "#3C5488FF"), guide="none")

# combine the samples belong to the same groups if plot.group=TRUE
p3 <- mpse_abund %>% 
      mp_plot_rarecurve(
        .rare = RareAbundanceRarecurve, 
        .alpha = "Observe", 
        .group = pre_post_stroke, 
        plot.group = TRUE
      ) +
      scale_color_manual(values=c("#00A087FF", "#3C5488FF")) +
      scale_fill_manual(values=c("#00A087FF", "#3C5488FF"),guide="none")

png("rarefaction_of_samples_or_groups.png", width=1080, height=600)
p1 + p2 + p3
dev.off()

# ------------------------------------------
# 3.3. calculate alpha index and visualization
library(ggplot2)
library(MicrobiotaProcess)
mpse_abund %<>% 
    mp_cal_alpha(.abundance=RareAbundance)
mpse_abund

f1 <- mpse_abund %>% 
      mp_plot_alpha(
        .group=pre_post_stroke, 
        .alpha=c(Observe, Chao1, ACE, Shannon, Simpson, Pielou)
      ) +
      scale_fill_manual(values=c("#00A087FF", "#3C5488FF"), guide="none") +
      scale_color_manual(values=c("#00A087FF", "#3C5488FF"), guide="none")

f2 <- mpse_abund %>%
      mp_plot_alpha(
        .alpha=c(Observe, Chao1, ACE, Shannon, Simpson, Pielou)
      )
png("alpha_diversity_comparison.png", width=1000, height=1000)
f1 / f2
dev.off()

# -------------------------------------------
# 3.4. The visualization of taxonomy abundance (Class)
mpse_abund %<>%
    mp_cal_abundance( # for each samples
      .abundance = RareAbundance
    ) %>%
    mp_cal_abundance( # for each groups 
      .abundance=RareAbundance,
      .group=pre_post_stroke
    )
mpse_abund

# visualize the relative abundance of top 20 phyla for each sample.
p1 <- mpse_abund %>%
        mp_plot_abundance(
          .abundance=RareAbundance,
          .group=time, 
          taxa.class = Class, 
          topn = 20,
          relative = TRUE
        )
# visualize the abundance (rarefied) of top 20 phyla for each sample.
p2 <- mpse_abund %>%
          mp_plot_abundance(
            .abundance=RareAbundance,
            .group=time,
            taxa.class = Class,
            topn = 20,
            relative = FALSE
          )
png("relative_abundance_and_abundance.png", width= 1200, height=600) #NOT PRODUCED!
p1 / p2
dev.off()

h1 <- mpse_abund %>%
        mp_plot_abundance(
          .abundance = RareAbundance,
          .group = pre_post_stroke,
          taxa.class = Class,
          relative = TRUE,
          topn = 20,
          geom = 'heatmap',
          features.dist = 'euclidean',
          features.hclust = 'average',
          sample.dist = 'bray',
          sample.hclust = 'average'
        )

h2 <- mpse_abund %>%
          mp_plot_abundance(
            .abundance = RareAbundance,
            .group = pre_post_stroke,
            taxa.class = Class,
            relative = FALSE,
            topn = 20,
            geom = 'heatmap',
            features.dist = 'euclidean',
            features.hclust = 'average',
            sample.dist = 'bray',
            sample.hclust = 'average'
          )
# the character (scale or theme) of figure can be adjusted by set_scale_theme
# refer to the mp_plot_dist
png("relative_abundance_and_abundance_heatmap.png", width= 1200, height=600)
aplot::plot_list(gglist=list(h1, h2), tag_levels="A")
dev.off()

# visualize the relative abundance of top 20 class for each .group (pre_post_stroke)
p3 <- mpse_abund %>%
        mp_plot_abundance(
            .abundance=RareAbundance, 
            .group=pre_post_stroke,
            taxa.class = Class,
            topn = 20,
            plot.group = TRUE
          )

# visualize the abundance of top 20 phyla for each .group (time)
p4 <- mpse_abund %>%
          mp_plot_abundance(
            .abundance=RareAbundance,
            .group= pre_post_stroke,
            taxa.class = Class,
            topn = 20,
            relative = FALSE,
            plot.group = TRUE
          )
png("relative_abundance_and_abundance_groups.png", width= 1000, height=1000)
p3 / p4
dev.off()

# ---------------------------
# 3.5. Beta diversity analysis

# ---------------------------------------------
# 3.5.1 The distance between samples or groups
# standardization
# mp_decostand wraps the decostand of vegan, which provides
# many standardization methods for community ecology.
# default is hellinger, then the abundance processed will
# be stored to the assays slot. 
mpse_abund %<>% 
    mp_decostand(.abundance=Abundance)
mpse_abund

# calculate the distance between the samples.
# the distance will be generated a nested tibble and added to the
# colData slot.
mpse_abund %<>% mp_cal_dist(.abundance=hellinger, distmethod="bray")
mpse_abund

# mp_plot_dist provides there methods to visualize the distance between the samples or groups
# when .group is not provided, the dot heatmap plot will be return
p1 <- mpse_abund %>% mp_plot_dist(.distmethod = bray)
png("distance_between_samples.png", width= 1000, height=1000)
p1
dev.off()

# when .group is provided, the dot heatmap plot with group information will be return.
p2 <- mpse_abund %>% mp_plot_dist(.distmethod = bray, .group = pre_post_stroke)
# The scale or theme of dot heatmap plot can be adjusted using set_scale_theme function.
p2 %>% set_scale_theme(
          x = scale_fill_manual(
                values=c("orange", "deepskyblue"), 
                guide = guide_legend(
                            keywidth = 1, 
                            keyheight = 0.5, 
                            title.theme = element_text(size=8),
                            label.theme = element_text(size=6)
                )
              ), 
          aes_var = pre_post_stroke # specific the name of variable 
      ) %>%
      set_scale_theme(
          x = scale_color_gradient(
                guide = guide_legend(keywidth = 0.5, keyheight = 0.5)
              ),
          aes_var = bray
      ) %>%
      set_scale_theme(
          x = scale_size_continuous(
                range = c(0.1, 3),
                guide = guide_legend(keywidth = 0.5, keyheight = 0.5)
              ),
          aes_var = bray
      )
png("distance_between_samples_with_group_info.png", width= 1000, height=1000)
p2
dev.off()

# when .group is provided and group.test is TRUE, the comparison of different groups will be returned
p3 <- mpse_abund %>% mp_plot_dist(.distmethod = bray, .group = pre_post_stroke, group.test=TRUE, textsize=2)
png("comparison_of_distance.png", width= 1000, height=1000)
p3
dev.off()

# -----------------------
# 3.5.2 The PCoA analysis

#install.packages("corrr")
library(corrr)
#install.packages("ggside")
library(ggside)
mpse_abund %<>% 
    mp_cal_pcoa(.abundance=hellinger, distmethod="bray")
# The dimensions of ordination analysis will be added the colData slot (default).
mpse_abund
#> methods(class=class(mpse_abund))
# [1] [                        [[<-                     [<-                     
# [4] $                        $<-                      arrange                 
# [7] as_tibble                as.data.frame            as.phyloseq             
#[10] coerce                   coerce<-                 colData<-               
#[13] distinct                 filter                   group_by                
#[16] left_join                mp_adonis                mp_aggregate_clade      
#[19] mp_aggregate             mp_anosim                mp_balance_clade        
#[22] mp_cal_abundance         mp_cal_alpha             mp_cal_cca              
#[25] mp_cal_clust             mp_cal_dca               mp_cal_dist             
#[28] mp_cal_nmds              mp_cal_pca               mp_cal_pcoa             
#[31] mp_cal_pd_metric         mp_cal_rarecurve         mp_cal_rda              
#[34] mp_cal_upset             mp_cal_venn              mp_decostand            
#[37] mp_diff_analysis         mp_diff_clade            mp_envfit               
#[40] mp_extract_abundance     mp_extract_assays        mp_extract_dist         
#[43] mp_extract_feature       mp_extract_internal_attr mp_extract_rarecurve    
#[46] mp_extract_refseq        mp_extract_sample        mp_extract_taxonomy     
#[49] mp_extract_tree          mp_filter_taxa           mp_mantel               
#[52] mp_mrpp                  mp_plot_abundance        mp_plot_alpha           
#[55] mp_plot_diff_boxplot     mp_plot_diff_res         mp_plot_dist            
#[58] mp_plot_ord              mp_plot_rarecurve        mp_plot_upset           
#[61] mp_plot_venn             mp_rrarefy               mp_select_as_tip        
#[64] mp_stat_taxa             mutate                   otutree                 
#[67] otutree<-                print                    pull                    
#[70] refsequence              refsequence<-            rename                  
#[73] rownames<-               select                   show                    
# [ reached getOption("max.print") -- omitted 6 entries ]
#see '?methods' for accessing help and source code

# We also can perform adonis or anosim to check whether it is significant to the dissimilarities of groups.
mpse_abund %<>%
    mp_adonis(.abundance=hellinger, .formula=~Group, distmethod="bray", permutations=9999, action="add")
mpse_abund %>% mp_extract_internal_attr(name=adonis)

# ("1","2","5","6","7",  "15","16","17","18","19","20",  "29","30","31","32",  "40","41","42","43","44","46")
#div.df2[div.df2 == "Group1"] <- "aged.post"
#div.df2[div.df2 == "Group3"] <- "young.post"
#div.df2[div.df2 == "Group5"] <- "aged.post"
#div.df2[div.df2 == "Group7"] <- "young.post"

# ("8","9","10","12","13","14",  "21","22","23","24","25","26","27","28",  "33","34","35","36","37","38","39","51",  "47","48","49","50","52","53","55")
#div.df2[div.df2 == "Group2"] <- "aged.pre"
#div.df2[div.df2 == "Group4"] <- "young.pre"
#div.df2[div.df2 == "Group6"] <- "aged.pre"
#div.df2[div.df2 == "Group8"] <- "young.pre"

#Group1: f.aged and post
#Group2: f.aged and pre
#Group3: f.young and post
#Group4: f.young and pre
#Group5: m.aged and post
#Group6: m.aged and pre
#Group7: m.young and post
#Group8: m.young and pre

#[,c("1","2","5","6","7",                "8","9","10","12","13","14")]
#[,c("15","16","17","18","19","20",      "21","22","23","24","25","26","27","28")]
#[,c("29","30","31","32",                "33","34","35","36","37","38","39","51")]
#[,c("40","41","42","43","44","46",      "47","48","49","50","52","53","55")]

p1 <- mpse_abund %>%
        mp_plot_ord(
          .ord = pcoa, 
          .group = Group, 
          .color = Group, 
          .size = 2.4,
          .alpha = 1,
          ellipse = TRUE,
          show.legend = FALSE # don't display the legend of stat_ellipse
        ) +
        scale_fill_manual(
          #values = c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#cab2d6", "#6a3d9a"), 
          #values = c("#a6cee3", "#b2df8a", "#fb9a99", "#cab2d6"), 
          values = c("#1f78b4", "#33a02c", "#e31a1c", "#6a3d9a"), 
          guide = guide_legend(keywidth=1.6, keyheight=1.6, label.theme=element_text(size=12))
        ) +
        scale_color_manual(
          #values=c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#cab2d6", "#6a3d9a"),
          #values = c("#a6cee3", "#b2df8a", "#fb9a99", "#cab2d6"),
          values = c("#1f78b4", "#33a02c", "#e31a1c", "#6a3d9a"), 
          guide = guide_legend(keywidth=1.6, keyheight=1.6, label.theme=element_text(size=12))
        )
        #scale_fill_manual(values=c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#00FF00", "#FFFF00", "#00FFFF", "#FFA500")) +
        #scale_color_manual(values=c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#00FF00", "#FFFF00", "#00FFFF", "#FFA500"))
        #scale_fill_manual(values=c("#00A087FF", "#3C5488FF")) +
        #scale_color_manual(values=c("#00A087FF", "#3C5488FF")) 
#png("PCoA.png", width= 1000, height=1000)
svg("PCoA.svg", width= 11, height=10)
#svg("PCoA_.svg", width=10, height=10)
#svg("PCoA.svg")
pdf("PCoA.pdf")
p1
dev.off()

# The size of point also can be mapped to other variables such as Observe, or Shannon 
# Then the alpha diversity and beta diversity will be displayed simultaneously.
p2 <- mpse_abund %>% 
        mp_plot_ord(
          .ord = pcoa, 
          .group = Group, 
          .color = Group, 
          .size = Shannon,
          .alpha = Observe,
          ellipse = TRUE,
          show.legend = FALSE # don't display the legend of stat_ellipse 
        ) +
        scale_fill_manual(
          values = c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#00FF00", "#FFFF00", "#00FFFF", "#FFA500"), 
          guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=8))
        ) +
        scale_color_manual(
          values=c("#FF0000", "#000000", "#0000FF", "#C0C0C0", "#00FF00", "#FFFF00", "#00FFFF", "#FFA500"),
          guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=8))
        ) +
        scale_size_continuous(
          range=c(0.5, 3),
          guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=8))
        )

# ------------------------------------------
# 3.5.3 Hierarchical cluster (tree) analysis
#input should contain hellinger!
mpse_abund %<>%
      mp_cal_clust(
        .abundance = hellinger, 
        distmethod = "bray",
        hclustmethod = "average", # (UPGAE)
        action = "add" # action is used to control which result will be returned
      )
mpse_abund

# if action = 'add', the result of hierarchical cluster will be added to the MPSE object
# mp_extract_internal_attr can extract it. It is a treedata object, so it can be visualized
# by ggtree.
sample.clust <- mpse_abund %>% mp_extract_internal_attr(name='SampleClust')
sample.clust

library(ggtree)
p_cluster <- ggtree(sample.clust) + 
      geom_tippoint(aes(color=pre_post_stroke)) +
      geom_tiplab(as_ylab = TRUE) +
      ggplot2::scale_x_continuous(expand=c(0, 0.01))
png("hierarchical_cluster1.png", width= 1000, height=800)
p_cluster
dev.off()

library(ggtreeExtra)
library(ggplot2)
# # Extract relative abundance of phyla
# phyla.tb <- mouse.time.mpse %>% 
#             mp_extract_abundance(taxa.class=Phylum, topn=30)
# # The abundance of each samples is nested, it can be flatted using the unnest of tidyr.
# phyla.tb %<>% tidyr::unnest(cols=RareAbundanceBySample) %>% dplyr::rename(Phyla="label")
# phyla.tb
#
# p1 <- p + 
#       geom_fruit(
#          data=phyla.tb,
#          geom=geom_col,
#          mapping = aes(x = RelRareAbundanceBySample, 
#                        y = Sample, 
#                        fill = Phyla
#                  ),
#          orientation = "y",
#          #offset = 0.4,
#          pwidth = 3, 
#          axis.params = list(axis = "x", 
#                             title = "The relative abundance of phyla (%)",
#                             title.size = 4,
#                             text.size = 2, 
#                             vjust = 1),
#          grid.params = list()
#       )
# png("hierarchical_cluster2.png", width = 1000, height = 800)
# p1
# dev.off()

# Extract relative abundance of classes
mpse_abund %>% print(width=150)
class.tb <- mpse_abund %>% 
            mp_extract_abundance(taxa.class = Class, topn = 30)
# Flatten and rename the columns
class.tb %<>% tidyr::unnest(cols = RareAbundanceBySample) %>% dplyr::rename(Class = "label")
# View the data frame
class.tb
# Create the plot
p1 <- p + 
      geom_fruit(
        data = class.tb,
        geom = geom_col,
        mapping = aes(x = RelRareAbundanceBySample, 
                      y = Sample, 
                      fill = Class
                ),
        orientation = "y",
        pwidth = 3, 
        axis.params = list(axis = "x", 
                            title = "The relative abundance of classes (%)",
                            title.size = 4,
                            text.size = 2, 
                            vjust = 1),
        grid.params = list()
      )

# Save the plot to a file
png("hierarchical_cluster2.png", width = 1000, height = 800)
print(p1)
dev.off()

# -----------------------
# 3.6 Biomarker discovery
library(ggtree)
library(ggtreeExtra)
library(ggplot2)
library(MicrobiotaProcess)
library(tidytree)
library(ggstar)
library(forcats)
mpse_abund %>% print(width=150)

mpse_abund %<>%
    mp_cal_abundance( # for each samples
      .abundance = RareAbundance
    ) %>%
    mp_cal_abundance( # for each groups 
      .abundance=RareAbundance,
      .group=pre_post_stroke
    )
mpse_abund

mpse_abund %<>%
    mp_diff_analysis(
      .abundance = RelRareAbundanceBySample,
      .group = pre_post_stroke,
      first.test.alpha = 0.01
    )

# The result is stored to the taxatree or otutree slot, you can use mp_extract_tree to extract the specific slot.
taxa.tree <- mpse_abund %>% 
              mp_extract_tree(type="taxatree")
taxa.tree

## And the result tibble of different analysis can also be extracted 
## with tidytree (>=0.3.5)
taxa.tree %>% select(label, nodeClass, LDAupper, LDAmean, LDAlower, Sign_pre_post_stroke, pvalue, fdr) %>% dplyr::filter(!is.na(fdr))
taxa.tree %>% print(width=150, n=100)

#.data, layout, tree.type, .taxa.class, tiplab.size, offset.abun, pwidth.abun, offset.effsize, pwidth.effsize, group.abun, tiplab.linetype
p <- mpse_abund %>%
      mp_plot_diff_res(
        group.abun = TRUE,
        pwidth.abun=0.05,
        offset.abun=0.02,
        pwidth.effsize=0.3,
        offset.effsize=0.46,
        tiplab.size = 4.9
      ) +
      scale_fill_manual(values=c("deepskyblue", "orange")) +
      scale_fill_manual(
        aesthetics = "fill_new", # The fill aes was renamed to "fill_new" for the abundance dotplot layer
        values = c("deepskyblue", "orange")
      ) +
      scale_fill_manual(
        aesthetics = "fill_new_new", # The fill aes for hight light layer of tree was renamed to 'fill_new_new'
        values = c("#E41A1C", "#377EB8", "#4DAF4A",
                    "#984EA3", "#FF7F00", "#FFFF33",
                    "#A65628", "#F781BF", "#00FFFF", "#999999"
                  )
      ) + 
      theme(
    axis.title = element_text(size = 28),        # Font size for axis titles
    axis.text = element_text(size = 28),          # Font size for axis text
    plot.title = element_text(size = 28),         # Font size for plot title
    legend.title = element_text(size = 16),       # Font size for legend title
    legend.text = element_text(size = 14)         # Font size for legend text
  )
#p$layers[[2]]$geom <- geom_tiplab(fontsize = 22)  # Change 12 to the desired font size

png("differently_expressed_otu.png", width=2000, height=2000)
#svg("p7.svg",width=8, height=8)
p
dev.off()

f <- mpse_abund %>%
    mp_plot_diff_cladogram(
      label.size = 2.5,
      hilight.alpha = .3,
      bg.tree.size = .5,
      bg.point.size = 2,
      bg.point.stroke = .25
    ) +
    scale_fill_diff_cladogram( # set the color of different group.
      values = c('deepskyblue', 'orange')
    ) +
    scale_size_continuous(range = c(1, 4))
#png("cladogram.png", width=1000, height=1000)
svg("cladogram.svg", width=10, height=10)
f
dev.off()

## Extract the OTU table and taxonomy table from the phyloseq object
#otu_table <- phyloseq::otu_table(ps.ng.tax_abund) %>% as.data.frame() %>% as.matrix()
#tax_table <- phyloseq::tax_table(ps.ng.tax_abund) %>% as.data.frame() %>% as.matrix()
#write.csv(otu_table, file="otu_table.csv")
#write.csv(tax_table, file="tax_table.csv")
#~/Tools/csv2xls-0.4/csv_to_xls.py otu_table.csv tax_table.csv -d',' -o otu_tax.xls

3.1. Environment Setup: It sets up a Conda environment named picrust2, using the conda create command and then activates this environment using conda activate picrust2.

    #https://github.com/picrust/picrust2/wiki/PICRUSt2-Tutorial-(v2.2.0-beta)#minimum-requirements-to-run-full-tutorial
    conda create -n picrust2 -c bioconda -c conda-forge picrust2=2.2.0_b
    conda activate picrust2

3.2. Data Preparation: The script creates a new directory called picrust2_out, then enters it using mkdir and cd commands. It then identifies input files that are needed for the analysis: metadata.tsv, seqs.fna, table.biom. The biom commands are used to inspect and convert the BIOM format files.

    mkdir picrust2_out
    cd picrust2_out

    # Identifying input data
    # Note: Replace the paths and filenames with your actual data if different
    # metadata.tsv == ../map_corrected.txt
    # seqs.fna     == ../clustering/seqs.fna
    # table.biom   == ../core_diversity_e42369/table_even42369.biom

    # Inspect and convert the BIOM format files
    biom head -i ../core_diversity_e42369/table_even42369.biom
    biom summarize-table -i ../core_diversity_e42369/table_even42369.biom
    biom convert -i ../core_diversity_e42369/table_even42369.biom -o table_even42369.tsv --to-tsv

3.3. Running PiCRUST2: The place_seqs.py command aligns the input sequences to a reference tree. The hsp.py commands generate hidden state prediction for multiple functional categories.

    #insert reads into reference tree using EPA-NG
    cp ../clustering/rep_set.fna ./
    grep ">" rep_set.fna | wc -l  #44238
    vim table_even42369.tsv       #40596-2

    samtools faidx rep_set.fna
    cut -f1-1 table_even42369.tsv > table_even42369.id
    #manually modify table_even42369.id by replacing "\n" with " >> seqs.fna\nsamtools faidx rep_set.fna "
    run table_even42369.id

    rm -rf intermediate/
    place_seqs.py -s seqs.fna -o out.tre -p 64 --intermediate intermediate/place_seqs

    #castor: Efficient Phylogenetics on Large Trees
    #https://github.com/picrust/picrust2/wiki/Hidden-state-prediction

    hsp.py -i 16S -t out.tre -o 16S_predicted_and_nsti.tsv.gz -p 100 -n
    hsp.py -i COG -t out.tre -o COG_predicted.tsv.gz -p 100
    hsp.py -i PFAM -t out.tre -o PFAM_predicted.tsv.gz -p 100
    hsp.py -i KO -t out.tre -o KO_predicted.tsv.gz -p 100
    hsp.py -i EC -t out.tre -o EC_predicted.tsv.gz -p 100
    hsp.py -i TIGRFAM -t out.tre -o TIGRFAM_predicted.tsv.gz -p 100
    hsp.py -i PHENO -t out.tre -o PHENO_predicted.tsv.gz -p 100

>In this table the predicted copy number of all Enzyme Classification (EC) numbers is shown for each ASV. The NSTI values per ASV are not in this table since we did not specify the -n option. EC numbers are a type of gene family defined based on the chemical reactions they catalyze. For instance, EC:1.1.1.1 corresponds to alcohol dehydrogenase. In this tutorial we are focusing on EC numbers since they can be used to infer MetaCyc pathway levels (see below).

    zless -S EC_predicted.tsv.gz
    sequence        EC:1.1.1.1      EC:1.1.1.10     EC:1.1.1.100    ...
    20e568023c10eaac834f1c110aacea18        2       0       3    ...
    23fe12a325dfefcdb23447f43b6b896e        0       0       1    ...
    288c8176059111c4c7fdfb0cd5afce64        1       0       1    ...
    ...

    ##Why import the tsv file to MyData?
    #MyData <- read.csv(file="./COG_predicted.tsv", header=TRUE, sep="\t", row.names=1)   #6806 4598  e.g. COG5665
    #MyData <- read.csv(file="./PFAM_predicted.tsv", header=TRUE, sep="\t", row.names=1)  #6806 11089 e.g. PF17225
    #MyData <- read.csv(file="./KO_predicted.tsv", header=TRUE, sep="\t", row.names=1)    #6806 10543 e.g. K19791
    #MyData <- read.csv(file="./EC_predicted.tsv", header=TRUE, sep="\t", row.names=1)    #6806 2913  e.g. EC.6.6.1.2
    #MyData <- read.csv(file="./16S_predicted.tsv", header=TRUE, sep="\t", row.names=1)   #6806    1     e.g. X16S_rRNA_Count
    #MyData <- read.csv(file="./TIGRFAM_predicted.tsv", header=TRUE, sep="\t", row.names=1)  #6806 4287  e.g. TIGR04571
    #MyData <- read.csv(file="./PHENO_predicted.tsv", header=TRUE, sep="\t", row.names=1)    #6806   41  e.g. Use_of_nitrate_as_electron_acceptor, Xylose_utilizing

3.4. The metagenome_pipeline.py commands perform metagenomic prediction for several functional categories. Predicted gene families weighted by the relative abundance of ASVs in their community. In other words, we are interested in inferring the metagenomes of the communities.

    #Generate metagenome predictions using EC numbers https://en.wikipedia.org/wiki/List_of_enzymes#Category:EC_1.1_(act_on_the_CH-OH_group_of_donors)
    metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f COG_predicted.tsv.gz -o COG_metagenome_out --strat_out
    metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f EC_predicted.tsv.gz -o EC_metagenome_out --strat_out
    metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f KO_predicted.tsv.gz -o KO_metagenome_out --strat_out
    metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f PFAM_predicted.tsv.gz -o PFAM_metagenome_out --strat_out
    metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f TIGRFAM_predicted.tsv.gz -o TIGRFAM_metagenome_out --strat_out

3.5. Pathway-level inference: By default this script infers MetaCyc pathway abundances based on EC number abundances, although different gene families and pathways can also be optionally specified. This script performs a number of steps by default, which are based on the approach implemented in HUMAnN2:

Regroups EC numbers to MetaCyc reactions.
Infers which MetaCyc pathways are present based on these reactions with MinPath.

Calculates and returns the abundance of pathways identified as present.

pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_contrib.tsv.gz -o pathways_out -p 15

#Note that the path of map files is under /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles
pathway_pipeline.py -i COG_metagenome_out/pred_metagenome_contrib.tsv.gz -o KEGG_pathways_out -p 15 --no_regroup --map /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles/KEGG_pathways_to_KO.tsv

#Mapping predicted KO abundances to legacy KEGG pathways (with stratified output that represents contributions to community-wide abundances):
pathway_pipeline.py -i KO_metagenome_out/pred_metagenome_strat.tsv.gz -o KEGG_pathways_out --no_regroup --map /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles/KEGG_pathways_to_KO.tsv

#Map EC numbers to MetaCyc pathways and get stratified output corresponding to contribution of predicted gene family abundances within each predicted genome:
pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -o pathways_out_per_seq --per_sequence_contrib --per_sequence_abun EC_metagenome_out/seqtab_norm.tsv.gz --per_sequence_function EC_predicted.tsv.gz

3.6. Add functional descriptions: Finally, it can be useful to have a description of each functional id in the output abundance tables. The below commands will add these descriptions as new column in gene family and pathway abundance tables

    #--6.1. Add descriptions in gene family tables
    add_descriptions.py -i COG_metagenome_out/pred_metagenome_unstrat.tsv.gz -m COG -o COG_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz
    add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz
    add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz   # EC and METACYC is a pair, EC for gene_annotation and METACYC for pathway_annotation
    add_descriptions.py -i PFAM_metagenome_out/pred_metagenome_unstrat.tsv.gz -m PFAM -o PFAM_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz
    add_descriptions.py -i TIGRFAM_metagenome_out/pred_metagenome_unstrat.tsv.gz -m TIGRFAM -o TIGRFAM_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz

    #--6.2. Add descriptions in pathway abundance tables
    add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC -o pathways_out/path_abun_unstrat_descrip.tsv.gz
    gunzip path_abun_unstrat_descrip.tsv.gz

    #Error - no rows remain after regrouping input table. The default pathway and regroup mapfiles are meant for EC numbers. Note that KEGG pathways are not supported since KEGG is a closed-source database, but you can input custom pathway mapfiles if you have access. If you are using a custom function database did you mean to set the --no-regroup flag and/or change the default pathways mapfile used?
    #If ERROR --> USE the METACYC for downstream analyses!!!

    add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -o KEGG_pathways_out/path_abun_unstrat_descrip.tsv.gz --custom_map_table /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/description_mapfiles/KEGG_pathways_info.tsv.gz

3.7. Visualization

    #7.1 STAMP
    #https://github.com/picrust/picrust2/wiki/STAMP-example
    conda activate stamp; #import path_abun_unstrat_descrip.tsv to STAMP
    conda deactivate
    conda install -c bioconda stamp

    #conda install -c bioconda stamp
    #sudo pip install pyqi
    #sudo apt-get install libblas-dev liblapack-dev gfortran
    #sudo apt-get install freetype* python-pip python-dev python-numpy python-scipy python-matplotlib
    #sudo pip install STAMP
    #conda install -c bioconda stamp

    conda create -n stamp -c bioconda/label/cf201901 stamp
    brew install pyqt

    #DEBUG the environment
    conda install pyqt=4
    #conda install icu=56

    e.g. path_abun_unstrat_descrip.tsv.gz and metadata.tsv from the tutorial)
    cut -d$'\t' -f1 map_corrected.txt > 1
    cut -d$'\t' -f5 map_corrected.txt > 5
    cut -d$'\t' -f6 map_corrected.txt > 6
    paste -d$'\t' 1 5 > 1_5
    paste -d$'\t' 1_5 6 > metadata.tsv
    #SampleID --> SampleID
    SampleID    Facility    Genotype
    100CHE6KO   PaloAlto    KO
    101CHE6WT   PaloAlto    WT

    #7.2. ALDEx2
    https://bioconductor.org/packages/release/bioc/html/ALDEx2.html

    #7.3. Convert png to svg and pdf
    inkscape error_bar.png --export-plain-svg=error_bar.svg (embbed)

    sudo apt update
    sudo apt install autotrace

    sudo apt-get install -y libpng-dev libtiff-dev imagemagick
    git clone https://github.com/autotrace/autotrace.git

    cd autotrace
    #sudo apt install intltool
    #sudo apt install gettext libglib2.0-dev
    #sudo apt install libtool libtool-bin
    #sudo apt install automake
    sudo apt-get install libxml-parser-perl
    ./autogen.sh
    ./configure
    make

    autotrace -output-format svg -output-file error_bar.svg error_bar.png

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Phyloseq + MicrobiotaProcess + PICRUSt2

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章