Updated Step 5: PNG Figures + Complete Excel Exports
Prerequisites Update
Add openxlsx to your package list (for Excel export):
# Install if needed
install.packages(c("phyloseq", "ggplot2", "vegan", "dplyr",
"tidyr", "pheatmap", "openxlsx"))
And load it:
library(openxlsx)
Complete Step 5 — Replace your existing Step 5 with this
# =============================================================================
# STEP 5 — Visualisations (PNG) + Complete Excel exports
# =============================================================================
dir.create("figures", showWarnings = FALSE)
dir.create("tables", showWarnings = FALSE)
# Helper: safe log2 fold change (adds pseudocount to avoid log(0))
safe_log2fc <- function(x, y, pseudo = 1e-6) {
log2((x + pseudo) / (y + pseudo))
}
# =============================================================================
# 5a. Top-N species stacked bar plot (Loc1 vs Loc4)
# =============================================================================
top_n_sp <- 20
top_species <- names(sort(rowMeans(otu_table(species_ps)),
decreasing = TRUE))[1:top_n_sp]
ps_top <- prune_taxa(top_species, species_ps)
df_species <- psmelt(ps_top) %>%
mutate(Species = factor(Species, levels = rev(top_species)))
p_species <- ggplot(df_species, aes(x = Location, y = Abundance, fill = Species)) +
geom_bar(stat = "identity", position = "fill", width = 0.6) +
coord_flip() +
scale_fill_viridis_d(option = "D") +
labs(title = paste("Top", top_n_sp, "Species by Location"),
x = "Location", y = "Relative Abundance", fill = "Species") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 7))
ggsave("figures/species_top20_barplot.png", p_species,
width = 10, height = 8, dpi = 300, bg = "white")
cat("✅ Saved: figures/species_top20_barplot.png\n")
# =============================================================================
# 5b. Species heatmap (all species, row-scaled)
# =============================================================================
otu_mat <- as.matrix(otu_table(species_ps))
# Filter species with very low abundance (max < 0.01% across both samples)
keep <- apply(otu_mat, 1, max) > 0.0001
otu_filt <- otu_mat[keep, , drop = FALSE]
# Annotation column
ann_col <- data.frame(Location = metadata[colnames(otu_filt), "Location"],
row.names = colnames(otu_filt))
# Write heatmap to PNG
png("figures/species_heatmap.png",
width = 10, height = max(8, 0.25 * nrow(otu_filt) + 2),
units = "in", res = 300)
pheatmap(otu_filt,
scale = "row",
clustering_distance_rows = "euclidean",
clustering_method = "complete",
annotation_col = ann_col,
main = "Species Abundance Heatmap (row-scaled)",
fontsize_row = 6,
fontsize_col = 10,
show_rownames = nrow(otu_filt) <= 80)
dev.off()
cat("✅ Saved: figures/species_heatmap.png\n")
# =============================================================================
# 5c. Top pathways stacked bar plot
# =============================================================================
top_n_pw <- 20
top_pw_names <- names(sort(rowMeans(otu_table(pathway_ps)),
decreasing = TRUE))[1:top_n_pw]
ps_pw_top <- prune_taxa(top_pw_names, pathway_ps)
df_pw <- psmelt(ps_pw_top) %>%
mutate(Pathway = factor(Pathway, levels = rev(top_pw_names)))
p_pw <- ggplot(df_pw, aes(x = Location, y = Abundance, fill = Pathway)) +
geom_bar(stat = "identity", position = "fill", width = 0.6) +
coord_flip() +
scale_fill_viridis_d(option = "C") +
labs(title = paste("Top", top_n_pw, "HUMAnN Pathways by Location"),
x = "Location", y = "Relative Abundance", fill = "Pathway") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 7))
ggsave("figures/pathways_top20_barplot.png", p_pw,
width = 10, height = 8, dpi = 300, bg = "white")
cat("✅ Saved: figures/pathways_top20_barplot.png\n")
# =============================================================================
# 5d. Pathway dot plot (Loc1 vs Loc4)
# =============================================================================
df_pw_wide <- as.data.frame(otu_table(pathway_ps)) %>%
rownames_to_column("Pathway") %>%
filter(Pathway %in% top_pw_names) %>%
pivot_longer(-Pathway, names_to = "Sample", values_to = "Abundance") %>%
left_join(data.frame(Sample = rownames(metadata_clean),
Location = metadata_clean$Location,
stringsAsFactors = FALSE),
by = "Sample")
p_dot <- ggplot(df_pw_wide, aes(x = Location, y = Pathway,
size = Abundance, color = Abundance)) +
geom_point() +
scale_color_viridis_c() +
labs(title = "Pathway Abundance: Loc1 vs Loc4",
x = "Location", y = "Pathway") +
theme_minimal(base_size = 11) +
theme(axis.text.y = element_text(size = 7))
ggsave("figures/pathway_dotplot.png", p_dot,
width = 8, height = 10, dpi = 300, bg = "white")
cat("✅ Saved: figures/pathway_dotplot.png\n")
# =============================================================================
# 5e. COMPLETE species list → Excel
# =============================================================================
# Build full species table (ALL species, no cutoff)
sp_full <- as.data.frame(otu_table(species_ps)) %>%
rownames_to_column("Species")
# Ensure columns exist (defensive)
if (!all(c("Soil_Loc1", "Soil_Loc4") %in% colnames(sp_full))) {
stop("⚠️ Expected columns 'Soil_Loc1' and 'Soil_Loc4' not found in species OTU table.")
}
sp_full <- sp_full %>%
mutate(
Abundance_Loc1 = Soil_Loc1,
Abundance_Loc4 = Soil_Loc4,
Diff_Loc4_minus_Loc1 = Soil_Loc4 - Soil_Loc1,
Log2FC_Loc4_vs_Loc1 = safe_log2fc(Soil_Loc4, Soil_Loc1),
Present_in_Loc1 = Soil_Loc1 > 0,
Present_in_Loc4 = Soil_Loc4 > 0,
Total_Abundance = Soil_Loc1 + Soil_Loc4,
Mean_Abundance = (Soil_Loc1 + Soil_Loc4) / 2
) %>%
select(Species,
Abundance_Loc1, Abundance_Loc4,
Diff_Loc4_minus_Loc1, Log2FC_Loc4_vs_Loc1,
Present_in_Loc1, Present_in_Loc4,
Total_Abundance, Mean_Abundance) %>%
arrange(desc(abs(Diff_Loc4_minus_Loc1)))
# Write multi-sheet Excel workbook for species
sp_wb <- createWorkbook()
addWorksheet(sp_wb, "All_Species")
addWorksheet(sp_wb, "Top50_by_Diff")
addWorksheet(sp_wb, "Top50_by_Abundance")
addWorksheet(sp_wb, "Loc1_only")
addWorksheet(sp_wb, "Loc4_only")
addWorksheet(sp_wb, "Shared")
writeData(sp_wb, "All_Species", sp_full)
writeData(sp_wb, "Top50_by_Diff", head(sp_full, 50))
writeData(sp_wb, "Top50_by_Abundance",
sp_full %>% arrange(desc(Mean_Abundance)) %>% head(50))
writeData(sp_wb, "Loc1_only",
sp_full %>% filter(Present_in_Loc1 & !Present_in_Loc4))
writeData(sp_wb, "Loc4_only",
sp_full %>% filter(!Present_in_Loc1 & Present_in_Loc4))
writeData(sp_wb, "Shared",
sp_full %>% filter(Present_in_Loc1 & Present_in_Loc4))
# Formatting
header_style <- createStyle(textDecoration = "bold", bgFill = "#D3D3D3")
for (sh in c("All_Species","Top50_by_Diff","Top50_by_Abundance",
"Loc1_only","Loc4_only","Shared")) {
addStyle(sp_wb, sh, style = header_style, rows = 1, gridExpand = TRUE)
setColWidths(sp_wb, sh, cols = 1:ncol(sp_full), widths = "auto")
freezePane(sp_wb, sh, firstRow = TRUE)
}
saveWorkbook(sp_wb, "tables/species_Loc1_vs_Loc4.xlsx", overwrite = TRUE)
cat(sprintf("✅ Saved: tables/species_Loc1_vs_Loc4.xlsx (%d species total)\n",
nrow(sp_full)))
# =============================================================================
# 5f. COMPLETE pathway list → Excel
# =============================================================================
pw_full <- as.data.frame(otu_table(pathway_ps)) %>%
rownames_to_column("Pathway")
if (!all(c("Soil_Loc1", "Soil_Loc4") %in% colnames(pw_full))) {
stop("⚠️ Expected columns 'Soil_Loc1' and 'Soil_Loc4' not found in pathway OTU table.")
}
pw_full <- pw_full %>%
mutate(
Abundance_Loc1 = Soil_Loc1,
Abundance_Loc4 = Soil_Loc4,
Diff_Loc4_minus_Loc1 = Soil_Loc4 - Soil_Loc1,
Log2FC_Loc4_vs_Loc1 = safe_log2fc(Soil_Loc4, Soil_Loc1),
Present_in_Loc1 = Soil_Loc1 > 0,
Present_in_Loc4 = Soil_Loc4 > 0,
Total_Abundance = Soil_Loc1 + Soil_Loc4,
Mean_Abundance = (Soil_Loc1 + Soil_Loc4) / 2
) %>%
select(Pathway,
Abundance_Loc1, Abundance_Loc4,
Diff_Loc4_minus_Loc1, Log2FC_Loc4_vs_Loc1,
Present_in_Loc1, Present_in_Loc4,
Total_Abundance, Mean_Abundance) %>%
arrange(desc(abs(Diff_Loc4_minus_Loc1)))
# Write multi-sheet Excel workbook for pathways
pw_wb <- createWorkbook()
addWorksheet(pw_wb, "All_Pathways")
addWorksheet(pw_wb, "Top50_by_Diff")
addWorksheet(pw_wb, "Top50_by_Abundance")
addWorksheet(pw_wb, "Loc1_only")
addWorksheet(pw_wb, "Loc4_only")
addWorksheet(pw_wb, "Shared")
writeData(pw_wb, "All_Pathways", pw_full)
writeData(pw_wb, "Top50_by_Diff", head(pw_full, 50))
writeData(pw_wb, "Top50_by_Abundance",
pw_full %>% arrange(desc(Mean_Abundance)) %>% head(50))
writeData(pw_wb, "Loc1_only",
pw_full %>% filter(Present_in_Loc1 & !Present_in_Loc4))
writeData(pw_wb, "Loc4_only",
pw_full %>% filter(!Present_in_Loc1 & Present_in_Loc4))
writeData(pw_wb, "Shared",
pw_full %>% filter(Present_in_Loc1 & Present_in_Loc4))
for (sh in c("All_Pathways","Top50_by_Diff","Top50_by_Abundance",
"Loc1_only","Loc4_only","Shared")) {
addStyle(pw_wb, sh, style = header_style, rows = 1, gridExpand = TRUE)
setColWidths(pw_wb, sh, cols = 1:ncol(pw_full), widths = "auto")
freezePane(pw_wb, sh, firstRow = TRUE)
}
saveWorkbook(pw_wb, "tables/pathways_Loc1_vs_Loc4.xlsx", overwrite = TRUE)
cat(sprintf("✅ Saved: tables/pathways_Loc1_vs_Loc4.xlsx (%d pathways total)\n",
nrow(pw_full)))
# =============================================================================
# Summary
# =============================================================================
cat("\n========================================\n")
cat("STEP 5 COMPLETE\n")
cat("========================================\n")
cat("Figures (PNG, 300 dpi):\n")
cat(" • figures/species_top20_barplot.png\n")
cat(" • figures/species_heatmap.png\n")
cat(" • figures/pathways_top20_barplot.png\n")
cat(" • figures/pathway_dotplot.png\n")
cat("\nExcel tables (complete lists, no cutoff):\n")
cat(sprintf(" • tables/species_Loc1_vs_Loc4.xlsx (%d species)\n", nrow(sp_full)))
cat(sprintf(" • tables/pathways_Loc1_vs_Loc4.xlsx (%d pathways)\n", nrow(pw_full)))
cat("========================================\n")
What You Get
📊 Figures (PNG, 300 dpi, publication-ready)
| File | Content | Size |
|---|---|---|
species_top20_barplot.png |
Top 20 species stacked bar | 10 × 8 in |
species_heatmap.png |
All species above 0.01% threshold, row-scaled | 10 × auto (scales with # species) |
pathways_top20_barplot.png |
Top 20 pathways stacked bar | 10 × 8 in |
pathway_dotplot.png |
Top 20 pathways dot plot | 8 × 10 in |
📑 Excel Files (complete lists, no cutoff)
Each workbook contains 6 sheets:
| Sheet | Content | ||
|---|---|---|---|
All_Species / All_Pathways |
Every detected feature, sorted by absolute difference | ||
Top50_by_Diff |
Top 50 features by | Loc4 − Loc1 | |
Top50_by_Abundance |
Top 50 features by mean abundance | ||
Loc1_only |
Features detected only in Loc1 | ||
Loc4_only |
Features detected only in Loc4 | ||
Shared |
Features detected in both locations |
📋 Columns in each Excel file
| Column | Meaning |
|---|---|
Species / Pathway |
Feature name |
Abundance_Loc1 |
Raw relative abundance in Loc1 |
Abundance_Loc4 |
Raw relative abundance in Loc4 |
Diff_Loc4_minus_Loc1 |
Absolute difference (Loc4 − Loc1) |
Log2FC_Loc4_vs_Loc1 |
Log₂ fold change (with pseudocount 1e-6 to handle zeros) |
Present_in_Loc1 |
TRUE/FALSE |
Present_in_Loc4 |
TRUE/FALSE |
Total_Abundance |
Sum across both samples |
Mean_Abundance |
Mean across both samples |
Notes
-
Log₂ fold change: Uses a small pseudocount (
1e-6) to avoidlog(0). For features absent in one location, this gives a large but finite fold change — interpret these as “presence/absence” rather than true fold change. -
Heatmap filtering: Species with maximum abundance < 0.01% across both samples are excluded to keep the heatmap readable. Adjust the threshold in
keep <- apply(otu_mat, 1, max) > 0.0001if needed. -
Excel formatting: Headers are bold with grey background, columns auto-sized, and the first row is frozen for easy scrolling.
-
File locations: All outputs go into
figures/andtables/subfolders inside your current working directory (reports/).