1 Analysis

The phyloseq R package provides a robust framework for downstream microbiome analysis and visualization, and the scripts used here were adapted from the phyloseq tutorial.
Results and visualizations are reported based on SILVA for consistency; however, taxonomy assignments from the other databases may be loaded to reproduce the same workflow using the scripts provided below.

1.1 Run Docker

Place seqtab_nochim.RDS and DADA2_taxonomy_matrix_SILVA.rds, which we generated in the previous session, in the ~/Desktop/MiSeq_SOP folder.
If you don’t have them, you can download them: seqtab_nochim.RDS and DADA2_taxonomy_matrix_SILVA.rds
Open a terminal and run the command to start the Microbiome Docker container

$ docker run -it -v ~/Desktop/MiSeq_SOP:/home/hub1 bioinfohub/microbiome R

1.2 Load R packages

library(dada2)
library(phyloseq)
library(dplyr)
library(tidyr)
library(tibble)
library(htmlwidgets)
library(htmltools)
library(sankeyD3)
library(Biostrings)
library(ggplot2)
library(data.table)

1.3 Directory settings

You should see both seqtab_nochim.RDS and DADA2_taxonomy_matrix_SILVA.rds files

# Folder inside the container that the `docker run -v` command maps to
# ~/Desktop/MiSeq_SOP; inputs are read from here and outputs are written here.
base_dir <- file.path("/home", "hub1")
# List the folder contents to confirm that both input files are present.
list.files(base_dir)

1.4 Import ASV table

# Load the DADA2 ASV count table unless it is already in the workspace.
# The file is distributed as "seqtab_nochim.RDS" (upper-case extension) and
# file names are typically case-sensitive inside the Linux container, so the
# name is matched case-insensitively instead of hard-coding ".rds".
if (!exists("seqtab_nochim")) {
        seqtab_file <- list.files(base_dir,
                pattern = "^seqtab_nochim\\.rds$",
                ignore.case = TRUE, full.names = TRUE
        )
        if (length(seqtab_file) == 0) {
                stop("seqtab_nochim.RDS not found in ", base_dir, call. = FALSE)
        }
        seqtab_nochim <- readRDS(seqtab_file[1])
}
# Load the SILVA taxonomy assignments unless they are already in the workspace.
if (!exists("tax_mat")) {
        tax_mat <- readRDS(file.path(base_dir, "DADA2_taxonomy_matrix_SILVA.rds"))
}
# Rows x columns of the taxonomy matrix (ASVs x taxonomic ranks).
dim(tax_mat)

## [1] 196   7

1.5 Format conversion

First, the DADA2 output objects need to be converted into a phyloseq object.

# Derive per-sample metadata from the sample names (row names of the ASV table).
# NOTE(review): names are assumed to look like "<Sex><SubjectNo>D<Day>",
# e.g. "F3D141" -> Sex "F", Subject "3", Day 141 — confirm against the data.
sample_ids <- rownames(seqtab_nochim)
# Split each name once at "D"; vapply() guarantees a character result even for
# empty or unexpected input, unlike sapply().
id_parts <- strsplit(sample_ids, "D", fixed = TRUE)
subject_raw <- vapply(id_parts, `[`, character(1), 1)
sex <- substr(subject_raw, 1, 1)
day <- as.integer(vapply(id_parts, `[`, character(1), 2))

sample_df <- data.frame(
        # Subject identifier without the leading sex letter. The full sample ID
        # is already kept as the row name, so repeating it here would make
        # every "subject" unique and the column useless for grouping.
        Subject = substr(subject_raw, 2, nchar(subject_raw)),
        Sex = factor(sex, levels = c("M", "F")),
        Day = day
)
# Samples taken after day 100 are "Late" (L); all others are "Early" (E).
sample_df$Group <- factor(ifelse(sample_df$Day > 100, "Late", "Early"),
        levels = c("Early", "Late"),
        labels = c("E", "L")
)
# Combined Group x Sex label, e.g. "E_M" = Early + Male.
sample_df$Category <- factor(paste0(sample_df$Group, "_", sample_df$Sex),
        levels = c("E_M", "E_F", "L_M", "L_F")
)
# Row names must match the sample names of the ASV table for phyloseq to join.
rownames(sample_df) <- sample_ids

# Samples are rows in the DADA2 table, hence taxa_are_rows = FALSE.
phylo_obj <- phyloseq(
        otu_table(seqtab_nochim, taxa_are_rows = FALSE),
        sample_data(sample_df),
        tax_table(tax_mat)
)

# The taxa names are the ASV sequences themselves: keep them as reference
# sequences in the object, then replace the names with short IDs (ASV001, ...).
dna <- DNAStringSet(taxa_names(phylo_obj))
names(dna) <- taxa_names(phylo_obj)
phylo_obj <- merge_phyloseq(phylo_obj, dna)
taxa_names(phylo_obj) <- sprintf("ASV%03d", seq_len(ntaxa(phylo_obj)))
# Save the assembled object so later sessions can start from it.
saveRDS(phylo_obj, file.path(base_dir, "DADA2_phyloseq_obj.rds"))
phylo_obj

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 196 taxa and 12 samples ]
## sample_data() Sample Data:       [ 12 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 196 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 196 reference sequences ]

1.6 Alpha diversity over time

Visualize within-sample diversity using Shannon and Simpson indices across sampling Day, with points colored by Group (Early/Late) and shaped by Sex (Male/Female).

1.6.1 Group highlighted by color

# Colours keyed by the Group factor labels; the beta-diversity plot reuses them.
group_cols <- c("E" = "#009988", "L" = "#EE7733") # teal and orange
# Shannon and Simpson indices against sampling day: colour = Group, shape = Sex.
rich_plot <- plot_richness(
        phylo_obj,
        x = "Day",
        measures = c("Shannon", "Simpson"),
        color = "Group",
        shape = "Sex"
)
# Add a size-3 point layer, the manual Group colours and the black-and-white theme.
rich_plot <- rich_plot +
        geom_point(size = 3) +
        scale_color_manual(values = group_cols) +
        theme_bw()
# Write the figure to a 6 x 6 inch PDF in the shared folder.
pdf(
        file = file.path(base_dir, "alpha_diversity_by_group.pdf"),
        width = 6,
        height = 6
)
print(rich_plot)
dev.off()

1.6.2 Sex highlighted by color

# Colours keyed by the Sex factor levels.
sex_cols <- c("M" = "#0077BB", "F" = "#BB5566") # blue and red
# Same richness plot as before with the aesthetics swapped:
# colour = Sex, shape = Group.
rich_plot <- plot_richness(
        phylo_obj,
        x = "Day",
        measures = c("Shannon", "Simpson"),
        color = "Sex",
        shape = "Group"
)
# Add a size-3 point layer, the manual Sex colours and the black-and-white theme.
rich_plot <- rich_plot +
        geom_point(size = 3) +
        scale_color_manual(values = sex_cols) +
        theme_bw()
# Write the figure to a 6 x 6 inch PDF in the shared folder.
pdf(
        file = file.path(base_dir, "alpha_diversity_by_sex.pdf"),
        width = 6,
        height = 6
)
print(rich_plot)
dev.off()

1.7 Beta diversity

Assess between-sample community differences using Bray–Curtis distances on relative-abundance data and visualize them with NMDS, with points colored by Group (Early/Late) and shaped by Sex (Male/Female)

# Drop samples without any reads so the per-sample proportions below never
# divide by zero.
ps_nz <- prune_samples(sample_sums(phylo_obj) > 0, phylo_obj)
# Rebuild the count table as a samples-x-taxa matrix with every non-finite
# entry (NA, NaN, Inf) replaced by 0.
otu <- as(otu_table(ps_nz), "matrix")
if (taxa_are_rows(ps_nz)) otu <- t(otu)
otu[!is.finite(otu)] <- 0
otu_table(ps_nz) <- otu_table(otu, taxa_are_rows = FALSE)

# Convert counts to within-sample relative abundances.
ps_prop <- transform_sample_counts(ps_nz, function(x) x / sum(x))
# NMDS starts from random configurations; fix the seed so the ordination and
# the saved figure are identical between runs.
set.seed(100)
nmds_bray <- ordinate(ps_prop, method = "NMDS", distance = "bray", trace = 0)

# Ordination plot: colour = Group (same palette as the alpha-diversity plot),
# shape = Sex.
ordi_plot <- plot_ordination(ps_prop, nmds_bray, color = "Group", shape = "Sex") +
        theme_bw() +
        scale_color_manual(values = group_cols)
pdf(file.path(base_dir, "beta_diversity.pdf"), width = 6, height = 6)
print(ordi_plot)
dev.off()

1.8 Taxonomic relative abundance

Select the top 20 most abundant ASVs and plot their relative abundance as stacked bar charts, aggregated and colored by Family, with one bar per Category (Group: Early/Late × Sex: Male/Female) averaged across the sampling days in that category.

# Names of the (up to) 20 ASVs with the largest total counts. head() returns
# fewer names when the object has fewer than 20 taxa, whereas [1:20] would pad
# the vector with NA.
top <- head(names(sort(taxa_sums(phylo_obj), decreasing = TRUE)), 20)

# Convert to relative abundance using ALL taxa first, then keep only the top
# ASVs, so the proportions are relative to the whole community.
ps_top <- transform_sample_counts(phylo_obj, function(OTU) OTU / sum(OTU))
ps_top <- prune_taxa(top, ps_top)

# Long format: one row per sample x ASV; unassigned families get a label.
df_plot <- psmelt(ps_top)
df_plot$Family <- as.character(df_plot$Family)
df_plot$Family[is.na(df_plot$Family) | df_plot$Family == ""] <- "Unassigned"

# Average abundance per Category x Family, then rescale so that the families
# within each Category sum to 1.
# NOTE(review): mean() averages over every sample x ASV row, so a family made
# up of several ASVs is represented by its mean per-ASV abundance rather than
# its summed abundance — confirm this is the intended aggregation.
dt <- as.data.table(df_plot)
dt_sum <- dt[, .(Abundance = mean(Abundance, na.rm = TRUE)), by = .(Category, Family)]
dt_sum[, Abundance := Abundance / sum(Abundance), by = Category]

# Stacked horizontal bars, one per Category.
# The 0-100% range is set on the coordinate system rather than on the y scale:
# scale limits censor any stacked segment whose top exceeds 1 by floating-point
# error, which silently removes it, whereas coordinate limits only zoom.
# NOTE(review): the "Set2" palette provides at most 8 colours; if the top ASVs
# span more than 8 families, the extra ones will not receive a fill colour.
top20_taxa <- ggplot(dt_sum, aes(x = Category, y = Abundance, fill = Family)) +
        geom_col(color = NA) +
        scale_y_continuous(labels = scales::percent) +
        scale_fill_brewer(palette = "Set2", drop = FALSE) +
        theme_bw() +
        theme(
                legend.position = "bottom",
                axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)
        ) +
        coord_flip(ylim = c(0, 1))

pdf(file.path(base_dir, "top20_taxa_by_group.pdf"), width = 8, height = 5)
print(top20_taxa)
dev.off()

1.9 Taxonomic lineage profiles by category

Taxonomic composition is summarized as lineage-level flows (i.e., a Sankey chart) from higher to lower ranks for each category, enabling rapid comparison of dominant clades and where group-specific differences emerge along the taxonomic hierarchy.

# Load the Sankey helper script. build_sankey_data() and render_one_sankey()
# used below are not defined in this document, so presumably they come from
# this file — verify that the path exists inside the container.
source("/home/rstudio/R/generate_sankey_plot.R")
# Sample metadata as a plain data frame, with the sample names copied from the
# row names into an explicit Sample column.
meta_df <- data.frame(phyloseq::sample_data(phylo_obj), stringsAsFactors = FALSE)
meta_df$Sample <- rownames(meta_df)
# Column names of the taxonomy table. Not referenced again in this document;
# presumably read by the sourced helpers — TODO confirm before removing.
tt_cols <- colnames(phyloseq::tax_table(phylo_obj))

# E_M (Early + Male), E_F (Early + Female), L_M (Late + Male), L_F (Late + Female)
# Change this value and re-run the lines below to produce the other categories.
cat_value <- "E_M"
        
# Build the Sankey input for the selected category.
dat <- build_sankey_data(ps = phylo_obj, meta_df = meta_df, category_value = cat_value)
# %d needs whole-number values; assumes dat$nsamples, dat$ntaxa and
# dat$total_reads are integer-valued — TODO confirm.
title_text <- sprintf("%s(#Samples=%d, #Taxa=%d, #Reads=%d)", cat_value, dat$nsamples, dat$ntaxa, dat$total_reads)
w <- render_one_sankey(dat, title_text)

# Save the widget as sankey_<category>.html in the shared folder.
htmlwidgets::saveWidget(w, file = file.path(base_dir, paste0("sankey_", cat_value, ".html")))

2 SessionInfo

─ Session info ────────────────────────────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31)
 os       Ubuntu 24.04.3 LTS
 system   aarch64, linux-gnu
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 
─ Packages ────────────────────────────────────────────────────────────────────────────────────────
 package      * version  date (UTC) lib source
 BiocGenerics * 0.54.1   2025-10-12 [1] Bioconductor 3.21 (R 4.5.2)
 Biostrings   * 2.76.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)
 dada2        * 1.36.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)
 data.table   * 1.18.2.1 2026-01-27 [1] RSPM (R 4.5.0)
 dplyr        * 1.2.0    2026-02-03 [1] RSPM (R 4.5.0)
 generics     * 0.1.4    2025-05-09 [1] RSPM (R 4.5.0)
 GenomeInfoDb * 1.44.3   2025-09-21 [1] Bioconductor 3.21 (R 4.5.2)
 ggplot2      * 4.0.2    2026-02-03 [1] RSPM (R 4.5.0)
 htmltools    * 0.5.9    2025-12-04 [1] RSPM (R 4.5.0)
 htmlwidgets  * 1.6.4    2023-12-06 [1] RSPM (R 4.5.0)
 IRanges      * 2.42.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)
 phyloseq     * 1.52.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)
 Rcpp         * 1.1.1    2026-01-10 [1] RSPM (R 4.5.0)
 S4Vectors    * 0.46.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)
 sankeyD3     * 0.3.2    2026-02-15 [1] Github (fbreitwieser/sankeyD3@fd50a74)
 tibble       * 3.3.1    2026-01-11 [1] RSPM (R 4.5.0)
 tidyr        * 1.3.2    2025-12-19 [1] RSPM (R 4.5.0)
 XVector      * 0.48.0   2025-04-15 [1] Bioconductor 3.21 (R 4.5.2)

 [1] /usr/local/lib/R/site-library
 [2] /usr/local/lib/R/library
 * ── Packages attached to the search path.

───────────────────────────────────────────────────────────────────────────────────────────────────

Microbiome data analysis and visualization.

Heewon Seo

March 24, 2026

1 Analysis

1.1 Run Docker

1.2 Load R packages

1.3 Directory settings

1.4 Import ASV table

1.5 Format conversion

1.6 Alpha diversity over time

1.6.1 Group highlighted by color

1.6.2 Sex highlighted by color

1.7 Beta diversity

1.8 Taxonomic relative abundance

1.9 Taxonomic lineage profiles by category

2 SessionInfo