Daily Archives: 2025年12月5日

Pipeline to Create Circos DE Plots from DESeq2 Output (Data_JuliaFuchs_RNAseq_2025)

    #Set the comparison condition in the R-script
    comparison <- "Moxi_18h_vs_Untreated_18h"
    # e.g.
    # comparison <- "Mitomycin_18h_vs_Untreated_18h"
    # comparison <- "Mitomycin_8h_vs_Untreated_8h"
    # comparison <- "Moxi_8h_vs_Untreated_8h"
    # comparison <- "Mitomycin_4h_vs_Untreated_4h"
    # comparison <- "Moxi_4h_vs_Untreated_4h"
    (r_env) Rscript make_circos_from_deseq.R

    (circos-env) cd circos_Moxi_18h_vs_Untreated_18h
    (circos-env) touch karyotype.txt (see the step2)
    (circos-env) touch circos.conf (see the step3)
    cp circos_Moxi_18h_vs_Untreated_18h/karyotype.txt circos_Moxi_8h_vs_Untreated_8h/
    cp circos_Moxi_18h_vs_Untreated_18h/karyotype.txt circos_Moxi_4h_vs_Untreated_4h/
    cp circos_Moxi_18h_vs_Untreated_18h/karyotype.txt circos_Mitomycin_18h_vs_Untreated_18h/
    cp circos_Moxi_18h_vs_Untreated_18h/karyotype.txt circos_Mitomycin_8h_vs_Untreated_8h/
    cp circos_Moxi_18h_vs_Untreated_18h/karyotype.txt circos_Mitomycin_4h_vs_Untreated_4h/
    cp circos_Moxi_18h_vs_Untreated_18h/circos.conf circos_Moxi_8h_vs_Untreated_8h/
    cp circos_Moxi_18h_vs_Untreated_18h/circos.conf circos_Moxi_4h_vs_Untreated_4h/
    cp circos_Moxi_18h_vs_Untreated_18h/circos.conf circos_Mitomycin_18h_vs_Untreated_18h/
    cp circos_Moxi_18h_vs_Untreated_18h/circos.conf circos_Mitomycin_8h_vs_Untreated_8h/
    cp circos_Moxi_18h_vs_Untreated_18h/circos.conf circos_Mitomycin_4h_vs_Untreated_4h/

    (circos-env) circos -conf circos.conf
    #or (circos-env) /home/jhuang/mambaforge/envs/circos-env/bin/circos  -conf /mnt/md1/DATA/Data_JuliaFuchs_RNAseq_2025/results/star_salmon/degenes/circos/circos.conf

    #(circos-env) jhuang@WS-2290C:/mnt/md1/DATA/Data_JuliaFuchs_RNAseq_2025/results/star_salmon/degenes$ find . -name "*circos.png"
    mv ./circos_Moxi_4h_vs_Untreated_4h/circos.png circos_Moxi_4h_vs_Untreated_4h.png
    mv ./circos_Moxi_18h_vs_Untreated_18h/circos.png circos_Moxi_18h_vs_Untreated_18h.png
    mv ./circos_Moxi_8h_vs_Untreated_8h/circos.png circos_Moxi_8h_vs_Untreated_8h.png
    mv ./circos_Mitomycin_18h_vs_Untreated_18h/circos.png circos_Mitomycin_18h_vs_Untreated_18h.png
    mv ./circos_Mitomycin_8h_vs_Untreated_8h/circos.png circos_Mitomycin_8h_vs_Untreated_8h.png
    mv ./circos_Mitomycin_4h_vs_Untreated_4h/circos.png circos_Mitomycin_4h_vs_Untreated_4h.png

touch karyotype.txt

    chr - CP052959.1 CP052959.1 0 2706926 grey

touch circos.conf

    karyotype = karyotype.txt

<ideogram>

<spacing>
    default = 0.005r
    </spacing>

    radius       = 0.80r
    thickness    = 20p
    fill         = yes

    show_label   = yes
    label_radius = 1.05r
    label_size   = 30p
    label_font   = bold
    label_parallel = yes
    </ideogram>

    # --- Wichtig: Schalter auf Top-Level, NICHT im 
<ticks>-Block ---
    show_ticks       = yes
    show_tick_labels = yes

<ticks>
    # Starte direkt an der äußeren Ideogramm-Kante
    radius      = dims(ideogram,radius_outer)
    orientation = out          # Ticks nach außen zeichnen (oder 'in' für nach innen)
    color       = black
    thickness   = 2p
    size        = 8p

    # kleine Ticks alle 100 kb, ohne Label

<tick>
    spacing      = 50000
    size         = 8p
    thickness    = 3p
    color        = black
    show_label   = no
    </tick>

    # große Ticks alle 500 kb, mit Label in Mb

<tick>
    spacing      = 100000
    size         = 12p
    thickness    = 5p
    color        = black

    show_label   = yes
    label_size   = 18p
    label_offset = 6p
    multiplier   = 0.000001     # in Mb
    format       = %.1f
    suffix       = " Mb"
    </tick>

    </ticks>

<plots>

    # Density of up-regulated genes

<plot>
    show      = yes
    type      = histogram
    file      = data/density_up.txt
    r0        = 0.88r
    r1        = 0.78r
    fill_color = red
    thickness = 1p
    </plot>

    # Density of down-regulated genes

<plot>
    show      = yes
    type      = histogram
    file      = data/density_down.txt
    r0        = 0.78r
    r1        = 0.68r
    fill_color = blue
    thickness = 1p
    </plot>

    # Scatter of individual significantly DE genes

<plot>
    show           = yes
    type           = scatter
    file           = data/genes_scatter.txt
    r0             = 0.46r
    r1             = 0.76r
    glyph          = circle
    glyph_size     = 5p
    stroke_thickness = 0

    min            = -15
    max            =  15

<rules>

<rule>
    condition = var(value) > 0
    color     = red
    </rule>

<rule>
    condition = var(value) < 0
    color     = blue
    </rule>
    </rules>

    </plot>

    </plots>

<image>
    <<include etc/image.conf>>
    </image>

    <<include etc/colors_fonts_patterns.conf>>
    <<include etc/housekeeping.conf>>

Rscript make_circos_from_deseq.R

 ############################################################
 # make_circos_from_deseq.R
 #
 # - Read DESeq2 results (annotated CSV)
 # - Read BED annotation
 # - Merge by GeneID
 # - Classify genes (up / down / ns)
 # - Create Circos input files:
 #     * genes_scatter.txt
 #     * density_up.txt
 #     * density_down.txt
 # - Create annotated versions of the above
 # - Export annotated tables into one Excel workbook
 ############################################################

 ###############################
 # 0. Single parameter to set
 ###############################

 # Just change this line for each comparison:
 #comparison <- "Moxi_18h_vs_Untreated_18h"
 # e.g.
 #comparison <- "Mitomycin_18h_vs_Untreated_18h"
 #comparison <- "Mitomycin_8h_vs_Untreated_8h"
 #comparison <- "Moxi_8h_vs_Untreated_8h"
 #comparison <- "Mitomycin_4h_vs_Untreated_4h"
 comparison <- "Moxi_4h_vs_Untreated_4h"

 ###############################
 # 0. Derived settings
 ###############################

 # DESeq2 result file (annotated)
 deseq_file <- paste0(comparison, "-all_annotated.csv")

 # BED file with gene coordinates (constant for this genome)
 bed_file   <- "CP052959_m.bed"

 # Create a filesystem-friendly directory name from comparison
 safe_comp  <- gsub("[^A-Za-z0-9_]+", "_", comparison)

 # Output directory for Circos files (e.g. circos_Moxi_18h_vs_Untreated_18h)
 out_dir    <- paste0("circos_", safe_comp)

 # Thresholds for significance
 padj_cutoff <- 0.05
 lfc_cutoff  <- 1      # |log2FC| >= 1

 # Bin size for density in bp (e.g. 10000 = 10 kb)
 bin_size <- 10000

 # Comparison label for Excel / plots (human-readable)
 comparison_label <- comparison

 options(scipen = 999) # turn off scientific notation

 ###############################
 # 1. Setup & packages
 ###############################

 if (!dir.exists(out_dir)) dir.create(out_dir, showWarnings = FALSE)
 data_dir <- file.path(out_dir, "data")
 if (!dir.exists(data_dir)) dir.create(data_dir, showWarnings = FALSE)

 # openxlsx for Excel export
 if (!requireNamespace("openxlsx", quietly = TRUE)) {
 stop("Package 'openxlsx' is required. Please install it with install.packages('openxlsx').")
 }
 library(openxlsx)

 ###############################
 # 2. Read data
 ###############################

 message("Reading DESeq2 results from: ", deseq_file)
 deseq <- read.csv(deseq_file, stringsAsFactors = FALSE)

 # Check required columns in DESeq2 table
 required_cols <- c("GeneID", "log2FoldChange", "padj")
 if (!all(required_cols %in% colnames(deseq))) {
 stop("DESeq2 table must contain columns: ", paste(required_cols, collapse = ", "))
 }

 message("Reading BED annotation from: ", bed_file)
 bed_cols <- c("chr","start","end","gene_id","score",
             "strand","thickStart","thickEnd",
             "itemRgb","blockCount","blockSizes","blockStarts")

 annot <- read.table(
 bed_file,
 header = FALSE,
 sep = "\t",
 stringsAsFactors = FALSE,
 col.names = bed_cols
 )

 ###############################
 # 3. Merge DESeq2 + annotation
 ###############################

 # Match DESeq2 GeneID (e.g. "gene-HJI06_09365") to BED gene_id
 merged <- merge(
 deseq,
 annot[, c("chr", "start", "end", "gene_id")],
 by.x = "GeneID",
 by.y = "gene_id"
 )

 if (nrow(merged) == 0) {
 stop("No overlap between DESeq2 GeneID and BED gene_id.")
 }

 # Midpoint of each gene
 merged$mid <- round((merged$start + merged$end) / 2)

 ###############################
 # 4. Classify genes
 ###############################

 merged$regulation <- "ns"
 merged$regulation[merged$padj < padj_cutoff & merged$log2FoldChange >=  lfc_cutoff]  <- "up"
 merged$regulation[merged$padj < padj_cutoff & merged$log2FoldChange <= -lfc_cutoff] <- "down"

 table_reg <- table(merged$regulation)
 message("Regulation counts: ", paste(names(table_reg), table_reg, collapse = " | "))

 ###############################
 # 5. Scatter files (per-gene)
 ###############################

 scatter <- merged[merged$regulation != "ns", ]

 # Circos scatter input: chr  start  end  value
 scatter_file <- file.path(data_dir, "genes_scatter.txt")
 scatter_out  <- scatter[, c("chr", "mid", "mid", "log2FoldChange")]

 write.table(scatter_out,
             scatter_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = FALSE)

 message("Written Circos scatter file: ", scatter_file)

 # Annotated scatter for Excel / inspection
 scatter_annot <- scatter[, c(
 "chr",
 "mid",              # start
 "mid",              # end
 "log2FoldChange",
 "GeneID",
 "padj",
 "regulation"
 )]
 colnames(scatter_annot)[1:4] <- c("chr", "start", "end", "log2FoldChange")

 scatter_annot_file <- file.path(data_dir, "genes_scatter_annotated.tsv")
 write.table(scatter_annot,
             scatter_annot_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = TRUE)

 message("Written annotated scatter file: ", scatter_annot_file)

 ###############################
 # 6. Density function (bins)
 ###############################

 bin_chr <- function(df_chr, bin_size, direction = c("up", "down")) {
 direction <- match.arg(direction)

 chr_name <- df_chr$chr[1]
 max_pos  <- max(df_chr$mid)

 # number of bins
 n_bins <- ceiling((max_pos + 1) / bin_size)

 starts <- seq(0, by = bin_size, length.out = n_bins)
 ends   <- starts + bin_size

 # init counts & gene lists
 counts    <- integer(n_bins)
 gene_list <- vector("list", n_bins)

 df_dir <- df_chr[df_chr$regulation == direction, ]

 if (nrow(df_dir) > 0) {
     # bin index for each gene
     bin_index <- floor(df_dir$mid / bin_size) + 1
     bin_index[bin_index < 1]        <- 1
     bin_index[bin_index > n_bins]   <- n_bins

     # accumulate counts and GeneIDs
     for (i in seq_len(nrow(df_dir))) {
     idx <- bin_index[i]
     counts[idx] <- counts[idx] + 1L
     gene_list[[idx]] <- c(gene_list[[idx]], df_dir$GeneID[i])
     }
 }

 gene_ids <- vapply(
     gene_list,
     function(x) {
     if (length(x) == 0) "" else paste(unique(x), collapse = ";")
     },
     character(1)
 )

 data.frame(
     chr      = chr_name,
     start    = as.integer(starts),
     end      = as.integer(ends),
     value    = as.integer(counts),
     gene_ids = gene_ids,
     stringsAsFactors = FALSE
 )
 }

 ###############################
 # 7. Density up/down for all chromosomes
 ###############################

 chr_list <- split(merged, merged$chr)

 density_up_list   <- lapply(chr_list, bin_chr, bin_size = bin_size, direction = "up")
 density_down_list <- lapply(chr_list, bin_size = bin_size, FUN = bin_chr, direction = "down")

 density_up   <- do.call(rbind, density_up_list)
 density_down <- do.call(rbind, density_down_list)

 # Plain Circos input (no gene_ids)
 density_up_file   <- file.path(data_dir, "density_up.txt")
 density_down_file <- file.path(data_dir, "density_down.txt")

 write.table(density_up[, c("chr", "start", "end", "value")],
             density_up_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = FALSE)

 write.table(density_down[, c("chr", "start", "end", "value")],
             density_down_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = FALSE)

 message("Written Circos density files: ",
         density_up_file, " and ", density_down_file)

 # Annotated density files (with gene_ids)
 density_up_annot_file   <- file.path(data_dir, "density_up_annotated.tsv")
 density_down_annot_file <- file.path(data_dir, "density_down_annotated.tsv")

 write.table(density_up,
             density_up_annot_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = TRUE)

 write.table(density_down,
             density_down_annot_file,
             quote = FALSE, sep = "\t",
             row.names = FALSE, col.names = TRUE)

 message("Written annotated density files: ",
         density_up_annot_file, " and ", density_down_annot_file)

 ###############################
 # 8. Export annotated tables to Excel
 ###############################

 excel_file <- file.path(
 out_dir,
 paste0("circos_annotations_", comparison_label, ".xlsx")
 )

 wb <- createWorkbook()

 addWorksheet(wb, "scatter_points")
 writeData(wb, "scatter_points", scatter_annot)

 addWorksheet(wb, "density_up")
 writeData(wb, "density_up", density_up)

 addWorksheet(wb, "density_down")
 writeData(wb, "density_down", density_down)

 saveWorkbook(wb, excel_file, overwrite = TRUE)

 message("Excel workbook written to: ", excel_file)

 message("Done.")

Automated β-Lactamase Gene Detection with NCBI AMRFinderPlus (Data_Patricia_AMRFinderPlus_2025, v2)

1. Installation and Database Setup

To install and prepare NCBI AMRFinderPlus in the bacto environment:

mamba activate bacto
mamba install ncbi-amrfinderplus
mamba update ncbi-amrfinderplus

mamba activate bacto
amrfinder -u

This will:
- Download and install the latest AMRFinderPlus version and its database.
- Create /home/jhuang/mambaforge/envs/bacto/share/amrfinderplus/data/.
- Symlink the latest database version for use.

Check available organism options for annotation:

amrfinder --list_organisms
#Available --organism options: Acinetobacter_baumannii, Burkholderia_cepacia, Burkholderia_mallei, Burkholderia_pseudomallei, Campylobacter, Citrobacter_freundii, Clostridioides_difficile, Corynebacterium_diphtheriae, Enterobacter_asburiae, Enterobacter_cloacae, Enterococcus_faecalis, Enterococcus_faecium, Escherichia, Klebsiella_oxytoca, Klebsiella_pneumoniae, Neisseria_gonorrhoeae, Neisseria_meningitidis, Pseudomonas_aeruginosa, Salmonella, Serratia_marcescens, Staphylococcus_aureus, Staphylococcus_pseudintermedius, Streptococcus_agalactiae, Streptococcus_pneumoniae, Streptococcus_pyogenes, Vibrio_cholerae, Vibrio_parahaemolyticus, Vibrio_vulnificus

Supported values include species such as Escherichia, Klebsiella_pneumoniae, Enterobacter_cloacae, Pseudomonas_aeruginosa and many others.

2. Batch Analysis: Bash Script for Genome Screening

Use the following script to screen multiple genomes using AMRFinderPlus and output only β-lactam/beta-lactamase hits from a metadata table.

Input: genome_metadata.tsv — tab-separated columns: filename_TAB_organism, with header.

filename    organism
58.fasta    Escherichia coli
92.fasta    Klebsiella pneumoniae
125.fasta   Enterobacter cloacae complex
128.fasta   Enterobacter cloacae complex
130.fasta   Enterobacter cloacae complex
147.fasta   Citrobacter freundii
149.fasta   Citrobacter freundii
160.fasta   Citrobacter braakii
161.fasta   Citrobacter braakii
168.fasta   Providencia stuartii
184.fasta   Klebsiella aerogenes
65.fasta    Pseudomonas aeruginosa
201.fasta   Pseudomonas aeruginosa
209.fasta   Pseudomonas aeruginosa
167.fasta   Serratia marcescens

Run:

cd ~/DATA/Data_Patricia_AMRFinderPlus_2025/genomes
./run_amrfinder_and_summarize.sh genome_metadata.tsv
#./run_amrfinder_and_summarize.sh genome_metadata_149.tsv
#OR_DETECT_RUN: amrfinder -n 92.fasta -o amrfinder_results/92.amrfinder.tsv --plus --organism Klebsiella_pneumoniae --threads 1

python summarize_from_amrfinder_results.py amrfinder_results
# or, since that's the default:
# python summarize_from_amrfinder_results.py

Produce

AMRFinder-wide outputs
- amrfinder_all.tsv
- amrfinder_summary_by_isolate_gene.tsv
- amrfinder_summary_by_gene.tsv
- amrfinder_summary_by_class.tsv (if a class column exists)
- amrfinder_summary.xlsx (with multiple sheets)
β-lactam-only outputs (if Class and Subclass are present)
- beta_lactam_all.tsv
- beta_lactam_summary_by_gene.tsv
- beta_lactam_summary_by_isolate_gene.tsv
- beta_lactam_all.xlsx
- beta_lactam_summary.xlsx

Report

Please find attached the updated AMRFinderPlus summary files, now including isolate 167. For β-lactam–specific results, please see beta_lactam_all.xlsx and beta_lactam_summary.xlsx. In particular, beta_lactam_summary.xlsx contains two sheets:

by_gene – aggregated counts and isolate lists for each β-lactam gene
by_isolate_gene – per-isolate overview of detected β-lactam genes

Script:

run_amrfinder_and_summarize.sh

      #!/usr/bin/env bash
      set -euo pipefail

      META_FILE="${1:-}"

      if [[ -z "$META_FILE" || ! -f "$META_FILE" ]]; then
      echo "Usage: $0 genome_metadata.tsv" >&2
      exit 1
      fi

      OUTDIR="amrfinder_results"
      mkdir -p "$OUTDIR"

      echo ">>> Checking AMRFinder installation..."
      amrfinder -V || { echo "ERROR: amrfinder not working"; exit 1; }
      echo

      echo ">>> Running AMRFinderPlus on all genomes listed in $META_FILE"

      # --- loop over metadata file ---
      # expected columns: filename
<TAB>organism
      tail -n +2 "$META_FILE" | while IFS=$'\t' read -r fasta organism; do
      # skip empty lines
      [[ -z "$fasta" ]] && continue

      if [[ ! -f "$fasta" ]]; then
      echo "WARN: FASTA file '$fasta' not found, skipping."
      continue
      fi

      isolate_id="${fasta%.fasta}"

      # map free-text organism to AMRFinder --organism names (optional)
      org_opt=""
      case "$organism" in
      "Escherichia coli")              org_opt="--organism Escherichia" ;;
      "Klebsiella pneumoniae")         org_opt="--organism Klebsiella_pneumoniae" ;;
      "Enterobacter cloacae complex")  org_opt="--organism Enterobacter_cloacae" ;;
      "Citrobacter freundii")          org_opt="--organism Citrobacter_freundii" ;;
      "Citrobacter braakii")           org_opt="--organism Citrobacter_freundii" ;;
      "Pseudomonas aeruginosa")        org_opt="--organism Pseudomonas_aeruginosa" ;;
      "Serratia marcescens")           org_opt="--organism Serratia_marcescens" ;;
      # others (Providencia stuartii, Klebsiella aerogenes)
      # currently have no organism-specific rules in AMRFinder, so we omit --organism
      *)                               org_opt="" ;;
      esac

      out_tsv="${OUTDIR}/${isolate_id}.amrfinder.tsv"

      echo "  - ${fasta} (${organism}) -> ${out_tsv} ${org_opt}"
      amrfinder -n "$fasta" -o "$out_tsv" --plus $org_opt
      done

      echo ">>> AMRFinderPlus runs finished."
      echo ">>> All done."
      echo "   - Individual reports: ${OUTDIR}/*.amrfinder.tsv"

summarize_from_amrfinder_results.py

      #!/usr/bin/env python3
      """
      summarize_from_amrfinder_results.py

      Usage:
      python summarize_from_amrfinder_results.py [amrfinder_results_dir]

      Default directory is "amrfinder_results" (relative to current working dir).

      This script:
      1) Reads all *.amrfinder.tsv in the given directory
      2) Merges them into a combined table
      3) Generates AMRFinder-wide summaries (amrfinder_* files)
      4) Applies a β-lactam filter:

              Element type == "AMR" (case-insensitive)
      AND Class or Subclass contains "beta-lactam" (case-insensitive)

      and generates β-lactam-only summaries (beta_lactam_* files).

      It NEVER re-runs AMRFinder; it only uses existing TSV files.
      """

      import sys
      import os
      import glob
      import re

      try:
      import pandas as pd
      except ImportError:
      sys.stderr.write(
              "ERROR: pandas is not installed.\n"
              "Install with something like:\n"
              "  mamba install pandas openpyxl -c conda-forge -c bioconda\n"
      )
      sys.exit(1)

      # ---------------------------------------------------------------------
      # Helpers
      # ---------------------------------------------------------------------

      def read_one(path):
      """Read one *.amrfinder.tsv and add an 'isolate_id' column from the filename."""
      df = pd.read_csv(path, sep="\t", dtype=str)
      df.columns = [c.strip() for c in df.columns]
      isolate_id = os.path.basename(path).replace(".amrfinder.tsv", "")
      df["isolate_id"] = isolate_id
      return df

      def pick(df, *candidates):
      """Return the first existing column name among candidates (normalized names)."""
      for c in candidates:
              if c in df.columns:
              return c
      return None

      # ---------------------------------------------------------------------
      # AMRFinder-wide summaries (no β-lactam filter)
      # ---------------------------------------------------------------------

      def make_amrfinder_summaries(
      df_all,
      col_gene,
      col_seq,
      col_class,
      col_subcls,
      col_ident,
      col_cov,
      col_iso,
      ):
      """Summaries for ALL AMRFinder hits (no β-lactam filter)."""
      if df_all.empty:
              print("[amrfinder] No rows in merged table, skipping summaries.")
              return

      # full merged table
      df_all.to_csv("amrfinder_all.tsv", sep="\t", index=False)
      print(">>> Full AMRFinder table written to: amrfinder_all.tsv")

      # ---- summary by isolate × gene ----
      rows = []
      for (iso, gene), sub in df_all.groupby([col_iso, col_gene], dropna=False):
              row = {
              "isolate_id": iso,
              "Gene_symbol": sub[col_gene].iloc[0],
              "n_hits": len(sub),
              }
              if col_seq is not None:
              row["Sequence_name"] = sub[col_seq].iloc[0]
              if col_class is not None:
              row["Class"] = sub[col_class].iloc[0]
              if col_subcls is not None:
              row["Subclass"] = sub[col_subcls].iloc[0]
              if col_ident is not None:
              vals = pd.to_numeric(sub[col_ident], errors="coerce")
              row["%identity_min"] = vals.min()
              row["%identity_max"] = vals.max()
              if col_cov is not None:
              vals = pd.to_numeric(sub[col_cov], errors="coerce")
              row["%coverage_min"] = vals.min()
              row["%coverage_max"] = vals.max()
              rows.append(row)

      summary_iso_gene = pd.DataFrame(rows)
      summary_iso_gene.to_csv(
              "amrfinder_summary_by_isolate_gene.tsv", sep="\t", index=False
      )
      print(">>> Isolate × gene summary written to: amrfinder_summary_by_isolate_gene.tsv")

      # ---- summary by gene ----
      def join(vals):
              uniq = sorted(set(vals.dropna().astype(str)))
              return ",".join(uniq)

      rows = []
      for gene, sub in df_all.groupby(col_gene, dropna=False):
              row = {
              "Gene_symbol": sub[col_gene].iloc[0],
              "n_isolates": sub[col_iso].nunique(),
              "isolates": ",".join(sorted(set(sub[col_iso].dropna().astype(str)))),
              "n_hits": len(sub),
              }
              if col_seq is not None:
              row["Sequence_name"] = join(sub[col_seq])
              if col_class is not None:
              row["Class"] = join(sub[col_class])
              if col_subcls is not None:
              row["Subclass"] = join(sub[col_subcls])
              rows.append(row)

      summary_gene = pd.DataFrame(rows)
      summary_gene = summary_gene.sort_values("n_isolates", ascending=False)
      summary_gene.to_csv("amrfinder_summary_by_gene.tsv", sep="\t", index=False)
      print(">>> Gene-level summary written to: amrfinder_summary_by_gene.tsv")

      # ---- summary by class/subclass ----
      summary_class = None
      if col_class is not None:
              group_cols = [col_class]
              if col_subcls is not None:
              group_cols.append(col_subcls)

              summary_class = (
              df_all.groupby(group_cols, dropna=False)
              .agg(
                      n_isolates=(col_iso, "nunique"),
                      n_hits=(col_iso, "size"),
              )
              .reset_index()
              )
              summary_class.to_csv("amrfinder_summary_by_class.tsv", sep="\t", index=False)
              print(">>> Class-level summary written to: amrfinder_summary_by_class.tsv")
      else:
              print(">>> No 'class' column found; amrfinder_summary_by_class.tsv not created.")

      # ---- Excel workbook ----
      try:
              with pd.ExcelWriter("amrfinder_summary.xlsx") as xw:
              df_all.to_excel(xw, sheet_name="amrfinder_all", index=False)
              summary_iso_gene.to_excel(xw, sheet_name="by_isolate_gene", index=False)
              summary_gene.to_excel(xw, sheet_name="by_gene", index=False)
              if summary_class is not None:
                      summary_class.to_excel(xw, sheet_name="by_class", index=False)
              print(">>> Excel workbook written: amrfinder_summary.xlsx")
      except Exception as e:
              print("WARNING: could not write amrfinder_summary.xlsx:", e)

      # ---------------------------------------------------------------------
      # β-lactam summaries
      # ---------------------------------------------------------------------

      def make_beta_lactam_summaries(
      df_beta,
      col_gene,
      col_seq,
      col_subcls,
      col_ident,
      col_cov,
      col_iso,
      ):
      """Summaries for β-lactam subset (after mask)."""
      if df_beta.empty:
              print("[beta_lactam] No β-lactam hits in subset, skipping.")
              return

      # full β-lactam table
      beta_all_tsv = "beta_lactam_all.tsv"
      df_beta.to_csv(beta_all_tsv, sep="\t", index=False)
      print(">>> β-lactam / β-lactamase hits written to: %s" % beta_all_tsv)

      # -------- summary by gene (with list of isolates) --------
      group_cols = [col_gene]
      if col_seq is not None:
              group_cols.append(col_seq)
      if col_subcls is not None:
              group_cols.append(col_subcls)

      def join_isolates(vals):
              uniq = sorted(set(vals.dropna().astype(str)))
              return ",".join(uniq)

      summary_gene = (
              df_beta.groupby(group_cols, dropna=False)
              .agg(
              n_isolates=(col_iso, "nunique"),
              isolates=(col_iso, join_isolates),
              n_hits=(col_iso, "size"),
              )
              .reset_index()
      )

      rename_map = {}
      if col_gene is not None:
              rename_map[col_gene] = "Gene_symbol"
      if col_seq is not None:
              rename_map[col_seq] = "Sequence_name"
      if col_subcls is not None:
              rename_map[col_subcls] = "Subclass"
      summary_gene.rename(columns=rename_map, inplace=True)

      sum_gene_tsv = "beta_lactam_summary_by_gene.tsv"
      summary_gene.to_csv(sum_gene_tsv, sep="\t", index=False)
      print(">>> Gene-level β-lactam summary written to: %s" % sum_gene_tsv)
      print("    (includes 'isolates' = comma-separated isolate_ids)")

      # -------- summary by isolate & gene (with annotation) --------
      rows = []
      for (iso, gene), sub in df_beta.groupby([col_iso, col_gene], dropna=False):
              row = {
              "isolate_id": iso,
              "Gene_symbol": sub[col_gene].iloc[0],
              "n_hits": len(sub),
              }
              if col_seq is not None:
              row["Sequence_name"] = sub[col_seq].iloc[0]
              if col_subcls is not None:
              row["Subclass"] = sub[col_subcls].iloc[0]

              if col_ident is not None:
              vals = pd.to_numeric(sub[col_ident], errors="coerce")
              row["%identity_min"] = vals.min()
              row["%identity_max"] = vals.max()
              if col_cov is not None:
              vals = pd.to_numeric(sub[col_cov], errors="coerce")
              row["%coverage_min"] = vals.min()
              row["%coverage_max"] = vals.max()

              rows.append(row)

      summary_iso_gene = pd.DataFrame(rows)
      sum_iso_gene_tsv = "beta_lactam_summary_by_isolate_gene.tsv"
      summary_iso_gene.to_csv(sum_iso_gene_tsv, sep="\t", index=False)
      print(">>> Isolate × gene β-lactam summary written to: %s" % sum_iso_gene_tsv)
      print("    (includes 'Gene_symbol' and 'Sequence_name' annotation columns)")

      # -------- optional Excel exports --------
      try:
              with pd.ExcelWriter("beta_lactam_all.xlsx") as xw:
              df_beta.to_excel(xw, sheet_name="beta_lactam_all", index=False)
              with pd.ExcelWriter("beta_lactam_summary.xlsx") as xw:
              summary_gene.to_excel(xw, sheet_name="by_gene", index=False)
              summary_iso_gene.to_excel(xw, sheet_name="by_isolate_gene", index=False)
              print(">>> Excel workbooks written: beta_lactam_all.xlsx, beta_lactam_summary.xlsx")
      except Exception as e:
              print("WARNING: could not write β-lactam Excel files:", e)

      # ---------------------------------------------------------------------
      # Main
      # ---------------------------------------------------------------------

      def main():
      outdir = sys.argv[1] if len(sys.argv) > 1 else "amrfinder_results"

      if not os.path.isdir(outdir):
              sys.stderr.write("ERROR: directory '%s' not found.\n" % outdir)
              sys.exit(1)

      files = sorted(glob.glob(os.path.join(outdir, "*.amrfinder.tsv")))
      if not files:
              sys.stderr.write("ERROR: no *.amrfinder.tsv files found in '%s'.\n" % outdir)
              sys.exit(1)

      print(">>> Found %d AMRFinder TSV files in: %s" % (len(files), outdir))
      for f in files:
              print("   -", os.path.basename(f))

      dfs = [read_one(f) for f in files]
      df = pd.concat(dfs, ignore_index=True)

      # normalize column names for internal use
      norm_cols = {c: c.strip().lower().replace(" ", "_") for c in df.columns}
      df.rename(columns=norm_cols, inplace=True)

      # locate columns (handles your Element type / subtype + older formats)
      col_gene       = pick(df, "gene_symbol", "genesymbol")
      col_seq        = pick(df, "sequence_name", "sequencename")
      col_elemtype   = pick(df, "element_type")
      col_elemsub    = pick(df, "element_subtype")
      col_class      = pick(df, "class")
      col_subcls     = pick(df, "subclass")
      col_ident      = pick(df, "%identity_to_reference_sequence", "identity")
      col_cov        = pick(df, "%coverage_of_reference_sequence", "coverage_of_reference_sequence")
      col_iso        = "isolate_id"

      print("\nDetected columns:")
      for label, col in [
              ("gene", col_gene),
              ("sequence", col_seq),
              ("element_type", col_elemtype),
              ("element_subtype", col_elemsub),
              ("class", col_class),
              ("subclass", col_subcls),
              ("%identity", col_ident),
              ("%coverage", col_cov),
              ("isolate_id", col_iso),
      ]:
              print("  %-14s: %s" % (label, col))

      if col_gene is None:
              sys.stderr.write(
              "ERROR: could not find a gene symbol column "
              "(expected something like 'Gene symbol' in the original AMRFinder output).\n"
              )
              sys.exit(1)

      print("\n=== Generating AMRFinder-wide summaries (all hits) ===")
      make_amrfinder_summaries(
              df_all=df,
              col_gene=col_gene,
              col_seq=col_seq,
              col_class=col_class,
              col_subcls=col_subcls,
              col_ident=col_ident,
              col_cov=col_cov,
              col_iso=col_iso,
      )

      # -----------------------------------------------------------------
      # β-lactam subset
      #
      # New logic for your current data:
      #   Element type == "AMR"
      #   AND Class or Subclass contains "beta-lactam"
      #
      # Falls back to just Class/Subclass if Element type not present.
      # -----------------------------------------------------------------
      if (col_elemtype is not None) or (col_class is not None or col_subcls is not None):

              # element type AMR (if column exists, otherwise all True)
              if col_elemtype is not None:
              mask_amr = df[col_elemtype].str.contains("AMR", case=False, na=False)
              else:
              mask_amr = pd.Series(True, index=df.index)

              # beta-lactam pattern (handles BETA-LACTAM, beta lactam, etc.)
              beta_pattern = re.compile(r"beta[- ]?lactam", re.IGNORECASE)

              mask_beta = pd.Series(False, index=df.index)
              if col_class is not None:
              mask_beta |= df[col_class].fillna("").str.contains(beta_pattern)
              if col_subcls is not None:
              mask_beta |= df[col_subcls].fillna("").str.contains(beta_pattern)

              mask = mask_amr & mask_beta
              df_beta = df.loc[mask].copy()

              if df_beta.empty:
              print(
                      "\nWARNING: No β-lactam hits found "
                      "(Element type == 'AMR' AND Class/Subclass contains 'beta-lactam')."
              )
              else:
              print(
                      "\n=== β-lactam subset ===\n"
                      "  kept %d of %d rows where Element type is 'AMR' and "
                      "Class/Subclass contains 'beta-lactam'\n"
                      % (len(df_beta), len(df))
              )
              make_beta_lactam_summaries(
                      df_beta=df_beta,
                      col_gene=col_gene,
                      col_seq=col_seq,
                      col_subcls=col_subcls,
                      col_ident=col_ident,
                      col_cov=col_cov,
                      col_iso=col_iso,
              )
      else:
              print(
              "\nWARNING: Cannot apply β-lactam filter because Element type and/or "
              "class/subclass columns were not found. Only amrfinder_* "
              "outputs were generated."
              )

      if __name__ == "__main__":
      main()

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Daily Archives: 2025年12月5日

Pipeline to Create Circos DE Plots from DESeq2 Output (Data_JuliaFuchs_RNAseq_2025)

Automated β-Lactamase Gene Detection with NCBI AMRFinderPlus (Data_Patricia_AMRFinderPlus_2025, v2)

1. Installation and Database Setup

2. Batch Analysis: Bash Script for Genome Screening