summarize_from_amrfinder_results.py
#!/usr/bin/env python3
"""
summarize_from_amrfinder_results.py
Usage:
python summarize_from_amrfinder_results.py [amrfinder_results_dir]
Default directory is "amrfinder_results" (relative to current working dir).
This script:
1) Reads all *.amrfinder.tsv in the given directory
2) Merges them into a combined table
3) Generates AMRFinder-wide summaries (amrfinder_* files)
4) Applies a β-lactam filter:
Element type == "AMR" (case-insensitive)
AND Class or Subclass contains "beta-lactam" (case-insensitive)
and generates β-lactam-only summaries (beta_lactam_* files).
It NEVER re-runs AMRFinder; it only uses existing TSV files.
"""
import sys
import os
import glob
import re
try:
import pandas as pd
except ImportError:
sys.stderr.write(
"ERROR: pandas is not installed.\n"
"Install with something like:\n"
" mamba install pandas openpyxl -c conda-forge -c bioconda\n"
)
sys.exit(1)
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
def read_one(path):
"""Read one *.amrfinder.tsv and add an 'isolate_id' column from the filename."""
df = pd.read_csv(path, sep="\t", dtype=str)
df.columns = [c.strip() for c in df.columns]
isolate_id = os.path.basename(path).replace(".amrfinder.tsv", "")
df["isolate_id"] = isolate_id
return df
def pick(df, *candidates):
"""Return the first existing column name among candidates (normalized names)."""
for c in candidates:
if c in df.columns:
return c
return None
# ---------------------------------------------------------------------
# AMRFinder-wide summaries (no β-lactam filter)
# ---------------------------------------------------------------------
def make_amrfinder_summaries(
df_all,
col_gene,
col_seq,
col_class,
col_subcls,
col_ident,
col_cov,
col_iso,
):
"""Summaries for ALL AMRFinder hits (no β-lactam filter)."""
if df_all.empty:
print("[amrfinder] No rows in merged table, skipping summaries.")
return
# full merged table
df_all.to_csv("amrfinder_all.tsv", sep="\t", index=False)
print(">>> Full AMRFinder table written to: amrfinder_all.tsv")
# ---- summary by isolate × gene ----
rows = []
for (iso, gene), sub in df_all.groupby([col_iso, col_gene], dropna=False):
row = {
"isolate_id": iso,
"Gene_symbol": sub[col_gene].iloc[0],
"n_hits": len(sub),
}
if col_seq is not None:
row["Sequence_name"] = sub[col_seq].iloc[0]
if col_class is not None:
row["Class"] = sub[col_class].iloc[0]
if col_subcls is not None:
row["Subclass"] = sub[col_subcls].iloc[0]
if col_ident is not None:
vals = pd.to_numeric(sub[col_ident], errors="coerce")
row["%identity_min"] = vals.min()
row["%identity_max"] = vals.max()
if col_cov is not None:
vals = pd.to_numeric(sub[col_cov], errors="coerce")
row["%coverage_min"] = vals.min()
row["%coverage_max"] = vals.max()
rows.append(row)
summary_iso_gene = pd.DataFrame(rows)
summary_iso_gene.to_csv(
"amrfinder_summary_by_isolate_gene.tsv", sep="\t", index=False
)
print(">>> Isolate × gene summary written to: amrfinder_summary_by_isolate_gene.tsv")
# ---- summary by gene ----
def join(vals):
uniq = sorted(set(vals.dropna().astype(str)))
return ",".join(uniq)
rows = []
for gene, sub in df_all.groupby(col_gene, dropna=False):
row = {
"Gene_symbol": sub[col_gene].iloc[0],
"n_isolates": sub[col_iso].nunique(),
"isolates": ",".join(sorted(set(sub[col_iso].dropna().astype(str)))),
"n_hits": len(sub),
}
if col_seq is not None:
row["Sequence_name"] = join(sub[col_seq])
if col_class is not None:
row["Class"] = join(sub[col_class])
if col_subcls is not None:
row["Subclass"] = join(sub[col_subcls])
rows.append(row)
summary_gene = pd.DataFrame(rows)
summary_gene = summary_gene.sort_values("n_isolates", ascending=False)
summary_gene.to_csv("amrfinder_summary_by_gene.tsv", sep="\t", index=False)
print(">>> Gene-level summary written to: amrfinder_summary_by_gene.tsv")
# ---- summary by class/subclass ----
summary_class = None
if col_class is not None:
group_cols = [col_class]
if col_subcls is not None:
group_cols.append(col_subcls)
summary_class = (
df_all.groupby(group_cols, dropna=False)
.agg(
n_isolates=(col_iso, "nunique"),
n_hits=(col_iso, "size"),
)
.reset_index()
)
summary_class.to_csv("amrfinder_summary_by_class.tsv", sep="\t", index=False)
print(">>> Class-level summary written to: amrfinder_summary_by_class.tsv")
else:
print(">>> No 'class' column found; amrfinder_summary_by_class.tsv not created.")
# ---- Excel workbook ----
try:
with pd.ExcelWriter("amrfinder_summary.xlsx") as xw:
df_all.to_excel(xw, sheet_name="amrfinder_all", index=False)
summary_iso_gene.to_excel(xw, sheet_name="by_isolate_gene", index=False)
summary_gene.to_excel(xw, sheet_name="by_gene", index=False)
if summary_class is not None:
summary_class.to_excel(xw, sheet_name="by_class", index=False)
print(">>> Excel workbook written: amrfinder_summary.xlsx")
except Exception as e:
print("WARNING: could not write amrfinder_summary.xlsx:", e)
# ---------------------------------------------------------------------
# β-lactam summaries
# ---------------------------------------------------------------------
def make_beta_lactam_summaries(
df_beta,
col_gene,
col_seq,
col_subcls,
col_ident,
col_cov,
col_iso,
):
"""Summaries for β-lactam subset (after mask)."""
if df_beta.empty:
print("[beta_lactam] No β-lactam hits in subset, skipping.")
return
# full β-lactam table
beta_all_tsv = "beta_lactam_all.tsv"
df_beta.to_csv(beta_all_tsv, sep="\t", index=False)
print(">>> β-lactam / β-lactamase hits written to: %s" % beta_all_tsv)
# -------- summary by gene (with list of isolates) --------
group_cols = [col_gene]
if col_seq is not None:
group_cols.append(col_seq)
if col_subcls is not None:
group_cols.append(col_subcls)
def join_isolates(vals):
uniq = sorted(set(vals.dropna().astype(str)))
return ",".join(uniq)
summary_gene = (
df_beta.groupby(group_cols, dropna=False)
.agg(
n_isolates=(col_iso, "nunique"),
isolates=(col_iso, join_isolates),
n_hits=(col_iso, "size"),
)
.reset_index()
)
rename_map = {}
if col_gene is not None:
rename_map[col_gene] = "Gene_symbol"
if col_seq is not None:
rename_map[col_seq] = "Sequence_name"
if col_subcls is not None:
rename_map[col_subcls] = "Subclass"
summary_gene.rename(columns=rename_map, inplace=True)
sum_gene_tsv = "beta_lactam_summary_by_gene.tsv"
summary_gene.to_csv(sum_gene_tsv, sep="\t", index=False)
print(">>> Gene-level β-lactam summary written to: %s" % sum_gene_tsv)
print(" (includes 'isolates' = comma-separated isolate_ids)")
# -------- summary by isolate & gene (with annotation) --------
rows = []
for (iso, gene), sub in df_beta.groupby([col_iso, col_gene], dropna=False):
row = {
"isolate_id": iso,
"Gene_symbol": sub[col_gene].iloc[0],
"n_hits": len(sub),
}
if col_seq is not None:
row["Sequence_name"] = sub[col_seq].iloc[0]
if col_subcls is not None:
row["Subclass"] = sub[col_subcls].iloc[0]
if col_ident is not None:
vals = pd.to_numeric(sub[col_ident], errors="coerce")
row["%identity_min"] = vals.min()
row["%identity_max"] = vals.max()
if col_cov is not None:
vals = pd.to_numeric(sub[col_cov], errors="coerce")
row["%coverage_min"] = vals.min()
row["%coverage_max"] = vals.max()
rows.append(row)
summary_iso_gene = pd.DataFrame(rows)
sum_iso_gene_tsv = "beta_lactam_summary_by_isolate_gene.tsv"
summary_iso_gene.to_csv(sum_iso_gene_tsv, sep="\t", index=False)
print(">>> Isolate × gene β-lactam summary written to: %s" % sum_iso_gene_tsv)
print(" (includes 'Gene_symbol' and 'Sequence_name' annotation columns)")
# -------- optional Excel exports --------
try:
with pd.ExcelWriter("beta_lactam_all.xlsx") as xw:
df_beta.to_excel(xw, sheet_name="beta_lactam_all", index=False)
with pd.ExcelWriter("beta_lactam_summary.xlsx") as xw:
summary_gene.to_excel(xw, sheet_name="by_gene", index=False)
summary_iso_gene.to_excel(xw, sheet_name="by_isolate_gene", index=False)
print(">>> Excel workbooks written: beta_lactam_all.xlsx, beta_lactam_summary.xlsx")
except Exception as e:
print("WARNING: could not write β-lactam Excel files:", e)
# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
def main():
outdir = sys.argv[1] if len(sys.argv) > 1 else "amrfinder_results"
if not os.path.isdir(outdir):
sys.stderr.write("ERROR: directory '%s' not found.\n" % outdir)
sys.exit(1)
files = sorted(glob.glob(os.path.join(outdir, "*.amrfinder.tsv")))
if not files:
sys.stderr.write("ERROR: no *.amrfinder.tsv files found in '%s'.\n" % outdir)
sys.exit(1)
print(">>> Found %d AMRFinder TSV files in: %s" % (len(files), outdir))
for f in files:
print(" -", os.path.basename(f))
dfs = [read_one(f) for f in files]
df = pd.concat(dfs, ignore_index=True)
# normalize column names for internal use
norm_cols = {c: c.strip().lower().replace(" ", "_") for c in df.columns}
df.rename(columns=norm_cols, inplace=True)
# locate columns (handles your Element type / subtype + older formats)
col_gene = pick(df, "gene_symbol", "genesymbol")
col_seq = pick(df, "sequence_name", "sequencename")
col_elemtype = pick(df, "element_type")
col_elemsub = pick(df, "element_subtype")
col_class = pick(df, "class")
col_subcls = pick(df, "subclass")
col_ident = pick(df, "%identity_to_reference_sequence", "identity")
col_cov = pick(df, "%coverage_of_reference_sequence", "coverage_of_reference_sequence")
col_iso = "isolate_id"
print("\nDetected columns:")
for label, col in [
("gene", col_gene),
("sequence", col_seq),
("element_type", col_elemtype),
("element_subtype", col_elemsub),
("class", col_class),
("subclass", col_subcls),
("%identity", col_ident),
("%coverage", col_cov),
("isolate_id", col_iso),
]:
print(" %-14s: %s" % (label, col))
if col_gene is None:
sys.stderr.write(
"ERROR: could not find a gene symbol column "
"(expected something like 'Gene symbol' in the original AMRFinder output).\n"
)
sys.exit(1)
print("\n=== Generating AMRFinder-wide summaries (all hits) ===")
make_amrfinder_summaries(
df_all=df,
col_gene=col_gene,
col_seq=col_seq,
col_class=col_class,
col_subcls=col_subcls,
col_ident=col_ident,
col_cov=col_cov,
col_iso=col_iso,
)
# -----------------------------------------------------------------
# β-lactam subset
#
# New logic for your current data:
# Element type == "AMR"
# AND Class or Subclass contains "beta-lactam"
#
# Falls back to just Class/Subclass if Element type not present.
# -----------------------------------------------------------------
if (col_elemtype is not None) or (col_class is not None or col_subcls is not None):
# element type AMR (if column exists, otherwise all True)
if col_elemtype is not None:
mask_amr = df[col_elemtype].str.contains("AMR", case=False, na=False)
else:
mask_amr = pd.Series(True, index=df.index)
# beta-lactam pattern (handles BETA-LACTAM, beta lactam, etc.)
beta_pattern = re.compile(r"beta[- ]?lactam", re.IGNORECASE)
mask_beta = pd.Series(False, index=df.index)
if col_class is not None:
mask_beta |= df[col_class].fillna("").str.contains(beta_pattern)
if col_subcls is not None:
mask_beta |= df[col_subcls].fillna("").str.contains(beta_pattern)
mask = mask_amr & mask_beta
df_beta = df.loc[mask].copy()
if df_beta.empty:
print(
"\nWARNING: No β-lactam hits found "
"(Element type == 'AMR' AND Class/Subclass contains 'beta-lactam')."
)
else:
print(
"\n=== β-lactam subset ===\n"
" kept %d of %d rows where Element type is 'AMR' and "
"Class/Subclass contains 'beta-lactam'\n"
% (len(df_beta), len(df))
)
make_beta_lactam_summaries(
df_beta=df_beta,
col_gene=col_gene,
col_seq=col_seq,
col_subcls=col_subcls,
col_ident=col_ident,
col_cov=col_cov,
col_iso=col_iso,
)
else:
print(
"\nWARNING: Cannot apply β-lactam filter because Element type and/or "
"class/subclass columns were not found. Only amrfinder_* "
"outputs were generated."
)
if __name__ == "__main__":
main()