This pipeline updates existing Lakeview merged .lake files by replacing their blue track content with filtered track CSVs, while keeping original .h5 filenames/paths unchanged so Lakeview can still load the raw kymographs locally.
Step 1 — Fix naming mismatches before updating lakes
To ensure each kymograph (.h5) finds its corresponding filtered CSV, filename inconsistencies are corrected first:
- _p940_ vs _940_: some filtered CSVs contain _p940_ while the corresponding .h5 uses _940_ → rename the CSVs accordingly.
- _ch4_ vs _ch5_: some filtered CSVs were labeled _ch4_ while the .h5 filenames use _ch5_ (or vice versa) → rename the CSVs to match.
- Extra CSVs without any matching .h5 are removed to avoid confusion later.
This step prevents Lakeview kymos from being dropped simply because no matching CSV is found.
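A minimal sketch of how the renames can be scripted (the substring pairs are the examples from above and are assumptions; check the actual filenames, and the ch4/ch5 direction, before renaming):

import glob
import os

csv_folder = "./data"  # assumption: rename the blue CSVs before filtering; use "./filtered" if renaming afterwards
substring_fixes = [("_p940_", "_940_"), ("_ch4_", "_ch5_")]  # direction depends on the actual mismatch

for path in glob.glob(os.path.join(csv_folder, "*_blue*.csv")):
    name = os.path.basename(path)
    new_name = name
    for old, new in substring_fixes:
        new_name = new_name.replace(old, new)
    if new_name != name:
        print(f"Renaming {name} -> {new_name}")
        os.rename(path, os.path.join(csv_folder, new_name))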
Step 2 — Filter tracks and generate debug reports
Run the filtering scripts to create multiple filtered outputs from each raw *_blue.csv:
- Binding position filter: 2.2–3.8 µm
- Lifetime thresholds: ≥1s, ≥2s, ≥5s
- A lifetime-only filter: ≥5s without a position constraint
The debug version additionally writes per-track reports (binding position, lifetime, pass/fail reason), which makes it much easier to spot issues caused by parsing, NaNs, or unexpected track structure.
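For reference, this is the per-track decision both scripts apply, shown on a toy track (values are made up, not experimental data):

import pandas as pd

# One toy track with three points, using the column names from the blue CSVs
track_df = pd.DataFrame({
    "track index": [0, 0, 0],
    "time (seconds)": [10.0, 12.5, 16.0],
    "position (um)": [2.9, 3.0, 3.1],
})

binding_pos = track_df["position (um)"].iloc[0]  # first position of the track
lifetime = track_df["time (seconds)"].max() - track_df["time (seconds)"].min()  # 6.0 s

position_ok = 2.2 <= binding_pos <= 3.8
lifetime_ok = lifetime >= 5.0  # e.g. the 'position_5s' rule
print(position_ok and lifetime_ok)  # True -> this track would be kept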
Step 3 — Organize filtered outputs into separate folders
Move filtered files into dedicated directories so each downstream lake update corresponds to a single filtering rule:
- filtered_blue_position (2.2–3.8 µm)
- filtered_blue_position_1s (2.2–3.8 µm + ≥1s)
- filtered_blue_position_5s (2.2–3.8 µm + ≥5s)
- filtered_blue_lifetime_5s_only (≥5s, no position filter)
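A sketch of the move, assuming the _<label>.csv suffixes produced by the filtering scripts and the folder names above (the ≥2s outputs, if kept, can be handled the same way):

import glob
import os
import shutil

filtered_folder = "./filtered"  # output folder of step 2
label_to_folder = {
    "position": "filtered_blue_position",
    "position_1s": "filtered_blue_position_1s",
    "position_5s": "filtered_blue_position_5s",
    "lifetime_5s_only": "filtered_blue_lifetime_5s_only",
}

for label, folder in label_to_folder.items():
    os.makedirs(folder, exist_ok=True)
    for path in glob.glob(os.path.join(filtered_folder, f"*_{label}.csv")):
        shutil.move(path, os.path.join(folder, os.path.basename(path)))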
Step 4 — Update .lake files using the filtered tracks
Run 2_update_lakes.py once per filtered folder to create updated .lake outputs (and logs); a per-folder run sketch follows the list below:
- For each kymo in each .lake, the script tries to find a matching *_blue*.csv.
- Outcomes are classified:
  - case1: CSV found and contains ≥1 track → replace blue track text and keep the kymo.
  - case2: CSV found but no tracks remain after filtering (header-only / parse error) → remove the kymo.
  - case3: no matching CSV → remove the kymo.
  - extra: kymo missing a data/tracks/blue field → remove the kymo.
- After filtering/removing kymos, the script also rebuilds:
  - file_viewer (keeps only .h5 files referenced by retained kymos)
  - experiments[*].dataset (keeps only dataset entries matching retained kymos)
This keeps the updated .lake files internally consistent and avoids dangling references.
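One way to run the updater once per filtered folder, as mentioned above (a sketch: the output and log locations are assumptions; the CLI flags are the ones defined in 2_update_lakes.py):

import subprocess

filtered_folders = [
    "filtered_blue_position",
    "filtered_blue_position_1s",
    "filtered_blue_position_5s",
    "filtered_blue_lifetime_5s_only",
]

for folder in filtered_folders:
    out_folder = f"./updated_lakes_{folder}"
    log_path = f"./log_{folder}.txt"
    with open(log_path, "w", encoding="utf-8") as log:
        # Each run reads the merged .lake files and writes updated copies for this filtering rule
        subprocess.run(
            ["python", "2_update_lakes.py",
             "--merged_lake_folder", "./",
             "--filtered_folder", folder,
             "--output_folder", out_folder],
            stdout=log, stderr=subprocess.STDOUT, check=True,
        )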
Scripts used (code snippets)
1) 1_filter_track.py
import pandas as pd
import glob
import os
# === Parameters ===
input_folder = "./data"
output_folder = "./filtered"
separated_folder = "./separated"
# Default position filter parameters (in µm)
default_min_binding_pos = 2.2
default_max_binding_pos = 3.8
# Column names (based on CSV header)
track_col = "track index"
time_col = "time (seconds)"
position_col = "position (um)"
# Filter configurations
filter_configs = [
{
"label": "position",
"min_lifetime": 0.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm",
},
{
"label": "position_1s",
"min_lifetime": 1.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 1 s",
},
{
"label": "position_5s",
"min_lifetime": 5.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 5 s",
},
{
"label": "position_2s",
"min_lifetime": 2.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 2 s",
},
{
"label": "lifetime_5s_only",
"min_lifetime": 5.0,
"min_binding_pos": None,
"max_binding_pos": None,
"desc": "Tracks with lifetime ≥ 5 s, no position filter",
},
]
def load_csv(filepath):
"""
Load a blue track CSV:
- find header line starting with '# track index'
- read data rows (semicolon-separated, skipping header lines)
- set lowercase column names based on the header line
"""
try:
with open(filepath, "r") as f:
lines = f.readlines()
if not lines:
raise ValueError(f"File {filepath} is empty")
header_line = None
for line in lines:
if line.startswith("# track index"):
header_line = line.lstrip("# ").strip()
break
if header_line is None:
raise ValueError(
f"No header line starting with '# track index' found in {filepath}"
)
df = pd.read_csv(filepath, sep=";", comment="#", header=None, skiprows=2)
df.columns = [c.strip().lower() for c in header_line.split(";")]
required_cols = [track_col, time_col, position_col]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(
f"Missing required columns in {filepath}: {missing_cols}"
)
return df, lines[0].strip(), header_line
except Exception as e:
print(f"Error loading {filepath}: {e}")
return None, None, None
def compute_lifetime(track_df):
"""
Compute the lifetime of one track as (max time - min time) in seconds.
"""
if track_df[time_col].empty:
return 0.0
return track_df[time_col].max() - track_df[time_col].min()
# === Main Processing ===
os.makedirs(output_folder, exist_ok=True)
os.makedirs(separated_folder, exist_ok=True)
for filepath in glob.glob(os.path.join(input_folder, "*.csv")):
print(f"\n=== Processing input file: {filepath} ===")
df, header1, header2 = load_csv(filepath)
if df is None:
continue
base = os.path.splitext(os.path.basename(filepath))[0]
total_tracks_in_file = df[track_col].nunique()
print(f" Total tracks in file: {total_tracks_in_file}")
for config in filter_configs:
label = config["label"]
min_lifetime = config["min_lifetime"]
min_binding_pos = config.get("min_binding_pos")
max_binding_pos = config.get("max_binding_pos")
kept_tracks = []
removed_tracks = []
fail_pos_only = 0
fail_life_only = 0
fail_both = 0
for track_id, track_df in df.groupby(track_col):
if track_df.empty:
continue
binding_pos = track_df[position_col].iloc[0]
lifetime = compute_lifetime(track_df)
# Position check only if min/max are defined
position_ok = True
if min_binding_pos is not None and binding_pos < min_binding_pos:
position_ok = False
if max_binding_pos is not None and binding_pos > max_binding_pos:
position_ok = False
lifetime_ok = lifetime >= min_lifetime
if position_ok and lifetime_ok:
kept_tracks.append(track_df)
else:
removed_tracks.append(track_df)
if not position_ok and not lifetime_ok:
fail_both += 1
elif not position_ok:
fail_pos_only += 1
elif not lifetime_ok:
fail_life_only += 1
n_kept = len(kept_tracks)
n_removed = len(removed_tracks)
total_tracks = n_kept + n_removed
print(
f" [{label}] tracks kept: {n_kept}/{total_tracks} "
f"(removed: {n_removed}; "
f"fail_pos_only={fail_pos_only}, "
f"fail_life_only={fail_life_only}, "
f"fail_both={fail_both})"
)
# --- Write filtered (kept) file or placeholder file ---
outpath = os.path.join(output_folder, f"{base}_{label}.csv")
if n_kept > 0:
# Normal case: some tracks passed the filter
kept_df = pd.concat(kept_tracks, ignore_index=True)
with open(outpath, "w") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
kept_df.to_csv(outpath, mode="a", sep=";", index=False, header=False)
print(f" -> Saved filtered tracks ({config['desc']}): {outpath}")
else:
# NEW: no track passed the filter → write header-only placeholder file
with open(outpath, "w") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
f.write(
f"# no tracks passed the '{label}' filter for {base}; "
f"placeholder file for downstream processing\n"
)
print(
f" -> No tracks passed the '{label}' filter for {base}. "
f"Created header-only placeholder: {outpath}"
)
# Save removed tracks (optional, still useful for debugging)
if removed_tracks:
removed_df = pd.concat(removed_tracks, ignore_index=True)
sep_outpath = os.path.join(
separated_folder, f"{base}_removed_{label}.csv"
)
with open(sep_outpath, "w") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
removed_df.to_csv(
sep_outpath, mode="a", sep=";", index=False, header=False
)
# print(f" -> Saved removed tracks: {sep_outpath}")
print("\nProcessing complete.")
2) 1_filter_track_debug.py
import pandas as pd
import glob
import os
import argparse
from typing import Optional, Tuple
# Default position filter parameters (in µm)
default_min_binding_pos = 2.2
default_max_binding_pos = 3.8
# Column names (based on CSV header, after lowercasing)
track_col = "track index"
time_col = "time (seconds)"
position_col = "position (um)"
# Filter configurations
filter_configs = [
{
"label": "position",
"min_lifetime": 0.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm",
},
{
"label": "position_1s",
"min_lifetime": 1.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 1 s",
},
{
"label": "position_5s",
"min_lifetime": 5.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 5 s",
},
{
"label": "position_2s",
"min_lifetime": 2.0,
"min_binding_pos": default_min_binding_pos,
"max_binding_pos": default_max_binding_pos,
"desc": "Tracks with binding position 2.2–3.8 µm and lifetime ≥ 2 s",
},
{
"label": "lifetime_5s_only",
"min_lifetime": 5.0,
"min_binding_pos": None,
"max_binding_pos": None,
"desc": "Tracks with lifetime ≥ 5 s, no position filter",
},
]
def parse_args():
p = argparse.ArgumentParser(description="Filter blue track CSVs and emit debug reports per track.")
p.add_argument("--input_folder", "-i", default="./data", help="Folder containing input *_blue.csv files.")
p.add_argument("--output_folder", "-o", default="./filtered", help="Folder to write filtered CSVs.")
p.add_argument("--separated_folder", "-s", default="./separated", help="Folder to write removed-tracks CSVs.")
p.add_argument("--debug_folder", "-d", default="./debug_reports", help="Folder to write per-track debug reports.")
p.add_argument(
"--only",
default=None,
help="Optional: only process files whose basename contains this substring (e.g. 'p967_250704_502_10pN_ch4_0bar_b4_1_blue').",
)
p.add_argument(
"--binding_pos_method",
choices=["first_non_nan", "median", "mean"],
default="first_non_nan",
help="How to compute 'binding position' per track for the position filter.",
)
p.add_argument("--verbose", action="store_true", help="Print per-track debug for removed tracks (can be noisy).")
return p.parse_args()
def _coerce_numeric(series: pd.Series) -> pd.Series:
"""
Coerce numeric robustly:
- supports comma decimal separators: '1,23' -> '1.23'
- invalid parses become NaN
"""
s = series.astype(str).str.strip()
s = s.str.replace(",", ".", regex=False)
return pd.to_numeric(s, errors="coerce")
def load_csv(filepath: str) -> Tuple[Optional[pd.DataFrame], Optional[str], Optional[str]]:
"""
Load a blue track CSV:
- find header line starting with '# track index'
- read data rows (semicolon-separated, skipping 2 header lines)
- set lowercase column names based on the header line
- coerce time/position to numeric robustly (comma decimals supported)
"""
try:
with open(filepath, "r", encoding="utf-8") as f:
lines = f.readlines()
if not lines:
raise ValueError(f"File {filepath} is empty")
header_line = None
for line in lines:
if line.startswith("# track index"):
header_line = line.lstrip("# ").strip()
break
if header_line is None:
raise ValueError(f"No header line starting with '# track index' found in {filepath}")
df = pd.read_csv(filepath, sep=";", comment="#", header=None, skiprows=2)
df.columns = [c.strip().lower() for c in header_line.split(";")]
required_cols = [track_col, time_col, position_col]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Missing required columns in {filepath}: {missing_cols}")
# Robust numeric conversion
df[time_col] = _coerce_numeric(df[time_col])
df[position_col] = _coerce_numeric(df[position_col])
# If conversion introduced NaNs, keep them but warn (important for debugging)
n_time_nan = int(df[time_col].isna().sum())
n_pos_nan = int(df[position_col].isna().sum())
if n_time_nan > 0 or n_pos_nan > 0:
print(
f" [WARN] {os.path.basename(filepath)}: NaNs after numeric parsing "
f"(time NaN={n_time_nan}, position NaN={n_pos_nan}). "
f"This can cause lifetime=0 or position filters to behave unexpectedly."
)
return df, lines[0].strip(), header_line
except Exception as e:
print(f"Error loading {filepath}: {e}")
return None, None, None
def compute_lifetime(track_df: pd.DataFrame) -> float:
"""
Lifetime = max(time) - min(time), using only non-NaN times.
"""
t = track_df[time_col].dropna()
if t.empty:
return 0.0
return float(t.max() - t.min())
def compute_binding_pos(track_df: pd.DataFrame, method: str) -> float:
"""
Binding position metric used for filtering.
"""
p = track_df[position_col].dropna()
if p.empty:
return float("nan")
if method == "first_non_nan":
return float(p.iloc[0])
if method == "median":
return float(p.median())
if method == "mean":
return float(p.mean())
return float(p.iloc[0])
def main():
args = parse_args()
os.makedirs(args.output_folder, exist_ok=True)
os.makedirs(args.separated_folder, exist_ok=True)
os.makedirs(args.debug_folder, exist_ok=True)
for filepath in glob.glob(os.path.join(args.input_folder, "*.csv")):
basefile = os.path.basename(filepath)
base = os.path.splitext(basefile)[0]
if args.only and args.only not in base:
continue
print(f"\n=== Processing input file: {filepath} ===")
df, header1, header2 = load_csv(filepath)
if df is None:
continue
total_tracks_in_file = df[track_col].nunique()
print(f" Total tracks in file: {total_tracks_in_file}")
for config in filter_configs:
label = config["label"]
min_lifetime = float(config["min_lifetime"])
min_binding_pos = config.get("min_binding_pos")
max_binding_pos = config.get("max_binding_pos")
kept_tracks = []
removed_tracks = []
# For debug report
track_rows = []
fail_pos_only = 0
fail_life_only = 0
fail_both = 0
for track_id, track_df in df.groupby(track_col):
if track_df.empty:
continue
binding_pos = compute_binding_pos(track_df, args.binding_pos_method)
lifetime = compute_lifetime(track_df)
# Position check only if min/max are defined, and treat NaN binding_pos as "fails position"
position_ok = True
if min_binding_pos is not None or max_binding_pos is not None:
if pd.isna(binding_pos):
position_ok = False
else:
if min_binding_pos is not None and binding_pos < float(min_binding_pos):
position_ok = False
if max_binding_pos is not None and binding_pos > float(max_binding_pos):
position_ok = False
lifetime_ok = lifetime >= min_lifetime
reason_parts = []
if not position_ok:
reason_parts.append("position_out_of_range_or_nan")
if not lifetime_ok:
reason_parts.append("lifetime_too_short_or_time_nan")
reason = "PASS" if (position_ok and lifetime_ok) else "+".join(reason_parts)
# Debug record per track
track_rows.append(
{
"track_id": track_id,
"n_points": int(len(track_df)),
"binding_pos_um": binding_pos,
"binding_pos_method": args.binding_pos_method,
"lifetime_s": lifetime,
"position_ok": bool(position_ok),
"lifetime_ok": bool(lifetime_ok),
"reason": reason,
"min_binding_pos_um": min_binding_pos,
"max_binding_pos_um": max_binding_pos,
"min_lifetime_s": min_lifetime,
}
)
if position_ok and lifetime_ok:
kept_tracks.append(track_df)
else:
removed_tracks.append(track_df)
if not position_ok and not lifetime_ok:
fail_both += 1
elif not position_ok:
fail_pos_only += 1
elif not lifetime_ok:
fail_life_only += 1
if args.verbose:
print(
f" [REMOVED {label}] track={track_id} "
f"binding_pos={binding_pos} lifetime={lifetime} reason={reason}"
)
n_kept = len(kept_tracks)
n_removed = len(removed_tracks)
total_tracks = n_kept + n_removed
print(
f" [{label}] tracks kept: {n_kept}/{total_tracks} "
f"(removed: {n_removed}; "
f"fail_pos_only={fail_pos_only}, "
f"fail_life_only={fail_life_only}, "
f"fail_both={fail_both})"
)
# --- Write filtered (kept) file or placeholder file ---
outpath = os.path.join(args.output_folder, f"{base}_{label}.csv")
if n_kept > 0:
kept_df = pd.concat(kept_tracks, ignore_index=True)
with open(outpath, "w", encoding="utf-8") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
kept_df.to_csv(outpath, mode="a", sep=";", index=False, header=False)
print(f" -> Saved filtered tracks ({config['desc']}): {outpath}")
else:
with open(outpath, "w", encoding="utf-8") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
f.write(
f"# no tracks passed the '{label}' filter for {base}; "
f"placeholder file for downstream processing\n"
)
print(
f" -> No tracks passed the '{label}' filter for {base}. "
f"Created header-only placeholder: {outpath}"
)
# Save removed tracks (still useful)
if removed_tracks:
removed_df = pd.concat(removed_tracks, ignore_index=True)
sep_outpath = os.path.join(args.separated_folder, f"{base}_removed_{label}.csv")
with open(sep_outpath, "w", encoding="utf-8") as f:
f.write(f"{header1}\n")
f.write(f"# {header2}\n")
removed_df.to_csv(sep_outpath, mode="a", sep=";", index=False, header=False)
# --- NEW: per-track debug report ---
report_df = pd.DataFrame(track_rows).sort_values(["reason", "track_id"])
report_path = os.path.join(args.debug_folder, f"{base}_{label}_track_report.csv")
report_df.to_csv(report_path, index=False)
print(f" -> Wrote per-track debug report: {report_path}")
print("\nProcessing complete.")
if __name__ == "__main__":
main()
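A quick way to run the debug variant on a single kymograph while experimenting with the binding-position definition (a sketch; the --only substring is the example from the script's own help text):

import subprocess

subprocess.run(
    ["python", "1_filter_track_debug.py",
     "--input_folder", "./data",
     "--output_folder", "./filtered",
     "--separated_folder", "./separated",
     "--debug_folder", "./debug_reports",
     "--only", "p967_250704_502_10pN_ch4_0bar_b4_1_blue",
     "--binding_pos_method", "first_non_nan",
     "--verbose"],
    check=True,
)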
3) 2_update_lakes.py
import pandas as pd
import glob
import os
import json
import argparse
def parse_args():
parser = argparse.ArgumentParser(
description="Update merged lake files with filtered blue track CSVs."
)
parser.add_argument(
"--merged_lake_folder", "-m",
default="./",
help="Folder containing merged .lake files (default: ./)"
)
parser.add_argument(
"--filtered_folder", "-f",
default="./filtered",
help="Folder containing filtered blue track CSVs (default: ./filtered)"
)
parser.add_argument(
"--output_folder", "-o",
default="./updated_lakes",
help="Folder to write updated .lake files to (default: ./updated_lakes)"
)
return parser.parse_args()
def build_blue_text_from_csv(csv_path):
"""
Rebuild the 'blue' track text block for Lakeview from a filtered CSV file.
Returns
-------
blue_text : str
Header + (optional) data rows in Lakeview format.
n_rows : int
Number of data rows (tracks). If 0, the CSV is considered "header only"
/ no tracks after filtering.
"""
with open(csv_path, "r", encoding="utf-8") as f:
lines = f.readlines()
if len(lines) < 2:
raise ValueError(f"{csv_path} has fewer than 2 header lines. Please check the file.")
header1 = lines[0].strip() # first header line
header2 = lines[1].strip() # second header line with column names
data_lines = lines[2:]
# Check if there is any non-comment, non-empty data line
has_data = any(
(not ln.lstrip().startswith("#")) and ln.strip() != ""
for ln in data_lines
)
# Column names are taken from the second header line (strip leading '# ')
colnames = [c.strip() for c in header2.lstrip("# ").split(";")]
base_text = header1 + "\n" + header2 + "\n"
if not has_data:
# Header-only CSV -> no tracks after filtering
return base_text, 0
# Read data rows with pandas
df = pd.read_csv(csv_path, sep=";", comment="#", header=None, skiprows=2)
if df.shape[0] == 0:
# Safety net: no rows
return base_text, 0
df.columns = colnames
n_rows = len(df)
txt = base_text
for _, row in df.iterrows():
row_str = ";".join(str(row[c]) for c in colnames)
txt += row_str + "\n"
return txt, n_rows
def find_matching_csv(filtered_folder, kymo_name, i):
"""
Try to find the filtered CSV corresponding to a given kymo_name.
1) First, try exact match:
<kymo_name>_blue*.csv
2) If not found, try a 'p'-patched version of the numeric chunk (e.g. 940 -> p940 or p940 -> 940)
"""
# 1) Exact match
pattern = os.path.join(filtered_folder, f"{kymo_name}_blue*.csv")
candidates = glob.glob(pattern)
if len(candidates) == 1:
return candidates[0]
elif len(candidates) > 1:
print(f" [kymo {i}] Multiple CSV matches for {kymo_name} (exact), skipping:")
for c in candidates:
print(f" - {c}")
return None # ambiguous
# 2) Fallback: patch the 3-digit numeric part by adding or removing 'p'
parts = kymo_name.split("_")
alt_candidates = []
for idx, part in enumerate(parts):
# Case A: pure 3-digit number (e.g. "940") -> try "p940"
if part.isdigit() and len(part) == 3:
alt_parts = parts.copy()
alt_parts[idx] = "p" + part
alt_name = "_".join(alt_parts)
alt_pattern = os.path.join(filtered_folder, f"{alt_name}_blue*.csv")
alt_candidates = glob.glob(alt_pattern)
if alt_candidates:
print(
f" [kymo {i}] No exact CSV for '{kymo_name}', "
f"but found match using '{alt_name}'."
)
break
# Case B: starts with 'p' and then 3 digits (e.g. "p940") -> try without 'p'
if part.startswith("p") and part[1:].isdigit() and len(part) == 4:
alt_parts = parts.copy()
alt_parts[idx] = part[1:] # drop the leading 'p'
alt_name = "_".join(alt_parts)
alt_pattern = os.path.join(filtered_folder, f"{alt_name}_blue*.csv")
alt_candidates = glob.glob(alt_pattern)
if alt_candidates:
print(
f" [kymo {i}] No exact CSV for '{kymo_name}', "
f"but found match using '{alt_name}'."
)
break
if len(alt_candidates) == 1:
return alt_candidates[0]
elif len(alt_candidates) > 1:
print(f" [kymo {i}] Multiple CSV matches for patched name, skipping:")
for c in alt_candidates:
print(f" - {c}")
return None
# Nothing found
return None
def main():
args = parse_args()
merged_lake_folder = args.merged_lake_folder
filtered_folder = args.filtered_folder
output_folder = args.output_folder
os.makedirs(output_folder, exist_ok=True)
# Global counters across all lakes
total_case1 = 0 # case1: CSV found & n_rows>0 → tracks updated (kymo kept)
total_case2 = 0 # case2: CSV exists, but no tracks remain after filtering (empty or error) → kymo removed
total_case3 = 0 # case3: no matching CSV → kymo removed
total_extra = 0 # extra: kymo without data/tracks/blue → removed
# Detailed lists of sample names (lake, kymo, ...)
case1_kymos = [] # (lake_file, kymo_name, csv_path)
case2_kymos = [] # (lake_file, kymo_name, csv_path, reason)
case3_kymos = [] # (lake_file, kymo_name)
extra_kymos = [] # (lake_file, kymo_name)
used_csv_paths = set() # CSVs that were actually matched to some kymo
# Loop over all merged .lake files
for lake_path in glob.glob(os.path.join(merged_lake_folder, "*.lake")):
base = os.path.basename(lake_path)
print(f"\n=== Processing lake file: {base} ===")
# per-lake list of removed kymograph names
removed_kymo_names = set()
# Load JSON from .lake file
with open(lake_path, "r", encoding="utf-8") as f:
lake = json.load(f)
old_kymos = lake.get("kymos", [])
new_kymos = [] # we will build a filtered list here
# Iterate over all kymos in this lake
for i, kymo in enumerate(old_kymos):
# Extract kymograph name from address.path (last segment of the path)
addr = kymo.get("address", {})
path = addr.get("path", "")
kymo_name = path.split("/")[-1] if path else None
if not kymo_name:
print(f" [kymo {i}] No valid name/path found, skipping.")
# keep it as-is (very unusual case)
new_kymos.append(kymo)
continue
# Find the corresponding filtered CSV
csv_path = find_matching_csv(filtered_folder, kymo_name, i)
if csv_path is None:
# case3: no CSV → remove kymo
print(
f" [kymo {i}] No suitable CSV found for '{kymo_name}' "
f"in {filtered_folder} → REMOVING kymograph from output lake."
)
total_case3 += 1
case3_kymos.append((base, kymo_name))
removed_kymo_names.add(kymo_name)
continue
csv_name = os.path.basename(csv_path)
used_csv_paths.add(os.path.abspath(csv_path))
# Build the new blue track text from the filtered CSV
try:
blue_text, n_rows = build_blue_text_from_csv(csv_path)
except Exception as e:
# case2: CSV present but not parseable
msg = f"read error: {e}"
print(f" [kymo {i}] Error reading {csv_name}: {msg} → REMOVING kymograph.")
total_case2 += 1
case2_kymos.append((base, kymo_name, csv_path, msg))
removed_kymo_names.add(kymo_name)
continue
if n_rows == 0:
# case2: CSV present but no tracks after filtering
msg = "0 tracks after filtering (header-only CSV)"
print(
f" [kymo {i}] CSV {csv_name} contains no tracks after filtering "
f"→ REMOVING kymograph."
)
total_case2 += 1
case2_kymos.append((base, kymo_name, csv_path, msg))
removed_kymo_names.add(kymo_name)
continue
# If we reach here, we have a non-empty CSV, so this is case1
try:
if "data" in kymo and "tracks" in kymo["data"] and "blue" in kymo["data"]["tracks"]:
kymo["data"]["tracks"]["blue"] = blue_text
new_kymos.append(kymo)
total_case1 += 1
case1_kymos.append((base, kymo_name, csv_path))
print(f" [kymo {i}] Updated blue tracks from {csv_name} (kept).")
else:
# extra: kymo structure has no blue field at all → remove
print(
f" [kymo {i}] Kymo '{kymo_name}' has no data/tracks/blue field "
f"→ REMOVING from output lake."
)
total_extra += 1
extra_kymos.append((base, kymo_name))
removed_kymo_names.add(kymo_name)
except Exception as e:
# treat write problems also as case2
msg = f"write error: {e}"
print(
f" [kymo {i}] Error writing tracks for {kymo_name}: {msg} "
f"→ REMOVING kymograph."
)
total_case2 += 1
case2_kymos.append((base, kymo_name, csv_path, msg))
removed_kymo_names.add(kymo_name)
# Replace kymos list with filtered one (case2/case3/extra removed)
lake["kymos"] = new_kymos
# ------------------------------------------------------
# NEW PART: rebuild file_viewer and experiments[*].dataset
# so that H5 links are consistent with the kept kymos.
# ------------------------------------------------------
kept_kymo_names = set()
file_viewer_files = []
for kymo in new_kymos:
addr = kymo.get("address", {})
path = addr.get("path", "")
file = addr.get("file", "")
if path:
name = path.split("/")[-1]
kept_kymo_names.add(name)
if file and file not in file_viewer_files:
file_viewer_files.append(file)
# 1) Root-level file_viewer: only files from kept kymos
if "file_viewer" in lake:
lake["file_viewer"] = file_viewer_files
# 2) Experiments datasets: keep only entries whose path matches kept kymo
if "experiments" in lake and isinstance(lake["experiments"], dict):
for exp_key, exp in lake["experiments"].items():
if not isinstance(exp, dict):
continue
dataset = exp.get("dataset")
if isinstance(dataset, list):
new_dataset = []
for item in dataset:
if not isinstance(item, dict):
continue
addr = item.get("address", {})
path = addr.get("path", "")
name = path.split("/")[-1] if path else None
if name in kept_kymo_names:
new_dataset.append(item)
exp["dataset"] = new_dataset
# Save updated lake JSON to output folder
out_path = os.path.join(output_folder, base)
with open(out_path, "w", encoding="utf-8") as f:
json.dump(lake, f, indent=4)
print(f"==> {base}: kept {len(new_kymos)} kymos after filtering, written to {out_path}")
# --- Global summary over all lakes ---
print("\n=== Summary over all processed lakes ===")
print(f" case1: updated kymos (CSV found & ≥1 track, kept) = {total_case1}")
print(f" case2: removed kymos (CSV exists, but no tracks remain after filtering) = {total_case2}")
print(f" case3: removed kymos (no matching CSV found) = {total_case3}")
print(f" extra: removed kymos (no data/tracks/blue field) = {total_extra}")
total_kymos = total_case1 + total_case2 + total_case3 + total_extra
print(f" total kymos classified (sum of the above) = {total_kymos}")
# CSV usage check
all_csv_paths = sorted(os.path.abspath(p) for p in glob.glob(os.path.join(filtered_folder, "*_blue*.csv")))
print(f"\nTotal CSV files in filtered_folder: {len(all_csv_paths)}")
print(f"CSV files actually used (matched to some kymo): {len(used_csv_paths)}")
unused_csv = [p for p in all_csv_paths if p not in used_csv_paths]
if unused_csv:
print("\nCSV files NOT used by any kymo (name mismatch / other replicates):")
for p in unused_csv:
print(f" {p}")
else:
print("\nAll CSV files in filtered_folder were used by at least one kymo.")
if __name__ == "__main__":
main()
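For reference, a minimal sketch of the .lake structure that 2_update_lakes.py assumes (only the fields the script reads or rewrites; values are placeholders, and real .lake files contain more):

# Fields accessed by 2_update_lakes.py (placeholder values)
lake_sketch = {
    "kymos": [
        {
            "address": {
                "path": ".../<kymo_name>",  # last path segment is matched against the filtered CSV names
                "file": "<original>.h5",    # collected into file_viewer for kept kymos
            },
            "data": {"tracks": {"blue": "<blue track text, replaced from the filtered CSV>"}},
        }
    ],
    "file_viewer": ["<original>.h5"],       # rebuilt to list only .h5 files of kept kymos
    "experiments": {
        "<experiment>": {
            "dataset": [{"address": {"path": ".../<kymo_name>"}}]  # entries kept only for retained kymos
        }
    },
}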