#!/usr/bin/env python3
"""
cull.py — fast dailies triage for a folder of video clips.

Scans every video file under a directory, runs ffmpeg's blackdetect +
silencedetect filters, samples peak audio level, and writes:

  cull-report.md  — human-readable markdown sorted by severity
  cull-report.csv — same data, sortable in any spreadsheet

Flags any clip with:
  - Black frames totalling > 1 second
  - Audio silence segments > 5 seconds
  - Audio peak clipping (samples at 0 dBFS / -0.1 dBFS)
  - File size < 10 MB (likely accidental short take)
  - File duration < 2 seconds

Local, free. Needs ffmpeg + ffprobe on PATH (brew install ffmpeg).

Usage:
  python3 cull.py /path/to/dailies
  python3 cull.py ~/Footage/Wedding-2026-05 --out cull-2026-05.md

Python 3.9+. Standard library only.
"""
from __future__ import annotations

import argparse
import csv
import json
import re
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path

VIDEO_EXTS = {".mov", ".mp4", ".mxf", ".avi", ".mkv", ".braw", ".r3d", ".m4v"}
SILENCE_DB = "-40dB"          # treat below this as silence
SILENCE_MIN = 5.0             # min run-length in seconds to flag
BLACK_MIN = 1.0               # min black frame run-length to flag
TINY_BYTES = 10 * 1024 * 1024
TINY_SECONDS = 2.0
CLIP_PEAK_DB = -0.1           # max_volume above this = clipping


@dataclass
class ClipReport:
    """Scan results for a single clip, plus a derived triage score."""

    path: Path
    duration: float = 0.0   # seconds; 0.0 when probing failed
    size_bytes: int = 0
    black_runs: list[tuple[float, float]] = field(default_factory=list)    # (start, end) pairs
    silence_runs: list[tuple[float, float]] = field(default_factory=list)  # (start, end) pairs
    peak_db: float = -100.0  # volumedetect max_volume in dBFS
    error: str = ""          # non-empty when the scan failed

    @property
    def total_black(self) -> float:
        """Summed length (seconds) of all detected black-frame runs."""
        return sum(e - s for s, e in self.black_runs)

    @property
    def total_silence(self) -> float:
        """Summed length (seconds) of all detected silence runs."""
        return sum(e - s for s, e in self.silence_runs)

    @property
    def severity(self) -> int:
        """Heuristic triage score; 0 means the clip looks clean."""
        score = 100 if self.error else 0
        if 0 < self.duration < TINY_SECONDS:
            score += 50
        if self.size_bytes < TINY_BYTES:
            score += 20
        if self.total_black > BLACK_MIN:
            score += int(self.total_black * 5)
        if self.duration > 0 and self.total_silence > SILENCE_MIN:
            # weight silence by the fraction of the clip it covers
            score += int((self.total_silence / self.duration) * 30)
        if self.peak_db > CLIP_PEAK_DB:
            score += 30
        return score


def ffprobe(path: Path) -> tuple[float, int]:
    """Return (duration_seconds, size_bytes) for *path* via ffprobe.

    Never raises: if ffprobe is missing, times out, or emits unparseable
    output, the affected field degrades to 0 (with the on-disk file size
    used as a fallback for size).
    """
    try:
        r = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries",
             "format=duration,size", "-of", "json", str(path)],
            capture_output=True, text=True, timeout=30,
        )
        fmt = json.loads(r.stdout or "{}").get("format", {})
    except Exception:
        fmt = {}
    # ffprobe reports duration as "N/A" for some containers; don't let a bad
    # duration discard a successfully-parsed size (previous behavior).
    try:
        duration = float(fmt.get("duration", 0))
    except (TypeError, ValueError):
        duration = 0.0
    try:
        size = int(fmt.get("size", 0))
    except (TypeError, ValueError):
        size = 0
    if size == 0:
        # fall back to the filesystem when ffprobe gave us nothing
        try:
            size = path.stat().st_size
        except OSError:
            size = 0
    return duration, size


def scan_clip(path: Path) -> ClipReport:
    """Probe *path* and run ffmpeg's blackdetect + silencedetect + volumedetect.

    Returns a ClipReport; on any failure the report's ``error`` field is set
    and the analysis fields are left at their defaults.
    """
    report = ClipReport(path=path)
    try:
        report.duration, report.size_bytes = ffprobe(path)
    except Exception as e:
        report.error = f"probe failed: {e}"
        return report

    if report.duration == 0:
        report.error = "zero duration"
        return report

    try:
        r = subprocess.run(
            [
                "ffmpeg", "-hide_banner", "-nostats", "-i", str(path),
                "-af", f"silencedetect=noise={SILENCE_DB}:d={SILENCE_MIN},volumedetect",
                "-vf", f"blackdetect=d={BLACK_MIN}:pic_th=0.98",
                "-f", "null", "-",
            ],
            capture_output=True, text=True,
            # generous floor: decode normally runs faster than realtime
            timeout=max(180, int(report.duration * 1.5)),
        )
        stderr = r.stderr
    except subprocess.TimeoutExpired:
        report.error = "ffmpeg timeout"
        return report
    except Exception as e:
        report.error = f"ffmpeg failed: {e}"
        return report

    # blackdetect logs "black_start:X black_end:Y ..." on a single line
    for m in re.finditer(
        r"blackdetect.*black_start:(\d+\.?\d*)\s+black_end:(\d+\.?\d*)", stderr
    ):
        report.black_runs.append((float(m.group(1)), float(m.group(2))))

    # silencedetect logs silence_start and silence_end on separate lines;
    # pair them up in order of appearance
    silence_starts: list[float] = []
    for line in stderr.splitlines():
        if "silence_start" in line:
            m = re.search(r"silence_start:\s*(-?\d+\.?\d*)", line)
            if m:
                silence_starts.append(float(m.group(1)))
        elif "silence_end" in line:
            m = re.search(r"silence_end:\s*(\d+\.?\d*)", line)
            if m and silence_starts:
                report.silence_runs.append((silence_starts.pop(0), float(m.group(1))))

    # Fix: silence that runs to EOF emits silence_start but never silence_end;
    # close any leftover starts at the clip's duration so they still count.
    for start in silence_starts:
        if start < report.duration:
            report.silence_runs.append((start, report.duration))

    # volumedetect reports the loudest sample as "max_volume: X dB"
    m = re.search(r"max_volume:\s*(-?\d+\.?\d*)\s*dB", stderr)
    if m:
        report.peak_db = float(m.group(1))

    return report


def format_md(reports: list[ClipReport]) -> str:
    """Render the scan results as a markdown report (trailing newline included)."""
    ordered = sorted(reports, key=lambda rep: -rep.severity)
    flagged = [rep for rep in ordered if rep.severity > 0]
    clean = [rep for rep in ordered if rep.severity == 0]

    out: list[str] = [
        f"# Cull report — {len(reports)} clips",
        f"- Flagged: {len(flagged)}",
        f"- Clean: {len(clean)}",
        "",
    ]
    if flagged:
        out += [
            "## Flagged (sorted by severity)",
            "",
            "| Clip | Duration | Size | Issues |",
            "|---|---|---|---|",
        ]
        for rep in flagged:
            out.append(
                f"| `{rep.path.name}` | {rep.duration:.1f}s | "
                f"{rep.size_bytes / 1024 / 1024:.0f} MB | {_issue_summary(rep)} |"
            )
    if clean:
        out.append("")
        out.append(f"## Clean ({len(clean)})")
        out.append("")
        for rep in clean:
            out.append(
                f"- `{rep.path.name}` — {rep.duration:.1f}s, "
                f"{rep.size_bytes / 1024 / 1024:.0f} MB, peak {rep.peak_db:.1f} dBFS"
            )
    return "\n".join(out) + "\n"


def _issue_summary(rep: ClipReport) -> str:
    """Return a '; '-joined description of every issue flagged on *rep* ('—' if none)."""
    issues: list[str] = []
    if rep.error:
        issues.append(f"ERROR: {rep.error}")
    if 0 < rep.duration < TINY_SECONDS:
        issues.append(f"too short ({rep.duration:.1f}s)")
    if rep.size_bytes < TINY_BYTES:
        issues.append(f"tiny file ({rep.size_bytes / 1024 / 1024:.1f} MB)")
    if rep.total_black > BLACK_MIN:
        issues.append(f"black frames {rep.total_black:.1f}s")
    if rep.total_silence > SILENCE_MIN and rep.duration > 0:
        pct = (rep.total_silence / rep.duration) * 100
        issues.append(f"silence {rep.total_silence:.0f}s ({pct:.0f}%)")
    if rep.peak_db > CLIP_PEAK_DB:
        issues.append(f"audio clipping ({rep.peak_db:.2f} dBFS)")
    return "; ".join(issues) or "—"


def format_csv_rows(reports: list[ClipReport]):
    """Yield CSV rows — header first, then one row per clip, worst severity first."""
    yield ["path", "duration_s", "size_mb", "severity",
           "black_total_s", "silence_total_s", "peak_dbfs", "error"]
    # sorted() is stable even with reverse=True, so ties keep input order
    ranked = sorted(reports, key=lambda rep: rep.severity, reverse=True)
    for rep in ranked:
        size_mb = rep.size_bytes / 1024 / 1024
        yield [
            str(rep.path),
            f"{rep.duration:.2f}",
            f"{size_mb:.1f}",
            rep.severity,
            f"{rep.total_black:.2f}",
            f"{rep.total_silence:.2f}",
            f"{rep.peak_db:.2f}",
            rep.error,
        ]


def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Returns a shell exit code: 0 on success, 1 when no clips were found,
    2 for a bad folder argument.
    """
    # __doc__ is None under `python -OO`; don't crash while building help text
    summary = (__doc__ or "").split("\n\n")[0]
    parser = argparse.ArgumentParser(prog="cull", description=summary)
    parser.add_argument("folder", type=Path, help="Directory of video clips to scan")
    parser.add_argument("--out", type=Path, default=None, help="Output markdown path. Default: <folder>/cull-report.md")
    parser.add_argument("--csv", type=Path, default=None, help="Output CSV path. Default: <folder>/cull-report.csv")
    parser.add_argument("--workers", type=int, default=4, help="Parallel ffmpeg workers. Default 4.")
    parser.add_argument("--recursive", action="store_true", help="Recurse into subfolders")
    args = parser.parse_args(argv)

    if not args.folder.is_dir():
        print(f"cull: not a directory: {args.folder}", file=sys.stderr)
        return 2

    pattern = "**/*" if args.recursive else "*"
    clips = sorted(p for p in args.folder.glob(pattern)
                   if p.is_file() and p.suffix.lower() in VIDEO_EXTS)

    if not clips:
        print(f"cull: no video files found in {args.folder}", file=sys.stderr)
        print(f"  Looked for: {', '.join(sorted(VIDEO_EXTS))}", file=sys.stderr)
        return 1

    # ThreadPoolExecutor raises ValueError on max_workers <= 0; clamp instead
    # of crashing deep inside the pool on e.g. `--workers 0`.
    workers = max(1, args.workers)
    print(f"cull: scanning {len(clips)} clip(s) with {workers} workers...")
    reports: list[ClipReport] = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(scan_clip, c): c for c in clips}
        for i, fut in enumerate(as_completed(futures), 1):
            r = fut.result()
            reports.append(r)
            flag = "⚠" if r.severity > 0 else "·"
            print(f"  [{i}/{len(clips)}] {flag} {r.path.name}  severity={r.severity}")

    md_path = args.out or args.folder / "cull-report.md"
    csv_path = args.csv or args.folder / "cull-report.csv"
    md_path.write_text(format_md(reports), encoding="utf-8")
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(format_csv_rows(reports))
    flagged = sum(1 for r in reports if r.severity > 0)
    print(f"\ncull: wrote {md_path} + {csv_path}")
    print(f"  {flagged} of {len(reports)} clip(s) flagged.")
    return 0


if __name__ == "__main__":
    # SystemExit carries main()'s return code to the shell, same as sys.exit().
    raise SystemExit(main())
