#!/bin/bash
# ai.openclaw.funnel.health — per-service watchdog for the Funnel pipeline.
# Mirrors gateway-watchdog conventions. Deterministic; no agent cognition (THR-87).
#
# Runs three independent checks (scheduler liveness, capture-daemon liveness,
# deferred-sidecar backlog), logs each result to the watchdog log, and sends
# cooldown-limited Telegram alerts when a check fails. Normal completion and
# the lock-skip paths all exit 0.
#
# set -uo pipefail WITHOUT -e so one failing check can't abort the rest.
set -uo pipefail
# Pin PATH explicitly (Homebrew first, then the system directories) instead of
# relying on whatever environment the caller provides.
export PATH="/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin"

# Watchdog's own log. LOG_DIR also holds the single-instance lock directory
# and the per-family alert cooldown state files.
LOG_DIR="/tmp/openclaw"
LOG_FILE="$LOG_DIR/funnel-health.log"
mkdir -p "$LOG_DIR"

# Passed as --target to `openclaw message send --channel telegram`.
TELEGRAM_TARGET="8032472383"
# launchd label of the pipeline job, and the service target handed to
# `launchctl kickstart -k` when the scheduler looks stuck.
PIPELINE_LABEL="ai.openclaw.funnel.pipeline"
PIPELINE_KICK="gui/${UID}/${PIPELINE_LABEL}"

# Files under the Funnel home that the checks inspect:
#   PIPELINE_LOG     — its modification time is the scheduler liveness signal
#   DEFERRED_DEDUP / DEFERRED_WRITES — their line counts are summed as the backlog
#   PAUSE_FILE       — when present, the scheduler check is skipped entirely
FUNNEL_HOME="/Users/openclaw/openclaw/funnel"
PIPELINE_LOG="$FUNNEL_HOME/pipeline.log"
DEFERRED_DEDUP="$FUNNEL_HOME/deferred-dedup.jsonl"
DEFERRED_WRITES="$FUNNEL_HOME/deferred-writes.jsonl"
PAUSE_FILE="$FUNNEL_HOME/.funnel-paused"

# Thresholds:
#   STALE_SECS          — pipeline.log older than this (2700s = 45 min) triggers a kickstart
#   SIDECAR_BACKLOG_MAX — alert when the summed deferred line count exceeds this
#   ALERT_COOLDOWN_SECS — minimum gap (1800s = 30 min) between delivered alerts per family
#   SEND_TIMEOUT_SECS   — alarm, in seconds, placed on each Telegram send
STALE_SECS=2700
SIDECAR_BACKLOG_MAX=20
ALERT_COOLDOWN_SECS=1800
SEND_TIMEOUT_SECS=30

# Print the current UTC time as an ISO-8601 stamp (e.g. 2024-01-02T03:04:05Z).
timestamp() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
# Append one line to LOG_FILE: a timestamp, a space, then all arguments joined.
log() {
  local stamp
  stamp="$(timestamp)"
  echo "$stamp $*" >> "$LOG_FILE"
}
# Filter stdin to stdout, replacing anything that looks like a credential
# (sk-… keys, eyJ… tokens, "Bearer …" values, key=… parameters) with <redacted>.
# A single alternation is used so the leftmost match wins on overlapping input.
redact() {
  local secret_re='(sk-[A-Za-z0-9_-]+|eyJ[A-Za-z0-9._-]+|Bearer [A-Za-z0-9._-]+|key=[A-Za-z0-9._-]+)'
  sed -E "s/${secret_re}/<redacted>/g"
}

# file_age_secs PATH
# Print the number of seconds since PATH was last modified.
#   - PATH does not exist          -> prints 999999
#   - mtime cannot be determined   -> mtime is treated as 0 (age = seconds since epoch)
# Tries BSD/macOS `stat -f %m` first; if that does not yield a plain integer
# (e.g. GNU stat, where -f means "filesystem" and prints unrelated text), falls
# back to GNU `stat -c %Y`. The result is validated before it reaches the
# arithmetic so a non-numeric mtime can never abort the caller.
file_age_secs() {
  local f mt now
  f="$1"
  if [ ! -e "$f" ]; then echo 999999; return; fi

  mt="$(stat -f %m "$f" 2>/dev/null)" || mt=""
  case "$mt" in
    ''|*[!0-9]*) mt="$(stat -c %Y "$f" 2>/dev/null)" || mt="" ;;
  esac
  # Neither stat flavour produced an integer: keep the original "mtime 0" fallback.
  case "$mt" in
    ''|*[!0-9]*) mt=0 ;;
  esac

  now="$(date +%s)"
  echo $(( now - mt ))
}

# Single-instance guard. mkdir is the atomic "take the lock" step; a lock
# directory older than 120s is assumed to belong to a run that died and is
# reclaimed. Every path that fails to take the lock logs a SKIP and exits 0.
LOCKDIR="$LOG_DIR/funnel-health.lock"
if ! mkdir "$LOCKDIR" 2>/dev/null; then
  lock_age="$(file_age_secs "$LOCKDIR")"
  if ! [ "$lock_age" -gt 120 ]; then
    log "SKIP: previous run still active (lock ${lock_age}s)"
    exit 0
  fi
  log "stale lock (${lock_age}s) — reclaiming"
  rmdir "$LOCKDIR" 2>/dev/null || true
  if ! mkdir "$LOCKDIR" 2>/dev/null; then
    log "SKIP: lock contended after reclaim"
    exit 0
  fi
fi
# Installed only once the lock is ours, so the SKIP exits above never remove
# a lock held by another run.
trap 'rmdir "$LOCKDIR" 2>/dev/null || true' EXIT

# send_telegram MESSAGE
# Send MESSAGE to TELEGRAM_TARGET through `openclaw message send`.
# perl arms an alarm of SEND_TIMEOUT_SECS and then exec()s the command, so the
# alarm carries over to the sender process. Returns the resulting exit status.
send_telegram() {
  local text="$1"
  local -a send_cmd=(
    openclaw message send
    --channel telegram
    --target "$TELEGRAM_TARGET"
    --message "$text"
  )
  /usr/bin/perl -e 'alarm shift; exec @ARGV' "$SEND_TIMEOUT_SECS" "${send_cmd[@]}"
}

# telegram_alert FAMILY MESSAGE
# Send MESSAGE to Telegram, rate-limited per alert FAMILY.
#   - The time of the last *delivered* alert for FAMILY is kept in
#     $LOG_DIR/funnel-health.alert.FAMILY (epoch seconds).
#   - If less than ALERT_COOLDOWN_SECS has passed since then, the alert is only
#     logged as suppressed.
#   - The state file is updated only when the send succeeds, so a failed or
#     timed-out send is attempted again on the next run.
#   - Log lines carry the redacted message; the unredacted MESSAGE is what is sent.
# Always returns 0 so an alerting problem never affects the calling check.
telegram_alert() {
  local family msg now last statef shown
  family="$1"; msg="$2"
  statef="$LOG_DIR/funnel-health.alert.$family"
  now="$(date +%s)"
  last=0
  if [ -f "$statef" ]; then
    last="$(cat "$statef" 2>/dev/null || echo 0)"
  fi
  # An empty or corrupt state file (e.g. truncated by a failed write) must not
  # break the arithmetic below; treat it as "never alerted".
  case "$last" in
    ''|*[!0-9]*) last=0 ;;
  esac

  shown="$(printf '%s' "$msg" | redact)"
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  if [ $(( now - 10#$last )) -lt "$ALERT_COOLDOWN_SECS" ]; then
    log "alert suppressed (cooldown $family): $shown"
    return 0
  fi

  log "ALERT[$family] -> telegram: $shown"
  if send_telegram "$msg" >> "$LOG_FILE" 2>&1; then
    echo "$now" > "$statef"
    log "ALERT[$family] delivered"
  else
    log "telegram send FAILED or TIMED OUT ($family) — will retry next tick"
  fi
  return 0
}

# Print the combined line count of the two deferred sidecar files
# (DEFERRED_DEDUP and DEFERRED_WRITES). A file that is missing, or whose count
# cannot be read, contributes 0.
sidecar_lines() {
  # f is local so the loop variable cannot clobber a caller's variable.
  local total=0 n f
  for f in "$DEFERRED_DEDUP" "$DEFERRED_WRITES"; do
    if [ -f "$f" ]; then
      # tr strips the space padding some wc implementations emit.
      n="$(wc -l < "$f" 2>/dev/null | tr -d ' ')"
      total=$(( total + ${n:-0} ))
    fi
  done
  echo "$total"
}

# Check 1: scheduler liveness (skipped while intentionally paused).
# pipeline.log's age is the liveness signal. When it exceeds STALE_SECS the
# pipeline job is kickstarted, given 15s, and the age is measured again to
# decide between a RECOVERED and a CRITICAL alert.
if ! [ -f "$PAUSE_FILE" ]; then
  age="$(file_age_secs "$PIPELINE_LOG")"
  if ! [ "$age" -gt "$STALE_SECS" ]; then
    log "OK scheduler: pipeline.log age=${age}s"
  else
    log "STALE pipeline.log age=${age}s > ${STALE_SECS}s — scheduler stuck; kickstarting"
    if ! launchctl kickstart -k "$PIPELINE_KICK" >> "$LOG_FILE" 2>&1; then
      log "kickstart nonzero (label not loaded?)"
    fi
    sleep 15
    age2="$(file_age_secs "$PIPELINE_LOG")"
    if ! [ "$age2" -lt "$STALE_SECS" ]; then
      telegram_alert scheduler "[funnel-health] CRITICAL · scheduler stuck (pipeline.log ${age}s stale); kickstart did not restore — manual check needed"
    else
      log "RECOVERED pipeline.log age=${age2}s after kickstart"
      telegram_alert scheduler "[funnel-health] RECOVERED · scheduler was stale (${age}s), kickstarted, ran again"
    fi
  fi
else
  log "PAUSED: $PAUSE_FILE present — skipping scheduler check (no kickstart, no stale alert)"
fi

# Check 2: capture daemon (imsg-watch) liveness.
# Any process whose full command line matches 'imsg.*watch' counts as alive;
# this check only logs and alerts, it does not restart anything itself.
if ! pgrep -f 'imsg.*watch' >/dev/null 2>&1; then
  log "CAPTURE DOWN: no imsg-watch process"
  telegram_alert capture "[funnel-health] ALERT · capture daemon (imsg-watch) down — KeepAlive should relaunch; flagging if it persists"
else
  log "OK capture: imsg-watch alive"
fi

# Check 3: deferred-sidecar backlog (Notion/embedding outage).
# Alerts when the combined line count of the deferred files exceeds
# SIDECAR_BACKLOG_MAX.
sc="$(sidecar_lines)"
if ! [ "$sc" -gt "$SIDECAR_BACKLOG_MAX" ]; then
  log "OK sidecars: ${sc} deferred lines"
else
  log "SIDECAR BACKLOG: ${sc} deferred lines > ${SIDECAR_BACKLOG_MAX}"
  telegram_alert sidecar "[funnel-health] ALERT · deferred backlog ${sc} entries — Notion/embedding outage; deferred-drain not clearing"
fi

# All checks ran; report success regardless of what they found.
exit 0
