#!/bin/bash
# ai.openclaw.funnel.health — per-service watchdog for the Funnel pipeline.
# Mirrors gateway-watchdog conventions. Deterministic; no agent cognition (THR-87).
# NOTE: set -uo pipefail WITHOUT -e so one failing check can't abort the rest.
# Shell options: -u makes unset variables an error and pipefail makes a
# pipeline fail when any stage fails. -e is deliberately not enabled, so a
# failing command does not end the script before the remaining checks run.
set -u
set -o pipefail

# Fixed command search path for everything this script runs.
PATH="/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin"
export PATH

# --- Funnel pipeline files inspected by the checks ---
FUNNEL_HOME="/Users/openclaw/openclaw/funnel"
PIPELINE_LOG="${FUNNEL_HOME}/pipeline.log"
DEFERRED_DEDUP="${FUNNEL_HOME}/deferred-dedup.jsonl"
DEFERRED_WRITES="${FUNNEL_HOME}/deferred-writes.jsonl"

# --- launchd job that gets kickstarted when the scheduler looks stuck ---
PIPELINE_LABEL="ai.openclaw.funnel.pipeline"
PIPELINE_KICK="gui/${UID}/${PIPELINE_LABEL}"

# --- Alert destination ---
TELEGRAM_TARGET="8032472383"

# --- Thresholds ---
# 3 missed 15-min ticks => scheduler stuck (2700 s).
STALE_SECS=$(( 3 * 15 * 60 ))
# Deferred lines above this => outage backlog.
SIDECAR_BACKLOG_MAX=20
# Per-condition re-alert at most every 30 min (1800 s).
ALERT_COOLDOWN_SECS=$(( 30 * 60 ))

# --- This watchdog's own log (the directory is created if missing) ---
LOG_DIR="/tmp/openclaw"
LOG_FILE="${LOG_DIR}/funnel-health.log"
mkdir -p "$LOG_DIR"

# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ on stdout.
timestamp() {
  local fmt='+%Y-%m-%dT%H:%M:%SZ'
  date -u "$fmt"
}
# Append one line to $LOG_FILE: the current timestamp, a space, then all
# arguments joined into a single string.
log() {
  local stamp
  stamp="$(timestamp)"
  printf '%s %s\n' "$stamp" "$*" >> "$LOG_FILE"
}
# Filter stdin -> stdout, replacing anything that looks like a credential
# with the literal text <redacted>. Recognised shapes:
#   sk-...        keys starting with "sk-"
#   eyJ...        tokens starting with "eyJ"
#   Bearer <tok>  bearer credentials
#   key=...       key=value parameters
# The Bearer token class also covers '~', '+', '/' and trailing '=' padding,
# so a base64-style token is removed whole instead of leaving its tail in the
# log. '#' is the sed delimiter because the pattern itself contains '/'.
redact() {
  sed -E 's#(sk-[A-Za-z0-9_-]+|eyJ[A-Za-z0-9._-]+|Bearer [A-Za-z0-9._~+/-]+=*|key=[A-Za-z0-9._-]+)#<redacted>#g'
}

#######################################
# Send a Telegram alert, rate-limited per condition family.
# Globals:   LOG_DIR, LOG_FILE, ALERT_COOLDOWN_SECS, TELEGRAM_TARGET (read)
# Arguments: $1 - family; selects the cooldown state file
#                 $LOG_DIR/funnel-health.alert.<family>
#            $2 - message text to send
# Outputs:   appends to $LOG_FILE (the logged copy of the message is passed
#            through redact); after a successful send, rewrites the state
#            file with the send time in epoch seconds
# Returns:   0 always; a failed send is logged, not propagated
#######################################
telegram_alert() {
  local family msg now last statef elapsed safe_msg
  family="$1"; msg="$2"
  statef="$LOG_DIR/funnel-health.alert.$family"
  now="$(date +%s)"

  # Read the previous send time. Anything that is not a plain run of digits
  # (empty, truncated or corrupted file) counts as "never sent": feeding it
  # to $(( )) would be a syntax error, or an unbound-variable abort under
  # set -u, and the alert would be lost.
  last=""
  if [ -f "$statef" ]; then
    { IFS= read -r last < "$statef"; } 2>/dev/null || true
  fi
  [[ "$last" =~ ^[0-9]{1,15}$ ]] || last=0

  # 10# forces base 10 so a stamp with leading zeros is not read as octal.
  elapsed=$(( now - 10#$last ))
  safe_msg="$(printf '%s' "$msg" | redact)"

  # A stamp in the future (elapsed < 0) is treated as expired rather than
  # suppressing alerts until the clock catches up with it.
  if [ "$elapsed" -ge 0 ] && [ "$elapsed" -lt "$ALERT_COOLDOWN_SECS" ]; then
    log "alert suppressed (cooldown $family): $safe_msg"
    return 0
  fi

  log "ALERT[$family] -> telegram: $safe_msg"
  if openclaw message send --channel telegram --target "$TELEGRAM_TARGET" -m "$msg" >> "$LOG_FILE" 2>&1; then
    # Start the cooldown only once the alert has gone out, so a failed send
    # is retried on the next run instead of being silenced for the cooldown.
    printf '%s\n' "$now" > "$statef" || log "cooldown state write FAILED ($family)"
  else
    log "telegram send FAILED ($family)"
  fi
  return 0
}

#######################################
# Print how many seconds ago a file was last modified.
# Arguments: $1 - path to the file
# Outputs:   an integer on stdout:
#              999999          when $1 is not a regular file
#              now - mtime     otherwise (mtime counts as 0 if it cannot be
#                              read, which makes the file look very old)
#######################################
file_age_secs() {
  local f mt now
  f="$1"
  if [ ! -f "$f" ]; then echo 999999; return; fi

  # BSD/macOS stat first; fall back to the GNU form. Each result is accepted
  # only if it is a plain run of digits, so the arithmetic below can never
  # fail and leave the caller with an empty, non-numeric age.
  mt="$(stat -f %m "$f" 2>/dev/null)"
  if ! [[ "$mt" =~ ^[0-9]+$ ]]; then
    mt="$(stat -c %Y "$f" 2>/dev/null)"
    [[ "$mt" =~ ^[0-9]+$ ]] || mt=0
  fi

  now="$(date +%s)"
  echo $(( now - mt ))
}

#######################################
# Print the combined line count of the two deferred sidecar files.
# Globals:   DEFERRED_DEDUP, DEFERRED_WRITES (read)
# Outputs:   one integer on stdout; a file that is missing or whose count
#            cannot be read contributes 0
#######################################
sidecar_lines() {
  # f is local so the loop variable does not leak into the caller's scope.
  local total=0 n f
  for f in "$DEFERRED_DEDUP" "$DEFERRED_WRITES"; do
    [ -f "$f" ] || continue
    n="$({ wc -l < "$f"; } 2>/dev/null | tr -d '[:space:]')"
    # Only a plain number may reach the arithmetic below.
    [[ "$n" =~ ^[0-9]+$ ]] || n=0
    total=$(( total + n ))
  done
  echo "$total"
}

# --- Check 1: scheduler liveness (pipeline.log freshness) ---
# When pipeline.log is older than STALE_SECS: kickstart the launchd job, wait
# 15 s and measure again. A fresh log afterwards is reported as RECOVERED;
# otherwise a CRITICAL alert is raised. Both alerts use the "scheduler"
# cooldown family.
check_scheduler() {
  local age_before age_after
  age_before="$(file_age_secs "$PIPELINE_LOG")"

  # Written as a negated -gt (not -le) so that a test that errors out still
  # takes this branch.
  if ! [ "$age_before" -gt "$STALE_SECS" ]; then
    log "OK scheduler: pipeline.log age=${age_before}s"
    return
  fi

  log "STALE pipeline.log age=${age_before}s > ${STALE_SECS}s — scheduler stuck; kickstarting"
  if ! launchctl kickstart -k "$PIPELINE_KICK" >> "$LOG_FILE" 2>&1; then
    log "kickstart nonzero (label not loaded?)"
  fi
  sleep 15

  age_after="$(file_age_secs "$PIPELINE_LOG")"
  if [ "$age_after" -lt "$STALE_SECS" ]; then
    log "RECOVERED pipeline.log age=${age_after}s after kickstart"
    telegram_alert scheduler "[funnel-health] RECOVERED · scheduler was stale (${age_before}s), kickstarted, ran again"
    return
  fi

  telegram_alert scheduler "[funnel-health] CRITICAL · scheduler stuck (pipeline.log ${age_before}s stale); kickstart did not restore — manual check needed"
}
check_scheduler

# --- Check 2: capture daemon (imsg-watch) liveness ---
# "Alive" means pgrep finds at least one process whose full command line
# matches 'imsg.*watch'; otherwise the outage is logged and a "capture"
# alert is raised.
check_capture() {
  if ! pgrep -f 'imsg.*watch' >/dev/null 2>&1; then
    log "CAPTURE DOWN: no imsg-watch process"
    telegram_alert capture "[funnel-health] ALERT · capture daemon (imsg-watch) down — KeepAlive should relaunch; flagging if it persists"
    return
  fi
  log "OK capture: imsg-watch alive"
}
check_capture

# --- Check 3: deferred-sidecar backlog (Notion/embedding outage) ---
# Compares the combined deferred line count with SIDECAR_BACKLOG_MAX and
# raises a "sidecar" alert when the count is above it.
check_sidecars() {
  local backlog
  backlog="$(sidecar_lines)"

  # Written as a negated -gt (not -le) so that a test that errors out still
  # takes this branch.
  if ! [ "$backlog" -gt "$SIDECAR_BACKLOG_MAX" ]; then
    log "OK sidecars: ${backlog} deferred lines"
    return
  fi

  log "SIDECAR BACKLOG: ${backlog} deferred lines > ${SIDECAR_BACKLOG_MAX}"
  telegram_alert sidecar "[funnel-health] ALERT · deferred backlog ${backlog} entries — Notion/embedding outage; deferred-drain not clearing"
}
check_sidecars

# The exit status is 0 regardless of what the checks found.
exit 0
