#!/usr/bin/env bash
# Gateway watchdog: shared settings used by every later section of the script.
set -o errexit -o nounset -o pipefail

# Fixed search path so the tools are resolved the same way regardless of the
# caller's environment.
PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin"

# Alert destination.
TELEGRAM_CHANNEL="telegram"
TELEGRAM_TARGET="8032472383"

# launchctl service target for the current user: gui/<uid>/<label>.
SERVICE_LABEL="ai.openclaw.gateway"
if [[ -n "${UID:-}" ]]; then
  UID_NUM="$UID"
else
  UID_NUM="$(id -u)"
fi
LAUNCHCTL_TARGET="gui/${UID_NUM}/${SERVICE_LABEL}"

# Seconds to wait after each recovery step; overridable from the environment.
: "${SLEEP_SECONDS:=15}"

# Log location; the directory is created on every run if it is missing.
LOG_DIR="/tmp/openclaw"
LOG_FILE="${LOG_DIR}/gateway-watchdog.log"
mkdir -p "$LOG_DIR"

# Print the current time as "YYYY-MM-DD HH:MM:SS TZ" on stdout.
timestamp() {
  local fmt='%Y-%m-%d %H:%M:%S %Z'
  date "+${fmt}"
}

#######################################
# Append one timestamped line to the watchdog log.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text (an omitted argument logs an empty message)
# Outputs:   "[<timestamp>] <message>" appended to LOG_FILE
#######################################
log() {
  # ${1-} keeps a call without arguments from aborting the script under `set -u`.
  local message="${1-}"
  printf '[%s] %s\n' "$(timestamp)" "$message" >> "$LOG_FILE"
}

# Check gateway reachability via `openclaw gateway probe`, discarding all of
# its output. Returns 0 when the probe succeeds and 1 for any probe failure.
probe_gateway() {
  openclaw gateway probe >/dev/null 2>&1 || return 1
  return 0
}

#######################################
# Send an alert through the openclaw messaging CLI.
# A failed send is written to the log instead of being propagated, so alerting
# problems do not abort the watchdog.
# Globals:   TELEGRAM_CHANNEL, TELEGRAM_TARGET (read)
# Arguments: $1 - message text
#######################################
telegram_alert() {
  local message="$1"
  if ! openclaw message send --channel "$TELEGRAM_CHANNEL" --target "$TELEGRAM_TARGET" --message "$message" >/dev/null 2>&1; then
    log "Failed to send Telegram alert: $message"
  fi
}

#######################################
# Run one recovery step, wait, then re-probe the gateway.
# If the gateway is reachable afterwards, an alert is sent and the whole
# script exits 0; otherwise the function returns so the caller can escalate.
# Globals:   SLEEP_SECONDS (read)
# Arguments: $1 - step number used in log lines
#            $2 - human-readable description of the step
#            $3 - command line to execute (run through eval)
#            $4 - alert text prefix; the current timestamp is appended
#######################################
attempt_step() {
  local step="$1"
  local description="$2"
  local command="$3"
  local alert_text="$4"

  log "Step ${step}: ${description}"
  # NOTE(review): eval executes $command as shell code; only pass command
  # strings that are built inside this script, never external input.
  if eval "$command" >/dev/null 2>&1; then
    log "Step ${step} command completed"
  else
    # The command's status is informational only; the probe below decides.
    log "Step ${step} command returned non-zero (ignored for escalation)"
  fi
  sleep "$SLEEP_SECONDS"
  if probe_gateway; then
    telegram_alert "$alert_text $(timestamp)"
    log "Gateway recovered during step ${step}"
    exit 0
  fi
  log "Gateway still unreachable after step ${step}"
}

# 1. Tailscale health check (always runs first)
# The Python one-liner exits 0 only when the JSON read from stdin has
# BackendState == "Running"; any other outcome of the pipeline takes the
# alert branch.
ts_state_check="import sys,json; d=json.load(sys.stdin); sys.exit(0 if d.get('BackendState')=='Running' else 1)"
if tailscale status --json 2>/dev/null | python3 -c "$ts_state_check"; then
  : # Tailscale is running; nothing to report.
else
  telegram_alert "[TS WARN] Tailscale is not running. Webhook delivery will fail. $(timestamp)"
  log "Tailscale not running — alert sent"
fi

# 2. OAuth status check
# Query the model listing once and derive both indicators from that single
# snapshot, so they cannot disagree because of two separate CLI invocations.
# A failing `openclaw models` yields empty output and is reported as
# "unreadable" below rather than aborting the script.
models_output=$(openclaw models 2>/dev/null || true)
# `|| true`: grep exits non-zero when nothing matches, which is an expected
# outcome here and must not trip `set -e`.
OAUTH_STATUS=$(grep -F -- "openai-codex usage" <<<"$models_output" || true)
OAUTH_DETAIL=$(grep -F -- "openai-codex:default" <<<"$models_output" || true)
if [[ -z "$OAUTH_STATUS" && -z "$OAUTH_DETAIL" ]]; then
  telegram_alert "[AUTH WARN] openai-codex OAuth status unreadable. Verify auth/routing state. $(timestamp)"
  log "OAuth status unreadable — alert sent"
elif grep -qE '(^|[^0-9])0% left([^0-9]|$)' <<<"$OAUTH_STATUS" \
  || grep -qE 'expired|error|unauthorized|invalid' <<<"$OAUTH_DETAIL"; then
  # The digit guards keep "10% left", "20% left", ... from matching "0% left".
  telegram_alert "[AUTH WARN] openai-codex usage confirmed exhausted or auth invalid. Verify whether requests are degrading or rerouting. $(timestamp)"
  log "OAuth usage confirmed exhausted or auth invalid — alert sent"
fi


# 3. Gateway probe
log "Gateway watchdog run started"
if ! probe_gateway; then
  log "Gateway unreachable; entering escalation sequence"

  # 4. Escalation sequence: soft restart, then launchctl kickstart, then
  # daemon reinstall. attempt_step exits the script with status 0 as soon as
  # the gateway answers a probe, so reaching the end means every step failed.
  kickstart_cmd="launchctl kickstart -k $LAUNCHCTL_TARGET"
  attempt_step "1" "openclaw gateway restart" "openclaw gateway restart" "[GW INFO] Gateway recovered via soft restart at"
  attempt_step "2" "$kickstart_cmd" "$kickstart_cmd" "[GW WARN] Gateway recovered via launchctl kickstart at"
  attempt_step "3" "openclaw daemon install" "openclaw daemon install" "[GW WARN] Gateway recovered via daemon reinstall at"

  log "Gateway unrecoverable after escalation"
  telegram_alert "[GW CRITICAL] Gateway unrecoverable after full escalation sequence. Manual intervention required. $(timestamp)"
  exit 1
fi

log "Gateway reachable; exiting"
exit 0
