#!/usr/bin/env bash
#
# Privacy filter for text files.
#
# Usage: <script> <input.txt> [output.txt] [dry-run|apply]
#   input.txt   file to sanitize (required, must exist)
#   output.txt  destination for the sanitized copy in apply mode (optional)
#   mode        "dry-run" (default) only reports; "apply" writes archive files
set -euo pipefail

WORKDIR="/Users/openclaw/.openclaw/workspace"
# NOTE(review): in this script these two paths are only echoed in the dry-run
# report; nothing here parses them. Confirm whether they are meant to be read.
POLICY="$WORKDIR/knowledge/rules/privacy-policy.yml"
ALLOWLIST="$WORKDIR/knowledge/rules/privacy-allowlist.yml"

IN="${1:-}"
OUT="${2:-}"
MODE="${3:-dry-run}"   # dry-run | apply

if [[ -z "$IN" ]]; then
  echo "Usage: $0 <input.txt> [output.txt] [dry-run|apply]" >&2
  exit 1
fi

if [[ ! -f "$IN" ]]; then
  echo "Input file not found: $IN" >&2
  exit 1
fi

# Reject unknown modes up front. Without this check any value other than
# "dry-run" (for example the typo "dryrun") would silently run in apply mode
# and write files into the archive.
case "$MODE" in
  dry-run | apply) ;;
  *)
    echo "Invalid mode: $MODE (expected dry-run or apply)" >&2
    echo "Usage: $0 <input.txt> [output.txt] [dry-run|apply]" >&2
    exit 1
    ;;
esac

# Make sure the archive and scratch directories exist before anything is
# written into them.
required_dirs=(
  "$WORKDIR/knowledge/archive/quarantine"
  "$WORKDIR/knowledge/archive/sanitized"
  "$WORKDIR/tmp"
)
mkdir -p "${required_dirs[@]}"

# Per-run artifacts share a timestamp (one-second resolution) in their names:
# the hit log and the working copy that the masking rules edit in place.
TS="$(date +%Y%m%d-%H%M%S)"
run_prefix="$WORKDIR/tmp/privacy-filter-$TS"
LOG="${run_prefix}.log"
TMP="${run_prefix}.sanitized.txt"
cp "$IN" "$TMP"

# Running tally of rule hits; both values are updated by log_hit.
HIT_RULES=""
HIT_COUNT=0

#######################################
# Record one rule hit: bump the counter, remember the rule name and append a
# timestamped line to the run log.
# Globals:
#   HIT_COUNT (incremented)
#   HIT_RULES (comma-separated rule names, appended to)
#   LOG       (path of the log file, appended to)
#   IN        (input path, written into the log line)
# Arguments:
#   $1 - rule name
#   $2 - action taken for the rule (e.g. "mask", "quarantine")
#######################################
log_hit() {
  local rule="$1"
  local action="$2"
  local stamp

  HIT_COUNT=$((HIT_COUNT + 1))
  if [[ -z "$HIT_RULES" ]]; then
    HIT_RULES="$rule"
  else
    HIT_RULES="$HIT_RULES,$rule"
  fi

  # `date -Iseconds` is not POSIX and is missing from some `date`
  # implementations; a failure inside an echo argument would not trip `set -e`
  # and would log an empty timestamp. Build the same ISO-8601 form from a
  # plain strftime format instead: %z yields "+hhmm", so insert the colon.
  stamp="$(date +%Y-%m-%dT%H:%M:%S%z)"
  stamp="${stamp%??}:${stamp: -2}"

  printf '[%s] rule=%s action=%s input=%s\n' \
    "$stamp" "$rule" "$action" "$IN" >> "$LOG"
}

# Masking rules (deterministic)
if grep -Eqi 'sk-[a-z0-9]{20,}' "$TMP"; then
  perl -0777 -i -pe 's/sk-[a-z0-9]{20,}/[REDACTED_API_KEY]/ig' "$TMP"
  log_hit "openai_key" "mask"
fi

if grep -Eqi '(api[_-]?key|token|secret|password)\s*[:=]\s*\S+' "$TMP"; then
  perl -0777 -i -pe 's/(api[_-]?key|token|secret|password)\s*[:=]\s*\S+/[REDACTED_SECRET]/ig' "$TMP"
  log_hit "generic_token" "mask"
fi

if grep -Eq '\b\d{3}-?\d{2}-?\d{4}\b' "$TMP"; then
  perl -0777 -i -pe 's/\b\d{3}-?\d{2}-?\d{4}\b/[REDACTED_SSN]/g' "$TMP"
  log_hit "ssn" "mask"
fi

if grep -Eqi '\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b' "$TMP"; then
  perl -0777 -i -pe 's/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/[REDACTED_EMAIL]/ig' "$TMP"
  log_hit "email" "mask"
fi

if grep -Eq '(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*\d{3}\s*\)|\d{3})\s*(?:[.-]\s*)?)?\d{3}\s*(?:[.-]\s*)\d{4}' "$TMP"; then
  perl -0777 -i -pe 's/(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*\d{3}\s*\)|\d{3})\s*(?:[.-]\s*)?)?\d{3}\s*(?:[.-]\s*)\d{4}/[REDACTED_PHONE]/g' "$TMP"
  log_hit "phone" "mask"
fi

# Quarantine triggers

#######################################
# Scan a file for content that must be quarantined instead of published.
# Globals:
#   QUARANTINE (set to 1 when any trigger matches)
# Arguments:
#   $1 - file to scan
# Returns:
#   0 after scanning, 2 if the file cannot be read.
#######################################
scan_quarantine_triggers() {
  local file="$1"

  if [[ ! -f "$file" || ! -r "$file" ]]; then
    echo "scan_quarantine_triggers: cannot read file: $file" >&2
    return 2
  fi

  # Private-key headers. The label may carry zero or more qualifier words
  # (RSA, EC, DSA, OPENSSH, ENCRYPTED, PGP, ...): PKCS#8 keys use a bare
  # "BEGIN PRIVATE KEY" and OpenPGP armor ends in "PRIVATE KEY BLOCK", neither
  # of which a fixed "(RSA|EC|OPENSSH|PGP) PRIVATE KEY-----" list would match.
  if grep -Eq -- '-----BEGIN ([A-Z0-9]+ )*PRIVATE KEY( BLOCK)?-----' "$file"; then
    QUARANTINE=1
    log_hit "private_key_block" "quarantine"
  fi

  # Identity-document labels followed by an identifier.
  # - [[:space:]] is used because "\s" inside a POSIX bracket expression is
  #   not a whitespace class.
  # - The straight/curly apostrophe is an alternation rather than a bracket
  #   expression so it also matches when grep runs in a non-UTF-8 locale.
  # NOTE(review): this matches any word after the label (e.g. "passport to");
  # confirm that this breadth is intended.
  if grep -Eqi -- "\b(passport|driver('|’)s license|dl#)\b[[:space:]:]*[a-z0-9-]+" "$file"; then
    QUARANTINE=1
    log_hit "passport_dl" "quarantine"
  fi

  return 0
}

QUARANTINE=0
# Scan the original input rather than the masked working copy: masking can
# rewrite the very text a trigger looks for (a nine-digit passport number is
# replaced by the SSN rule), which would hide the trigger.
scan_quarantine_triggers "$IN"

# Dry-run: print a key=value report of what was found, then stop before any
# archive file is written. The sanitized working copy stays on disk so it can
# be inspected via the sanitized_preview path.
if [[ "$MODE" == "dry-run" ]]; then
  # Quarantine outranks masking, which outranks a clean result.
  if [[ $QUARANTINE -eq 1 ]]; then
    STATUS="quarantined"
  elif [[ $HIT_COUNT -gt 0 ]]; then
    STATUS="masked"
  else
    STATUS="clean"
  fi

  printf '%s\n' \
    "DRY RUN" \
    "policy=$POLICY" \
    "allowlist=$ALLOWLIST" \
    "input=$IN" \
    "sanitized_preview=$TMP" \
    "log=$LOG" \
    "quarantine=$QUARANTINE" \
    "status=$STATUS" \
    "hits=$HIT_COUNT" \
    "rules_hit=${HIT_RULES:-none}"
  exit 0
fi

# apply mode
#
# Deliver the working copy: into the quarantine archive when a quarantine
# trigger fired, otherwise to the requested output path (or a timestamped file
# in the sanitized archive when no output path was given).
# `--` keeps cp/basename from treating a path that starts with "-" as options.
if [[ $QUARANTINE -eq 1 ]]; then
  DEST="$WORKDIR/knowledge/archive/quarantine/$(basename -- "$IN" .txt)-$TS.txt"
  cp -- "$TMP" "$DEST"
  STATUS="quarantined"
  echo "quarantined=$DEST"
else
  if [[ -z "$OUT" ]]; then
    OUT="$WORKDIR/knowledge/archive/sanitized/$(basename -- "$IN" .txt)-$TS.txt"
  fi
  cp -- "$TMP" "$OUT"
  STATUS="clean"
  [[ $HIT_COUNT -gt 0 ]] && STATUS="masked"
  echo "sanitized=$OUT"
fi

# The working copy has been delivered; remove it so content that warranted
# quarantine (or a redundant sanitized duplicate) does not linger in the
# scratch directory.
rm -f -- "$TMP"

echo "status=$STATUS"
echo "hits=$HIT_COUNT"
echo "rules_hit=${HIT_RULES:-none}"
echo "log=$LOG"
