#!/usr/bin/env bash
#
# Pulls records from a Notion data source, writes each new record to a text
# file, and runs scripts/memory/privacy-filter.sh on that file.
#
# Usage: <this script> [MODE] [LIMIT]
#   MODE   dry-run (default) | apply
#   LIMIT  max new records to process this run (default: 10)
#
# Environment:
#   NOTION_KEY_FILE  file holding the Notion API key
#                    (default: $HOME/.config/notion/api_key)
#   PLAUD_DS_ID      id of the Notion data source to query
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
KEY_FILE="${NOTION_KEY_FILE:-$HOME/.config/notion/api_key}"
NOTION_VERSION="2025-09-03"
DS_ID="${PLAUD_DS_ID:-20af8b3e-f83d-80de-90e2-000b43ad63f0}"
MODE="${1:-dry-run}"          # dry-run | apply
LIMIT="${2:-10}"               # max new records to process this run
STATE_DIR="$ROOT/knowledge/archive/state"
STATE_FILE="$STATE_DIR/plaud-ingest-state.tsv"

if [[ ! -f "$KEY_FILE" ]]; then
  echo "Notion API key file not found: $KEY_FILE" >&2
  exit 1
fi

if [[ "$MODE" != "dry-run" && "$MODE" != "apply" ]]; then
  echo "Invalid mode: $MODE (expected dry-run|apply)" >&2
  exit 1
fi

# LIMIT is used in integer comparisons later on; reject anything that is not
# a plain decimal number up front instead of failing mid-run.
if [[ ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "Invalid limit: $LIMIT (expected a non-negative integer)" >&2
  exit 1
fi
# Force base 10 so a zero-padded value such as "08" is not parsed as octal.
LIMIT=$((10#$LIMIT))

NOTION_KEY="$(cat "$KEY_FILE")"
RAW_DIR="$ROOT/knowledge/archive/raw"
mkdir -p "$RAW_DIR" "$ROOT/tmp" "$STATE_DIR"

# The state file is a TSV with a header row; create it on first use.
if [[ ! -f "$STATE_FILE" ]]; then
  printf 'id\tcreated_time\tlast_mode\tprocessed_at\n' > "$STATE_FILE"
fi

# Bootstrap state from existing raw files so reruns are incremental.
# Filename pattern includes the Notion page UUID before the title slug.
#
# grep's basic regex syntax has no "\t" escape, so the pattern embeds a real
# tab character; otherwise the lookup never matches and every run appends a
# duplicate bootstrap row per raw file.
tab=$'\t'
if [[ -d "$RAW_DIR" ]]; then
  while IFS= read -r f; do
    id="$(basename -- "$f" | grep -Eo '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' | head -n1 || true)"
    [[ -z "$id" ]] && continue
    # $id contains only hex digits and dashes here, so it is regex-safe.
    if ! grep -q -- "^${id}${tab}" "$STATE_FILE"; then
      printf '%s\tunknown\tbootstrap\t%s\n' "$id" "$(date -Iseconds)" >> "$STATE_FILE"
    fi
  done < <(find "$RAW_DIR" -type f -name '*.txt' 2>/dev/null)
fi

processed=0
skipped_existing=0
cursor=""
RUN_TS="$(date +%Y%m%d-%H%M%S)"
RUN_LOG="$ROOT/tmp/plaud-ingest-$RUN_TS.log"

echo "[$(date -Iseconds)] start mode=$MODE limit=$LIMIT ds_id=$DS_ID state_file=$STATE_FILE" >> "$RUN_LOG"

#######################################
# POST one query (page_size 25) to the Notion data source endpoint.
# Globals:   cursor (read; when non-empty it is sent as start_cursor),
#            DS_ID, NOTION_KEY, NOTION_VERSION (read)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
#######################################
query_notion() {
  # Assemble the JSON body piece by piece; the cursor clause is optional.
  local payload='{"page_size": 25'
  [[ -z "$cursor" ]] || payload+=", \"start_cursor\": \"$cursor\""
  payload+='}'

  local -a request=(
    -sS -X POST "https://api.notion.com/v1/data_sources/$DS_ID/query"
    -H "Authorization: Bearer $NOTION_KEY"
    -H "Notion-Version: $NOTION_VERSION"
    -H "Content-Type: application/json"
    --data "$payload"
  )
  curl "${request[@]}"
}

#######################################
# Check whether a record id is already present in a TSV state file.
# Arguments: $1 - record id
#            $2 - path to the state file (id in the first tab-separated column)
# Returns:   0 if some row's first column equals the id exactly, 1 otherwise
#######################################
state_has_id() {
  local id="$1" state_file="$2"
  # Exact first-column match on a real tab delimiter. (grep "^$id\t" does not
  # work: "\t" is not a tab in grep's basic regex syntax.) The "" suffixes
  # force a string comparison in awk.
  awk -F'\t' -v id="$id" \
    '($1 "") == (id "") { found = 1; exit } END { exit !found }' \
    "$state_file"
}

#######################################
# Build the path of the text file a record is written to.
# Globals:   RAW_DIR, ROOT (read)
# Arguments: $1 - mode (apply | anything else is treated as dry-run)
#            $2 - created_time of the record
#            $3 - record id
#            $4 - sanitized title
# Outputs:   the path on stdout
#######################################
record_path() {
  local mode="$1" created="$2" id="$3" safe_title="$4"
  # Keep only the part before the first "T" (the date of an ISO timestamp).
  local day="${created%%T*}"
  if [[ "$mode" == "apply" ]]; then
    printf '%s\n' "$RAW_DIR/${day}-${id}-${safe_title}.txt"
  else
    printf '%s\n' "$ROOT/tmp/dryrun-${day}-${id}-${safe_title}.txt"
  fi
}

# Page through the data source until LIMIT new records have been processed
# or the API reports no further pages.
while [[ "$processed" -lt "$LIMIT" ]]; do
  json="$(query_notion)"

  # Notion reports failures as a JSON body with "object": "error". Surface
  # that explicitly instead of failing later on a missing .results array.
  if [[ "$(jq -r '.object // ""' <<<"$json")" == "error" ]]; then
    echo "Notion API error: $(jq -r '"\(.code // "unknown"): \(.message // "")"' <<<"$json")" >&2
    exit 1
  fi

  has_more="$(jq -r '.has_more // false' <<<"$json")"
  cursor="$(jq -r '.next_cursor // ""' <<<"$json")"

  rows="$(jq -c '.results[]' <<<"$json")"
  if [[ -z "$rows" ]]; then
    break
  fi

  while IFS= read -r row; do
    [[ -z "$row" ]] && continue
    [[ "$processed" -ge "$LIMIT" ]] && break

    id="$(jq -r '.id' <<<"$row")"
    created="$(jq -r '.created_time' <<<"$row")"

    if state_has_id "$id" "$STATE_FILE"; then
      skipped_existing=$((skipped_existing + 1))
      echo "skip id=$id reason=already_processed"
      echo "[$(date -Iseconds)] id=$id skip=already_processed" >> "$RUN_LOG"
      continue
    fi

    title="$(jq -r '(.properties.Title.title // []) | map(.plain_text) | join("")' <<<"$row")"
    transcript="$(jq -r '(.properties.Transcription.rich_text // []) | map(.plain_text) | join("")' <<<"$row")"
    ai_summary="$(jq -r '(.properties["AI Summary"].rich_text // []) | map(.plain_text) | join("")' <<<"$row")"
    source_url="$(jq -r '.properties["Source URL"].url // ""' <<<"$row")"

    # Reduce the title to filename-safe characters, capped at 80 characters.
    safe_title="$(printf '%s\n' "$title" | tr '/:' '-' | tr -cd '[:alnum:] _.-' | sed 's/  */ /g' | cut -c1-80)"
    [[ -z "$safe_title" ]] && safe_title="untitled"

    # apply writes under RAW_DIR; dry-run writes a dryrun-* file under ROOT/tmp.
    infile="$(record_path "$MODE" "$created" "$id" "$safe_title")"

    cat > "$infile" <<EOF
record_id: $id
created_time: $created
title: $title
source_url: $source_url

[TRANSCRIPTION]
$transcript

[AI_SUMMARY]
$ai_summary
EOF

    # The filter prints key=value lines; take the last occurrence of each key.
    pf_out="$("$ROOT/scripts/memory/privacy-filter.sh" "$infile" "" "$MODE")"
    status="$(awk -F= '/^status=/{print $2}' <<<"$pf_out" | tail -n1)"
    hits="$(awk -F= '/^hits=/{print $2}' <<<"$pf_out" | tail -n1)"
    rules_hit="$(awk -F= '/^rules_hit=/{print $2}' <<<"$pf_out" | tail -n1)"
    pf_log="$(awk -F= '/^log=/{print $2}' <<<"$pf_out" | tail -n1)"

    processed=$((processed + 1))
    # Only apply runs are recorded, so a dry-run never marks a record as done.
    if [[ "$MODE" == "apply" ]]; then
      printf '%s\t%s\t%s\t%s\n' "$id" "$created" "$MODE" "$(date -Iseconds)" >> "$STATE_FILE"
    fi

    echo "processed=$processed mode=$MODE id=$id status=${status:-unknown} hits=${hits:-0} rules=${rules_hit:-none}"
    echo "[$(date -Iseconds)] id=$id status=${status:-unknown} hits=${hits:-0} rules=${rules_hit:-none} filter_log=${pf_log:-n/a} input=$infile" >> "$RUN_LOG"
  done <<< "$rows"

  if [[ "$has_more" != "true" ]]; then
    break
  fi

done

echo "Done. processed=$processed skipped_existing=$skipped_existing mode=$MODE limit=$LIMIT"
echo "[$(date -Iseconds)] done processed=$processed skipped_existing=$skipped_existing mode=$MODE limit=$LIMIT run_log=$RUN_LOG" >> "$RUN_LOG"
echo "state_file=$STATE_FILE"
echo "run_log=$RUN_LOG"