#!/usr/bin/env bash
#
# Scan sanitized archive text files for preference / decision / open-loop /
# project-update keyword signals and stage them as structured candidates.
#
# Usage: <script> [MODE] [LIMIT]
#   MODE   dry-run (default) | apply
#   LIMIT  maximum number of input files to scan (default: 50)
set -euo pipefail

# Repository root is two directories above the directory holding this script.
ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
IN_DIR="$ROOT/knowledge/archive/sanitized"
OUT_ENT="$ROOT/knowledge/entities/plaud-structured.md"
OUT_PROJ="$ROOT/knowledge/projects/plaud-structured.md"
OUT_REVIEW="$ROOT/knowledge/archive/review/plaud-promotions.md"
MODE="${1:-dry-run}"   # dry-run | apply
LIMIT="${2:-50}"

# Succeeds only when $1 consists solely of decimal digits.
is_nonneg_int() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

if [[ "$MODE" != "dry-run" && "$MODE" != "apply" ]]; then
  echo "Invalid mode: $MODE (expected dry-run|apply)" >&2
  exit 1
fi

# LIMIT is handed to `head -n` inside a process substitution, where a bad
# value would only produce a stderr message and an empty run; fail early.
if ! is_nonneg_int "$LIMIT"; then
  echo "Invalid limit: $LIMIT (expected a non-negative integer)" >&2
  exit 1
fi

# Per-run workspace: a timestamped log file plus a staging directory under
# $ROOT/tmp, with one empty staging file per output document.
RUN_TS="$(date +%Y%m%d-%H%M%S)"
RUN_LOG="$ROOT/tmp/plaud-structured-$RUN_TS.log"
TMP="$ROOT/tmp/plaud-structured-$RUN_TS"
mkdir -p "$ROOT/tmp" "$ROOT/knowledge/archive/review" "$TMP"

for staged_name in entities projects review; do
  : > "$TMP/$staged_name.md"
done

# Run counters: files processed, then high/medium/low tallies per category.
processed=0
pref_h=0 pref_m=0 pref_l=0
dec_h=0 dec_m=0 dec_l=0
open_h=0 open_m=0 open_l=0
proj_h=0 proj_m=0 proj_l=0

#######################################
# Append one candidate record, formatted as a YAML-style list item, to a file.
# Arguments:
#   $1 - file to append to
#   $2 - type
#   $3 - confidence
#   $4 - title          (written double-quoted)
#   $5 - note           (written double-quoted)
#   $6 - source
#   $7 - snippet        (written double-quoted)
#   $8 - reject_reason  (optional; line is omitted when empty)
# Outputs: appends the record to $1
#######################################
add_item() {
  local file="$1" type="$2" conf="$3" title="$4" note="$5" source="$6" snippet="$7" reject_reason="${8:-}"
  local field value

  # These fields are emitted between double quotes, so escape backslashes
  # first and then embedded double quotes; otherwise a quote inside a title
  # or snippet terminates the quoted value early.
  for field in title note snippet reject_reason; do
    value="${!field}"
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    printf -v "$field" '%s' "$value"
  done

  {
    printf -- '- type: %s\n' "$type"
    printf '  confidence: %s\n' "$conf"
    printf '  title: "%s"\n' "$title"
    printf '  note: "%s"\n' "$note"
    printf '  source: %s\n' "$source"
    printf '  snippet: "%s"\n' "$snippet"
    if [[ -n "$reject_reason" ]]; then
      printf '  reject_reason: "%s"\n' "$reject_reason"
    fi
  } >> "$file"
}

#######################################
# Report whether text carries transcript/fiction noise markers
# (speaker or chapter numbering, specific character names, "fiction",
# "narrative dictation"), matched case-insensitively.
# Arguments: $1 - text to inspect
# Returns:   0 when a marker is present, 1 otherwise
#######################################
is_noisy_text() {
  local t="$1"
  # Feed grep through a here-string instead of `echo | grep -q`: with
  # `set -o pipefail`, grep -q exiting on an early match can kill echo with
  # SIGPIPE on large inputs, which made the pipeline report "no match".
  if grep -Eiq 'speaker [0-9]+|chapter [0-9]+|\b(Albus|Aurelia|Rhea|Esca|Matriarch)\b|\bfiction|narrative dictation\b' <<<"$t"; then
    return 0
  fi
  return 1
}

#######################################
# Decide whether a snippet is usable: it must be at least 35 characters
# long and contain no more than 40 punctuation characters.
# Arguments: $1 - snippet text
# Returns:   0 when acceptable, 1 otherwise
#######################################
quality_ok() {
  local candidate="$1"
  local punct_count

  # Reject anything too short to be a meaningful excerpt.
  if (( ${#candidate} < 35 )); then
    return 1
  fi

  # Count punctuation by deleting every other byte and measuring what is left.
  punct_count="$(printf '%s\n' "$candidate" | tr -cd '[:punct:]' | wc -c | tr -d ' ')"
  if (( punct_count > 40 )); then
    return 1
  fi

  return 0
}

#######################################
# Case-insensitive extended-regex test on a string.
# Arguments: $1 - ERE pattern, $2 - text to search
# Returns:   grep's status (0 on match)
#######################################
text_matches() {
  # A here-string is used instead of `echo | grep -q`: under
  # `set -o pipefail`, grep -q exiting on an early match can kill echo with
  # SIGPIPE on large text, turning a real match into a failed pipeline.
  grep -Eiq -- "$1" <<<"$2"
}

#######################################
# Print the first keyword occurrence with up to 40 characters of leading and
# 80 characters of trailing context; prints an empty line when none is found.
# Arguments: $1 - keyword alternation (ERE, without outer parentheses)
#            $2 - text to search
#######################################
first_snippet() {
  local keywords="$1" haystack="$2" hits
  hits="$(grep -Eio -- ".{0,40}(${keywords}).{0,80}" <<<"$haystack" || true)"
  printf '%s\n' "${hits%%$'\n'*}"
}

# One keyword alternation per category, shared by the gate regex and the
# snippet regex so every keyword that triggers a category can also yield a
# snippet (previously "will do" and "follow-up" gated but never extracted).
pref_terms='prefer|preference|likes?|doesn.t like|avoid|wants?'
dec_terms='decide|decision|agreed|plan is|we will|will do|next steps?'
open_terms='todo|to-do|action item|follow up|follow-up|pending|needs to|remind'
proj_terms='project|milestone|status|blocked|on track|schedule|deadline'

# Scan up to $LIMIT *.txt files under $IN_DIR, taken in reverse-sorted path
# order, and stage one record per category whose keywords appear in the file.
while IFS= read -r f; do
  processed=$((processed + 1))

  title="$(awk -F': ' '/^title:/{sub(/^title: /,""); print; exit}' "$f")"
  title="${title:-Untitled}"
  # Quote $ROOT inside the expansion so it is stripped literally, not as a glob.
  source_file="${f#"$ROOT"/}"

  # Everything after the [AI_SUMMARY] marker, plus the first 120 lines of the
  # [TRANSCRIPTION] section, flattened onto a single line.
  ai_summary="$(awk '/^\[AI_SUMMARY\]/{flag=1;next} flag{print}' "$f" | tr '\n' ' ' | sed 's/  */ /g')"
  transcript="$(awk '/^\[TRANSCRIPTION\]/{flag=1;next}/^\[AI_SUMMARY\]/{flag=0}flag{if(++n<=120) print}' "$f" | tr '\n' ' ' | sed 's/  */ /g')"
  text="$ai_summary $transcript"

  # preference
  # NOTE(review): items rejected as noisy are not added to any h/m/l tally.
  if text_matches "\\b(${pref_terms})\\b" "$text"; then
    conf="low"
    note="Potential preference signal."
    reject_reason=""
    if is_noisy_text "$text"; then
      reject_reason="noise_or_fiction_markers"
    elif text_matches '\b(I prefer|we prefer|I like|I don.t like|we avoid)\b' "$text"; then
      conf="high"; note="Direct first-person preference phrasing detected."
      pref_h=$((pref_h+1))
    elif text_matches '\bprefer|preference|avoid\b' "$text"; then
      conf="medium"; note="Explicit preference terminology detected."
      pref_m=$((pref_m+1))
    else
      pref_l=$((pref_l+1))
    fi
    snippet="$(first_snippet "$pref_terms" "$text")"
    if ! quality_ok "$snippet"; then reject_reason="low_snippet_quality"; fi
    add_item "$TMP/entities.md" "preference" "$conf" "$title" "$note" "$source_file" "$snippet" "$reject_reason"
    # Only clean, high-confidence items are queued for manual review.
    if [[ "$conf" == "high" && -z "$reject_reason" ]]; then
      add_item "$TMP/review.md" "preference" "$conf" "$title" "$note" "$source_file" "$snippet"
    fi
  fi

  # decision
  if text_matches "\\b(${dec_terms})\\b" "$text"; then
    conf="low"
    note="Potential decision signal."
    reject_reason=""
    if is_noisy_text "$text"; then
      reject_reason="noise_or_fiction_markers"
    elif text_matches '\b(we decided|I decided|decision is|agreed to|plan is to|we will)\b' "$text"; then
      conf="high"; note="Direct first-person decision/action commitment phrasing detected."
      dec_h=$((dec_h+1))
    elif text_matches '\bdecision|agreed|next steps\b' "$text"; then
      conf="medium"; note="Explicit decision terminology detected."
      dec_m=$((dec_m+1))
    else
      dec_l=$((dec_l+1))
    fi
    snippet="$(first_snippet "$dec_terms" "$text")"
    if ! quality_ok "$snippet"; then reject_reason="low_snippet_quality"; fi
    add_item "$TMP/projects.md" "decision" "$conf" "$title" "$note" "$source_file" "$snippet" "$reject_reason"
    if [[ "$conf" == "high" && -z "$reject_reason" ]]; then
      add_item "$TMP/review.md" "decision" "$conf" "$title" "$note" "$source_file" "$snippet"
    fi
  fi

  # open loop
  if text_matches "\\b(${open_terms})\\b" "$text"; then
    conf="low"
    note="Potential open loop signal."
    if text_matches '\baction item|follow up|needs to|pending\b' "$text"; then
      conf="medium"; note="Action-oriented open loop phrasing detected."
      open_m=$((open_m+1))
    else
      open_l=$((open_l+1))
    fi
    snippet="$(first_snippet "$open_terms" "$text")"
    add_item "$TMP/projects.md" "open_loop" "$conf" "$title" "$note" "$source_file" "$snippet"
  fi

  # project update
  if text_matches "\\b(${proj_terms})\\b" "$text"; then
    conf="low"
    note="Potential project status/update signal."
    if text_matches '\bstatus|blocked|on track|deadline|milestone\b' "$text"; then
      conf="medium"; note="Explicit project status language detected."
      proj_m=$((proj_m+1))
    else
      proj_l=$((proj_l+1))
    fi
    snippet="$(first_snippet "$proj_terms" "$text")"
    add_item "$TMP/projects.md" "project_update" "$conf" "$title" "$note" "$source_file" "$snippet"
  fi

done < <(find "$IN_DIR" -type f -name '*.txt' | sort -r | head -n "$LIMIT")

# Count staged records (lines starting with "- type:") in a staging file.
# grep -c exits non-zero when the count is 0, so its status is discarded.
count_staged() {
  grep -c '^- type:' "$1" || true
}

ent_n="$(count_staged "$TMP/entities.md")"
proj_n="$(count_staged "$TMP/projects.md")"
rev_n="$(count_staged "$TMP/review.md")"

# Append a timestamped summary of this run to the run log.
printf '%s\n' \
  "[$(date -Iseconds)] mode=$MODE limit=$LIMIT processed=$processed entities=$ent_n projects=$proj_n review_high=$rev_n" \
  "preference high=$pref_h medium=$pref_m low=$pref_l" \
  "decision high=$dec_h medium=$dec_m low=$dec_l" \
  "open_loop high=$open_h medium=$open_m low=$open_l" \
  "project_update high=$proj_h medium=$proj_m low=$proj_l" \
  >> "$RUN_LOG"

# Dry-run mode: print the tallies and the staged preview paths, then stop
# before any output document is created or appended to.
if [[ "$MODE" == "dry-run" ]]; then
  printf '%s\n' \
    "DRY RUN" \
    "processed=$processed" \
    "entities_candidates=$ent_n" \
    "projects_candidates=$proj_n" \
    "review_high_candidates=$rev_n" \
    "preference_hml=${pref_h}/${pref_m}/${pref_l}" \
    "decision_hml=${dec_h}/${dec_m}/${dec_l}" \
    "open_loop_hml=${open_h}/${open_m}/${open_l}" \
    "project_update_hml=${proj_h}/${proj_m}/${proj_l}" \
    "preview_entities=$TMP/entities.md" \
    "preview_projects=$TMP/projects.md" \
    "preview_review=$TMP/review.md" \
    "run_log=$RUN_LOG"
  exit 0
fi

#######################################
# Create a markdown document with a heading and intro lines unless the file
# already exists. The parent directory is created first so a fresh checkout
# does not fail on the initial write.
# Arguments: $1 - file path
#            $2 - heading text (written as "# <heading>")
#            $3.. - intro lines written after a blank line
#######################################
seed_doc() {
  local path="$1" heading="$2"
  shift 2
  if [[ -f "$path" ]]; then
    return 0
  fi
  mkdir -p -- "$(dirname -- "$path")"
  {
    printf '# %s\n\n' "$heading"
    printf '%s\n' "$@"
  } > "$path"
}

seed_doc "$OUT_ENT" "Plaud Structured Entities" \
  "Structured candidates with confidence + source snippet." \
  "High-confidence entries are copied to review queue before any MEMORY.md promotion."

seed_doc "$OUT_PROJ" "Plaud Structured Projects" \
  "Structured candidates with confidence + source snippet." \
  "High-confidence entries are copied to review queue before any MEMORY.md promotion."

seed_doc "$OUT_REVIEW" "Plaud Promotion Review Queue" \
  "High-confidence candidates for manual approval before MEMORY.md updates."

#######################################
# Append a staged batch to an output document under a "## Batch <RUN_TS>"
# heading; does nothing when the batch holds no records.
# Globals:   RUN_TS (read)
# Arguments: $1 - number of staged records
#            $2 - staging file
#            $3 - destination document
#######################################
append_batch() {
  local count="$1" staged="$2" dest="$3"
  if [[ "$count" -gt 0 ]]; then
    # printf is required here: Bash's echo does not expand "\n" without -e,
    # so the heading used to be written as a literal "\n## Batch ..." line.
    {
      printf '\n## Batch %s\n' "$RUN_TS"
      cat -- "$staged"
    } >> "$dest"
  fi
}

append_batch "$ent_n" "$TMP/entities.md" "$OUT_ENT"
append_batch "$proj_n" "$TMP/projects.md" "$OUT_PROJ"
append_batch "$rev_n" "$TMP/review.md" "$OUT_REVIEW"

# Apply mode finished: report what was processed and where the run log is.
printf '%s\n' \
  "applied=1" \
  "processed=$processed" \
  "entities_candidates=$ent_n" \
  "projects_candidates=$proj_n" \
  "review_high_candidates=$rev_n" \
  "run_log=$RUN_LOG"