#!/usr/bin/env bash
#
# Distill sanitized Plaud recordings into low-confidence knowledge candidates.
#
# Usage: <this script> [MODE] [LIMIT]
#   MODE   dry-run (default) | apply
#   LIMIT  maximum number of recordings to scan (default: 50)
#
# The repository root is resolved as two directories above this script.
# Every run, dry-run included, creates "$ROOT/tmp" and the daily-notes
# directory and starts from empty preview files in a per-run directory under
# "$ROOT/tmp".
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
IN_DIR="$ROOT/knowledge/archive/sanitized"
OUT_ENTITIES="$ROOT/knowledge/entities/plaud-distilled.md"
OUT_PROJECTS="$ROOT/knowledge/projects/plaud-distilled.md"
OUT_DAILY_DIR="$ROOT/knowledge/daily"
MODE="${1:-dry-run}"   # dry-run | apply
LIMIT="${2:-50}"       # number of recordings handed to `head -n`

if [[ "$MODE" != "dry-run" && "$MODE" != "apply" ]]; then
  echo "Invalid mode: $MODE (expected dry-run|apply)" >&2
  exit 1
fi

# LIMIT is consumed by `head -n` inside a process substitution, whose failure
# is not propagated to the script; a malformed value would otherwise make the
# run scan nothing and still report success. Reject it up front instead.
if [[ ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "Invalid limit: $LIMIT (expected a non-negative integer)" >&2
  exit 1
fi

mkdir -p "$OUT_DAILY_DIR" "$ROOT/tmp"

# Per-run artefacts share one timestamp so the log and previews can be paired.
RUN_TS="$(date +%Y%m%d-%H%M%S)"
RUN_LOG="$ROOT/tmp/plaud-distill-$RUN_TS.log"
TMP="$ROOT/tmp/plaud-distill-$RUN_TS"
mkdir -p "$TMP"

# Start from empty preview files so the later counts reflect this run only.
: > "$TMP/entities.md"
: > "$TMP/projects.md"
: > "$TMP/daily.tsv"

#######################################
# Scan one sanitized recording and append any candidates it yields to the
# preview files. Detection is keyword-based, so every hit is recorded as a
# low-confidence candidate.
# Globals:
#   ROOT (read) - repository root; stripped from the recorded source path
#   TMP  (read) - directory holding entities.md, projects.md and daily.tsv
# Arguments:
#   $1 - path to a sanitized recording (.txt)
# Outputs:
#   Appends to "$TMP/entities.md", "$TMP/projects.md" and "$TMP/daily.tsv".
#######################################
distill_recording() {
  local rec="$1"
  local date_re='^[0-9]{4}-[0-9]{2}-[0-9]{2}'
  local title created cdate source_file ai_summary transcript text tsv_title
  local pref="" decision="" open_loop=""

  # Header fields: first "title:" / "created_time:" line, prefix removed.
  title="$(awk -F': ' '/^title:/{sub(/^title: /,""); print; exit}' "$rec")"
  created="$(awk -F': ' '/^created_time:/{sub(/^created_time: /,""); print; exit}' "$rec")"

  # Keep only the leading YYYY-MM-DD of created_time; when it is missing or
  # does not start with a date, file the item under today's date.
  if [[ "$created" =~ $date_re ]]; then
    cdate="${BASH_REMATCH[0]}"
  else
    cdate="$(date +%F)"
  fi

  # Quote ROOT inside the expansion so it is stripped as a literal prefix
  # rather than interpreted as a glob pattern.
  source_file="${rec#"$ROOT"/}"

  # Everything after [AI_SUMMARY], flattened to one line.
  ai_summary="$(awk '/^\[AI_SUMMARY\]/{flag=1;next} flag{print}' "$rec" | tr '\n' ' ' | sed 's/  */ /g')"
  # At most the first 120 lines between [TRANSCRIPTION] and [AI_SUMMARY].
  transcript="$(awk '/^\[TRANSCRIPTION\]/{flag=1;next}/^\[AI_SUMMARY\]/{flag=0}flag{if(++n<=120) print}' "$rec" | tr '\n' ' ' | sed 's/  */ /g')"

  text="$ai_summary $transcript"

  if grep -Eiq '\b(prefer|preference|likes?|doesn.t like|avoid|wants?)\b' <<<"$text"; then
    pref="Potential preference signal from conversation context."
  fi

  if grep -Eiq '\b(decide|decision|agreed|plan is|we will|will do|next steps?)\b' <<<"$text"; then
    decision="Potential decision/action signal in summary/transcript."
  fi

  if grep -Eiq '\b(todo|to-do|action item|follow up|follow-up|pending|needs to|remind)\b' <<<"$text"; then
    open_loop="Potential open loop/action item detected."
  fi

  if [[ -n "$pref" ]]; then
    {
      echo "- confidence: low"
      echo "  type: preference"
      echo "  title: \"$title\""
      echo "  note: \"$pref\""
      echo "  source: $source_file"
    } >> "$TMP/entities.md"
  fi

  if [[ -n "$decision" ]]; then
    {
      echo "- confidence: low"
      echo "  type: decision"
      echo "  title: \"$title\""
      echo "  note: \"$decision\""
      echo "  source: $source_file"
    } >> "$TMP/projects.md"
  fi

  if [[ -n "$open_loop" ]]; then
    # daily.tsv is split on tabs when it is read back, and an empty column or
    # a tab inside the title would shift the remaining fields. Keep the title
    # column non-empty and tab-free.
    tsv_title="${title//$'\t'/ }"
    if [[ -z "$tsv_title" ]]; then
      tsv_title="(untitled)"
    fi
    printf "%s\t%s\t%s\t%s\n" "$cdate" "$tsv_title" "$open_loop" "$source_file" >> "$TMP/daily.tsv"
  fi
}

# Scan pass: visit up to LIMIT recordings, taken from the top of the
# reverse-sorted path list.
count=0
while IFS= read -r f; do
  count=$((count + 1))
  distill_recording "$f"
done < <(find "$IN_DIR" -type f -name '*.txt' | sort -r | head -n "$LIMIT")

# Tally the candidates gathered in the preview files. grep -c exits non-zero
# when it counts no matching lines, hence the "|| true" under `set -e`.
entities_n="$(grep -c '^- confidence:' "$TMP/entities.md" || true)"
projects_n="$(grep -c '^- confidence:' "$TMP/projects.md" || true)"
# Drop any space padding from the wc output.
daily_n="$(wc -l < "$TMP/daily.tsv")"
daily_n="${daily_n// /}"

# One summary line per run, written in both modes.
printf '[%s] mode=%s limit=%s processed=%s entities=%s projects=%s daily_items=%s\n' \
  "$(date -Iseconds)" "$MODE" "$LIMIT" "$count" \
  "$entities_n" "$projects_n" "$daily_n" >> "$RUN_LOG"

# Dry-run stops here: report the counts and where the previews live.
case "$MODE" in
  dry-run)
    printf '%s\n' \
      "DRY RUN" \
      "processed=$count" \
      "entities_candidates=$entities_n" \
      "projects_candidates=$projects_n" \
      "daily_candidates=$daily_n" \
      "preview_entities=$TMP/entities.md" \
      "preview_projects=$TMP/projects.md" \
      "preview_daily=$TMP/daily.tsv" \
      "run_log=$RUN_LOG"
    exit 0
    ;;
esac

#######################################
# Make sure a distilled-candidates file exists (writing its header when it
# does not) and, when this run produced candidates, append them as a batch.
# Arguments:
#   $1 - destination markdown file
#   $2 - document title used in the header of a newly created file
#   $3 - preview file holding this run's candidates
#   $4 - number of candidates in the preview file
#   $5 - batch identifier printed in the "## Batch" heading
# Outputs:
#   Creates and/or appends to the destination file.
#######################################
publish_batch() {
  local out_file="$1" heading="$2" preview="$3" n="$4" batch_id="$5"

  if [[ ! -f "$out_file" ]]; then
    printf '# %s\n\n%s\n\n' \
      "$heading" \
      "Low-confidence auto-extracted candidates. Human review required before promotion to MEMORY.md." \
      > "$out_file"
  fi

  if [[ "$n" -gt 0 ]]; then
    {
      # printf, not echo: bash's echo prints "\n" literally unless given -e,
      # which would put a stray backslash-n in front of the heading.
      printf '\n## Batch %s\n' "$batch_id"
      cat "$preview"
    } >> "$out_file"
  fi
}

publish_batch "$OUT_ENTITIES" "Plaud Distilled Entities" "$TMP/entities.md" "$entities_n" "$RUN_TS"
publish_batch "$OUT_PROJECTS" "Plaud Distilled Projects" "$TMP/projects.md" "$projects_n" "$RUN_TS"

#######################################
# Append open-loop candidates to per-day notes, creating a note from the
# standard skeleton when the day has none yet. Items are appended at the end
# of the note.
# Arguments:
#   $1 - TSV file with rows: day <TAB> title <TAB> note <TAB> source
#   $2 - directory holding the <day>.md daily notes
# Outputs:
#   Creates and/or appends to "$2/<day>.md".
#######################################
append_open_loops() {
  local tsv="$1" daily_dir="$2"
  local tab=$'\t'
  local row rest day title note src dayf

  while IFS= read -r row; do
    # Split by hand instead of `IFS=$'\t' read`: tab is IFS whitespace, so
    # read merges consecutive tabs and an empty title would shift the note
    # and source into the wrong fields.
    day="${row%%"$tab"*}"
    rest="${row#*"$tab"}"
    title="${rest%%"$tab"*}"
    rest="${rest#*"$tab"}"
    note="${rest%%"$tab"*}"
    src="${rest#*"$tab"}"

    dayf="$daily_dir/$day.md"
    if [[ ! -f "$dayf" ]]; then
      printf '%s\n' \
        "# Daily Note — $day" \
        "" \
        "## Key Events" \
        "" \
        "## Decisions" \
        "" \
        "## Promotions (to durable knowledge)" \
        "" \
        "## Open Loops" \
        > "$dayf"
    fi
    {
      echo "- [low] $title"
      echo "  - note: $note"
      echo "  - source: $src"
    } >> "$dayf"
  done < "$tsv"
}

if [[ "$daily_n" -gt 0 ]]; then
  append_open_loops "$TMP/daily.tsv" "$OUT_DAILY_DIR"
fi

# Final report for an apply run: key=value lines on stdout, one per line.
printf '%s\n' \
  "applied=1" \
  "processed=$count" \
  "entities_candidates=$entities_n" \
  "projects_candidates=$projects_n" \
  "daily_candidates=$daily_n" \
  "run_log=$RUN_LOG"