#!/usr/bin/env python3
import argparse
import hashlib
import json
import os
import re
import urllib.request
from datetime import datetime, timezone

# Notion API version sent with every request (Notion-Version header).
NOTION_VERSION = "2025-09-03"
# Data source ID of the Plaud recordings table; overridable via environment.
PLAUD_DS = os.environ.get("PLAUD_DS_ID", "20af8b3e-f83d-80de-90e2-000b43ad63f0")
# Database ID (page parent) and data source ID (query endpoint) of the
# Memory Candidates store; both overridable via environment.
MC_DB = os.environ.get("MEMORY_CANDIDATES_DB_ID", "d40769f3-b824-432b-a8aa-0ee8a5d00dde")
MC_DS = os.environ.get("MEMORY_CANDIDATES_DS_ID", "c9d26639-dde4-41cd-8bc8-1a5623f6c4cf")


def notion_request(method: str, path: str, payload=None):
    """Issue one Notion API request and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        path: Endpoint path appended to https://api.notion.com/v1
            (leading slash included by the caller).
        payload: Optional JSON-serializable request body; omitted when None.

    Raises:
        OSError: if the API key file cannot be read.
        urllib.error.HTTPError / urllib.error.URLError: on API or transport
            failure.
    """
    key_path = os.path.expanduser("~/.config/notion/api_key")
    # Fix: read the key via a context manager so the file handle is closed
    # deterministically instead of relying on garbage collection.
    with open(key_path, "r", encoding="utf-8") as fh:
        key = fh.read().strip()
    data = json.dumps(payload).encode() if payload is not None else None
    req = urllib.request.Request(
        "https://api.notion.com/v1" + path,
        data=data,
        method=method,
        headers={
            "Authorization": f"Bearer {key}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=45) as resp:
        return json.loads(resp.read().decode())


def existing_indexes():
    """Scan the Memory Candidates data source and return two sets:
    the dedupe keys already stored and the Plaud record IDs already
    processed. Used to make repeated runs idempotent.
    """
    def _rich_text(props, name):
        # Concatenate the plain-text spans of a rich_text property.
        spans = props.get(name, {}).get("rich_text", [])
        return "".join(span.get("plain_text", "") for span in spans).strip()

    keys, record_ids = set(), set()
    cursor = None
    while True:
        body = {"page_size": 100}
        if cursor:
            body["start_cursor"] = cursor
        page = notion_request("POST", f"/data_sources/{MC_DS}/query", body)
        for row in page.get("results", []):
            props = row.get("properties", {})
            dedupe = _rich_text(props, "Dedupe Key")
            if dedupe:
                keys.add(dedupe)
            record_id = _rich_text(props, "Plaud Record ID")
            if record_id:
                record_ids.add(record_id)
        if not page.get("has_more"):
            break
        cursor = page.get("next_cursor")
    return keys, record_ids


# Split on whitespace that follows sentence-ending punctuation; the
# lookbehind keeps the ./!/? attached to the preceding sentence.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")


def extract_claims(text: str, max_claims: int):
    """Heuristically pull short candidate claims out of free-form text.

    The text is split on newlines/semicolons and then into sentences.
    Fragments under 24 characters are dropped, fragments over 220 characters
    are truncated with a trailing ellipsis, and case-insensitive duplicates
    are removed. At most *max_claims* claims are returned, in order of
    first appearance. Empty or None input yields an empty list.
    """
    text = (text or "").strip()
    if not text:
        return []
    # Reattach bullet lines to the preceding text before splitting.
    text = text.replace("\n•", "•").replace("\n-", "-")

    sentences = []
    for segment in re.split(r"[\n;]+", text):
        segment = segment.strip(" -•\t")
        if segment:
            # Break after sentence-ending punctuation followed by whitespace.
            sentences.extend(re.split(r"(?<=[.!?])\s+", segment))

    cleaned = []
    for sentence in sentences:
        sentence = " ".join(sentence.split()).strip(" -•")
        if len(sentence) < 24:
            continue
        cleaned.append(sentence if len(sentence) <= 220 else sentence[:217] + "...")

    claims, seen = [], set()
    for sentence in cleaned:
        folded = sentence.lower()
        if folded not in seen:
            seen.add(folded)
            claims.append(sentence)
    return claims[:max_claims]


def classify(claim: str):
    """Map a claim to a (type, confidence) pair via keyword heuristics.

    Rules are checked in priority order — Open Loop, Decision, Idea,
    Perception — against the lowercased claim; the first match wins.
    Anything unmatched falls back to ("Fact", "Low").
    """
    lowered = claim.lower()
    rules = (
        (("follow up", "follow-up", "need to", "todo", "to-do", "?"), ("Open Loop", "Medium")),
        (("decide", "decision", "will ", "must ", "should "), ("Decision", "Medium")),
        (("idea", "could ", "might ", "consider", "option"), ("Idea", "Low")),
        (("i think", "i feel", "believe", "seems", "appears"), ("Perception", "Low")),
    )
    for markers, label in rules:
        if any(marker in lowered for marker in markers):
            return label
    return "Fact", "Low"


def main():
    """Extract candidate claims from Plaud records and create one Memory
    Candidates page in Notion per new claim.

    Paginates through the Plaud data source, skips records whose IDs are
    already indexed, extracts claims from the AI summary (falling back to
    the first 1800 chars of the transcript), classifies each claim, and
    writes deduplicated pages. Prints a JSON run summary to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--max-records", type=int, default=20)
    ap.add_argument("--max-claims-per-record", type=int, default=4)
    args = ap.parse_args()

    # Preload existing dedupe keys / record IDs so re-runs are idempotent.
    dedupe_keys, existing_record_ids = existing_indexes()

    created = 0
    scanned = 0
    records_selected = 0
    cursor = None
    while records_selected < args.max_records:
        body = {"page_size": 100}
        if cursor:
            body["start_cursor"] = cursor
        plaud = notion_request("POST", f"/data_sources/{PLAUD_DS}/query", body)
        rows = plaud.get("results", [])
        if not rows:
            break

        for row in rows:
            rid = row.get("id")
            scanned += 1
            if rid in existing_record_ids:
                continue
            records_selected += 1
            # NOTE(review): the record that pushes the count past the cap is
            # counted as selected but never processed — harmless off-by-one,
            # since it only terminates the loops early.
            if records_selected > args.max_records:
                break

            p = row.get("properties", {})
            # Flatten Notion rich-text/title arrays into plain strings.
            title = "".join(x.get("plain_text", "") for x in p.get("Title", {}).get("title", [])).strip() or "(untitled)"
            summary = "".join(x.get("plain_text", "") for x in p.get("AI Summary", {}).get("rich_text", [])).strip()
            transcript = "".join(x.get("plain_text", "") for x in p.get("Transcription", {}).get("rich_text", [])).strip()
            source = (p.get("Source URL", {}) or {}).get("url")

            # Prefer the curated summary; fall back to the raw transcript
            # (truncated) with one fewer claim allowed.
            claims = extract_claims(summary, args.max_claims_per_record)
            if not claims:
                claims = extract_claims(transcript[:1800], max(2, args.max_claims_per_record - 1))

            for claim in claims:
                # Dedupe key is stable per (record, normalized claim) pair.
                dedupe = "mc:" + hashlib.sha1((rid + "|" + claim.lower()).encode()).hexdigest()[:16]
                if dedupe in dedupe_keys:
                    continue
                ctype, conf = classify(claim)
                notion_request(
                    "POST",
                    "/pages",
                    {
                        "parent": {"database_id": MC_DB},
                        "properties": {
                            "Name": {"title": [{"type": "text", "text": {"content": claim}}]},
                            "Type": {"select": {"name": ctype}},
                            "Status": {"select": {"name": "Candidate"}},
                            "Confidence": {"select": {"name": conf}},
                            "Plaud Record ID": {"rich_text": [{"type": "text", "text": {"content": rid}}]},
                            # Assumes Plaud row IDs are valid relation targets
                            # for this property — TODO confirm in the schema.
                            "Plaud Source": {"relation": [{"id": rid}]},
                            "Source URL": {"url": source},
                            "Source Excerpt": {"rich_text": [{"type": "text", "text": {"content": claim}}]},
                            "Dedupe Key": {"rich_text": [{"type": "text", "text": {"content": dedupe}}]},
                            "Notes": {
                                "rich_text": [{"type": "text", "text": {"content": f"Auto-extracted from Plaud ({title[:80]})."}}]
                            },
                        },
                    },
                )
                # Track locally so later claims in this run dedupe correctly.
                dedupe_keys.add(dedupe)
                created += 1

            existing_record_ids.add(rid)

        if not plaud.get("has_more"):
            break
        cursor = plaud.get("next_cursor")

    # Machine-readable run summary for logs / calling automation.
    print(
        json.dumps(
            {
                "created_candidates": created,
                "records_scanned": scanned,
                "max_records": args.max_records,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            },
            indent=2,
        )
    )


# Script entry point: run the sync only when executed directly.
if __name__ == "__main__":
    main()
