#!/usr/bin/env python3
import argparse
import email
import imaplib
import json
import os
import re
import ssl
import sys
from datetime import datetime, timezone
from email.header import decode_header, make_header
from email.message import Message
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib import request, error

# --- Notion API ---------------------------------------------------------
# Sent as the "Notion-Version" header on every Notion request.
NOTION_VERSION = "2025-09-03"
# Parent data source under which new pipeline pages are created.
PIPELINE_DATA_SOURCE_ID = "cbde3032-5e2c-4557-8a00-cd2d14d4ab83"

# --- CLI defaults (each can be overridden by the matching flag in main) --
DEFAULT_IMAP_HOST = "imap.mail.me.com"
DEFAULT_IMAP_PORT = 993
DEFAULT_FOLDER = "Search"
# File holding the IMAP password; read with load_secret().
DEFAULT_SECRET_PATH = "/Users/openclaw/.secrets/icloud_imap_app_password"
# File holding the Notion API key; read by load_notion_key().
DEFAULT_NOTION_KEY_PATH = "/Users/openclaw/.config/notion/api_key"
# NOTE(review): user-specific address and absolute paths are hard-coded here;
# consider moving them to configuration.
DEFAULT_EMAIL = "bradenmcleish@icloud.com"

# --- Notion select option names written to "Status" / "Source" ----------
STATUS_NEW = "New"
SOURCE_LINKEDIN = "LinkedIn Alert"
SOURCE_INBOUND = "Inbound Recruiter"
SOURCE_DIGEST = "Job Board Digest"

# --- rich_text chunking (see chunk_rich_text) ---------------------------
# Maximum size of one rich_text segment, measured in UTF-16 code units.
# Presumably mirrors Notion's per-text-object limit — confirm against the API docs.
RICH_TEXT_SEGMENT_LIMIT = 2000
# Maximum number of segments emitted for one property.
RICH_TEXT_SEGMENT_MAX = 100
# Appended to the text when it has to be cut to fit the limits above.
TRUNCATION_MARKER = "\n\n[TRUNCATED — original body exceeded Notion rich_text limit]"

# --- Network timeouts ----------------------------------------------------
# Passed as the socket timeout to imaplib.IMAP4_SSL.
IMAP_TIMEOUT_SECONDS = 30
# Passed as the timeout to urllib.request.urlopen for Notion calls.
NOTION_TIMEOUT_SECONDS = 15

# Default for --limit; fetch_unseen_messages only caps when limit > 0,
# so 0 means "fetch every unseen message".
DEFAULT_FETCH_LIMIT = 0


def log_stage(stage: str, **data) -> None:
    """Write one JSON progress record to stderr and flush immediately.

    The record always starts with a ``"stage"`` key, followed by the extra
    keyword arguments in the order they were given. Non-ASCII characters are
    emitted verbatim rather than escaped.
    """
    record = {"stage": stage}
    record.update(data)
    line = json.dumps(record, ensure_ascii=False)
    print(line, file=sys.stderr, flush=True)


def load_secret(path: str) -> str:
    """Read a secret from a UTF-8 text file and return it stripped.

    Raises:
        RuntimeError: ``empty_secret:<path>`` if the file contains only
            whitespace.
        OSError: if the file cannot be opened or read.
    """
    with Path(path).open(encoding="utf-8") as handle:
        secret = handle.read().strip()
    if secret:
        return secret
    raise RuntimeError(f"empty_secret:{path}")


def load_notion_key() -> str:
    """Return the Notion API key from the key file, else from the environment.

    The key file at ``DEFAULT_NOTION_KEY_PATH`` takes precedence. If it is
    missing, unreadable or empty, the ``NOTION_API_KEY`` environment variable
    is used instead.

    Raises:
        RuntimeError: ``notion_api_key_missing`` when neither source yields a
            non-empty key.
    """
    try:
        return load_secret(DEFAULT_NOTION_KEY_PATH)
    except (OSError, RuntimeError):
        # load_secret raises on a missing/unreadable file (OSError) and on an
        # empty one (RuntimeError). Previously those errors escaped and the
        # environment fallback below could never run.
        pass

    env_value = (os.getenv("NOTION_API_KEY") or "").strip()
    if env_value:
        return env_value
    raise RuntimeError("notion_api_key_missing")


def decode_mime_header(value: Optional[str]) -> str:
    """Decode an RFC 2047 encoded header value into a plain string.

    Returns ``""`` for a missing or empty value. If decoding fails for any
    reason the raw value is returned unchanged (best effort).
    """
    if value:
        try:
            decoded = make_header(decode_header(value))
            return str(decoded)
        except Exception:
            return value
    return ""


def parse_addresses(value: Optional[str]) -> Tuple[str, str]:
    """Split an address header into (first address, stripped raw header).

    Args:
        value: Raw header value such as ``'Jane <jane@example.com>, b@x.org'``,
            or ``None`` when the header is absent.

    Returns:
        A ``(address, raw)`` tuple. ``address`` is the bare e-mail address of
        the first entry (``""`` if none could be parsed); ``raw`` is the whole
        header value with surrounding whitespace removed. Both are ``""`` when
        ``value`` is missing or empty.
    """
    # Imported explicitly: ``import email`` alone does not guarantee that the
    # ``email.utils`` submodule is loaded; it only worked because another
    # import happened to pull it in.
    from email.utils import getaddresses

    if not value:
        return "", ""
    raw = value.strip()
    addresses = getaddresses([value])
    if not addresses:
        return "", raw
    _display_name, address = addresses[0]
    return (address or "").strip(), raw


def _decode_text_payload(part: Message) -> str:
    """Decode one MIME part's payload to text, falling back to UTF-8."""
    raw = part.get_payload(decode=True) or b""
    declared = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(declared, errors="replace")
    except Exception:
        # Unknown or bogus charset label: decode as UTF-8 instead.
        return raw.decode("utf-8", errors="replace")


def extract_plain_text(msg: Message) -> str:
    """Return the plain-text body of a message.

    For a multipart message, every ``text/plain`` part that is not marked as
    an attachment is decoded, stripped, and the non-empty results are joined
    with blank lines. For a single-part message the payload is decoded and
    stripped regardless of its content type. Undecodable bytes are replaced.
    """
    if not msg.is_multipart():
        return _decode_text_payload(msg).strip()

    pieces: List[str] = []
    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        disposition = (part.get("Content-Disposition") or "").lower()
        if "attachment" in disposition:
            continue
        piece = _decode_text_payload(part).strip()
        if piece:
            pieces.append(piece)
    return "\n\n".join(pieces).strip()


def message_received_iso(msg: Message) -> str:
    """Return the message's ``Date`` header as a UTC ISO-8601 timestamp.

    A date without timezone information is treated as UTC. When the header is
    absent or cannot be parsed, the current UTC time is returned instead.
    """
    header = msg.get("Date")
    if not header:
        return datetime.now(timezone.utc).isoformat()
    try:
        parsed = email.utils.parsedate_to_datetime(header)
        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
        return aware.astimezone(timezone.utc).isoformat()
    except Exception:
        # Unparseable header: fall back to "now" rather than failing ingestion.
        return datetime.now(timezone.utc).isoformat()


def classify_source(from_email: str) -> str:
    """Classify a sender address as a LinkedIn alert or an inbound recruiter.

    Returns ``SOURCE_LINKEDIN`` when the sender's domain is ``linkedin.com``
    or one of its subdomains, and ``SOURCE_INBOUND`` otherwise (including when
    the address has no ``@``).
    """
    _local, at_sign, domain = from_email.rpartition("@")
    domain = domain.strip().lower().rstrip(".") if at_sign else ""
    # Require a label boundary: a bare endswith("linkedin.com") would also
    # accept lookalike domains such as "notlinkedin.com".
    is_linkedin = domain == "linkedin.com" or domain.endswith(".linkedin.com")
    return SOURCE_LINKEDIN if is_linkedin else SOURCE_INBOUND


def notion_request(method: str, url: str, payload: Optional[dict] = None) -> dict:
    """Send one authenticated JSON request to the Notion API.

    Args:
        method: HTTP method, e.g. ``"POST"``.
        url: Full request URL.
        payload: Optional JSON-serialisable body; no body is sent when ``None``.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.

    Raises:
        RuntimeError: ``notion_http_error:<code>:<body>`` on an HTTP error
            status. Other errors from ``urlopen`` propagate unchanged.
    """
    api_key = load_notion_key()
    encoded_body = None if payload is None else json.dumps(payload).encode("utf-8")
    req = request.Request(
        url,
        data=encoded_body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Notion-Version": NOTION_VERSION,
            "Content-Type": "application/json",
        },
        method=method,
    )
    try:
        with request.urlopen(req, timeout=NOTION_TIMEOUT_SECONDS) as resp:
            text = resp.read().decode("utf-8")
            if not text:
                return {}
            return json.loads(text)
    except error.HTTPError as exc:
        # Surface the response body: it carries the API's error detail.
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"notion_http_error:{exc.code}:{detail}") from exc


def _utf16_len(s: str) -> int:
    return len(s.encode("utf-16-le")) // 2


def chunk_rich_text(text: str) -> List[dict]:
    """Split text into Notion ``rich_text`` segments within the size limits.

    Each segment holds at most ``RICH_TEXT_SEGMENT_LIMIT`` UTF-16 code units
    and at most ``RICH_TEXT_SEGMENT_MAX`` segments are returned. A character
    is never split across segments. If the text does not fit, it is cut and
    ``TRUNCATION_MARKER`` is appended to the final segment.

    Args:
        text: The text to split; may be empty.

    Returns:
        A list of ``{"text": {"content": ...}}`` dicts, empty for empty input.
    """
    if not text:
        return []

    def _units(ch: str) -> int:
        # Characters outside the BMP need a surrogate pair (2 units) in UTF-16.
        return 2 if ord(ch) > 0xFFFF else 1

    segments: List[str] = []
    current: List[str] = []
    current_units = 0
    overflow = False
    for ch in text:
        ch_units = _units(ch)
        if current and current_units + ch_units > RICH_TEXT_SEGMENT_LIMIT:
            segments.append("".join(current))
            current = []
            current_units = 0
            if len(segments) == RICH_TEXT_SEGMENT_MAX:
                # Text remains but the segment budget is exhausted. Counting
                # actual segments (rather than comparing len(text) with
                # LIMIT * MAX) keeps the cap correct for non-BMP characters,
                # which consume two units each.
                overflow = True
                break
        current.append(ch)
        current_units += ch_units
    if current and not overflow:
        segments.append("".join(current))

    if overflow:
        # Make room for the marker at the end of the last kept segment.
        marker_units = sum(_units(c) for c in TRUNCATION_MARKER)
        budget = max(RICH_TEXT_SEGMENT_LIMIT - marker_units, 0)
        tail = list(segments[-1])
        tail_units = sum(_units(c) for c in tail)
        while tail and tail_units > budget:
            tail_units -= _units(tail.pop())
        segments[-1] = "".join(tail) + TRUNCATION_MARKER

    return [{"text": {"content": segment}} for segment in segments]


def create_pipeline_record(
    subject: str,
    body_text: str,
    from_email: str,
    to_text: str,
    received_iso: str,
    parsed: Optional[dict] = None,
    resolution: Optional[dict] = None,
    qualified: Optional[dict] = None,
) -> dict:
    """Create one page in the pipeline data source and return the API response.

    Args:
        subject: Decoded subject; used as the page title ("(no subject)" if empty).
        body_text: Plain-text body, stored in "Email Text".
        from_email: Sender address, stored in "From" (null when empty).
        to_text: Raw recipient header, stored in "To".
        received_iso: ISO timestamp stored in "Date Received".
        parsed: Optional parser output; the keys read here are "fields",
            "source", "status", "classification" and "strategy".
        resolution: Optional resolution plan, stored under "resolution_plan"
            in the "Research Payload" JSON.
        qualified: Optional qualification result, stored under "qualification"
            in the "Research Payload" JSON; its "summary" supplies
            "fit_score_10" and "strategy".

    Returns:
        The response dict from ``notion_request``.
    """
    record = parsed or {}
    fields = record.get("fields", {})
    source_name = record.get("source") or classify_source(from_email)
    status_name = record.get("status") or STATUS_NEW
    classification = record.get("classification")
    is_digest = classification == "non_linkedin"

    props: Dict = {
        "Name": {"title": [{"text": {"content": subject or "(no subject)"}}]},
        "Email Text": {"rich_text": chunk_rich_text(body_text)},
        "From": {"email": from_email or None},
        "To": {"rich_text": chunk_rich_text(to_text)},
        "Date Received": {"date": {"start": received_iso}},
        "Status": {"select": {"name": status_name}},
        "Source": {"select": {"name": source_name}},
    }

    # Optional text columns: (Notion property name, parsed field key).
    optional_text_columns = (
        ("Role Title", "role_title"),
        ("Company", "company"),
        ("Location", "location"),
        ("Salary", "salary"),
        ("LinkedIn Job ID", "linkedin_job_id"),
        ("Recruiter Name", "recruiter_name"),
    )
    for column, field_key in optional_text_columns:
        text_value = fields.get(field_key)
        if text_value:
            props[column] = {"rich_text": chunk_rich_text(text_value)}

    job_url = fields.get("job_url")
    if job_url:
        props["Role URL"] = {"url": job_url}

    posted_date = fields.get("posted_date")
    iso_date = _try_parse_iso_date(posted_date) if posted_date else None
    if iso_date:
        props["Posted Date"] = {"date": {"start": iso_date}}

    if is_digest:
        # Digest mail is always filed as a new job-board digest entry and
        # receives neither a fit score nor a strategy.
        props["Source"] = {"select": {"name": SOURCE_DIGEST}}
        props["Status"] = {"select": {"name": STATUS_NEW}}
    else:
        strategy = None
        if qualified:
            summary = qualified.get("summary", {})
            fit = summary.get("fit_score_10")
            if fit is not None:
                props["Fit Score"] = {"number": fit}
            if classification != "recruiter_inmail":
                strategy = summary.get("strategy")
        if classification == "recruiter_inmail":
            # Recruiter InMail takes its strategy from the parser output.
            strategy = record.get("strategy")
        if strategy:
            props["Strategy"] = {"select": {"name": strategy}}

    research = {}
    if resolution is not None:
        research["resolution_plan"] = resolution
    if qualified is not None:
        research["qualification"] = qualified
    if research:
        serialized = json.dumps(research, ensure_ascii=False)
        props["Research Payload"] = {"rich_text": chunk_rich_text(serialized)}

    return notion_request(
        "POST",
        "https://api.notion.com/v1/pages",
        {
            "parent": {"data_source_id": PIPELINE_DATA_SOURCE_ID},
            "properties": props,
        },
    )


def _try_parse_iso_date(date_str: str) -> Optional[str]:
    import re as _re
    if _re.fullmatch(r'\d{4}-\d{2}-\d{2}', date_str):
        return date_str
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%dT%H:%M:%S%z",
                "%B %d, %Y at %I:%M:%S %p %Z", "%B %d, %Y at %I:%M:%S %p"):
        try:
            dt = datetime.strptime(date_str.strip(), fmt)
            return dt.strftime("%Y-%m-%d")
        except ValueError:
            continue
    m = _re.search(r'(\w+ \d{1,2},?\s*\d{4})', date_str)
    if m:
        for fmt in ("%B %d, %Y", "%B %d %Y", "%b %d, %Y", "%b %d %Y"):
            try:
                dt = datetime.strptime(m.group(1).replace(",", ", "), fmt)
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                continue
    return None


def fetch_unseen_messages(imap_host: str, imap_port: int, username: str, password: str, folder: str, limit: int) -> List[Tuple[bytes, bytes]]:
    """Fetch the raw bytes of every unseen message in a folder.

    Messages are identified by IMAP UID rather than by message sequence
    number. Sequence numbers are renumbered whenever a message is expunged,
    so an identifier collected here and reused later by ``mark_seen`` on a
    new connection could otherwise point at a different message.

    Bodies are fetched with ``BODY.PEEK[]`` so that fetching does not itself
    set the ``\\Seen`` flag.

    Args:
        imap_host: IMAP server host name.
        imap_port: IMAP-over-TLS port.
        username: Login name.
        password: Login password.
        folder: Mailbox to select.
        limit: Maximum number of messages to fetch; values <= 0 mean no limit.

    Returns:
        A list of ``(uid, raw_message_bytes)`` tuples. The UIDs are what
        ``mark_seen`` expects. Messages for which the server returned no
        literal payload are omitted.

    Raises:
        RuntimeError: when SELECT, SEARCH or FETCH does not return ``OK``.
    """
    ctx = ssl.create_default_context()
    log_stage("imap_connect_start", host=imap_host, port=imap_port)
    client = imaplib.IMAP4_SSL(imap_host, imap_port, ssl_context=ctx, timeout=IMAP_TIMEOUT_SECONDS)
    try:
        log_stage("imap_connect_ok")
        log_stage("imap_login_start", user=username)
        client.login(username, password)
        log_stage("imap_login_ok", user=username)
        log_stage("imap_select_start", folder=folder)
        status, _ = client.select(f'"{folder}"')
        if status != "OK":
            raise RuntimeError(f"imap_select_failed:{folder}:{status}")
        log_stage("imap_select_ok", folder=folder)
        log_stage("imap_search_start", folder=folder)
        status, data = client.uid("SEARCH", None, "UNSEEN")
        if status != "OK":
            raise RuntimeError(f"imap_search_failed:{status}")
        ids = data[0].split() if data and data[0] else []
        if limit > 0:
            ids = ids[:limit]
        log_stage("imap_search_ok", unseen_count=len(ids), limit=limit)
        messages: List[Tuple[bytes, bytes]] = []
        for msg_id in ids:
            imap_id = msg_id.decode()
            log_stage("imap_fetch_start", imap_id=imap_id)
            status, fetched = client.uid("FETCH", msg_id, "(BODY.PEEK[])")
            if status != "OK":
                raise RuntimeError(f"imap_fetch_failed:{imap_id}:{status}")
            parts = fetched or []
            log_stage("imap_fetch_shape", imap_id=imap_id, fetched_type=type(fetched).__name__, item_types=[type(part).__name__ for part in parts], item_count=len(parts))
            # imaplib returns literal data as (envelope, payload) tuples and
            # everything else as plain bytes; only the tuples carry content.
            tuple_payloads: List[bytes] = []
            for part in parts:
                if isinstance(part, tuple):
                    tuple_payloads.append(part[1])
                elif isinstance(part, bytes):
                    log_stage("imap_fetch_bytes_preview", imap_id=imap_id, preview=part[:200].decode('utf-8', errors='replace'))
            if len(tuple_payloads) > 1:
                log_stage("imap_fetch_multituple", imap_id=imap_id, tuple_count=len(tuple_payloads))
            if tuple_payloads:
                messages.append((msg_id, b"".join(tuple_payloads)))
            log_stage("imap_fetch_ok", imap_id=imap_id, coalesced=len(tuple_payloads))
        return messages
    finally:
        # Best-effort logout; a failure here must not mask the real outcome.
        try:
            client.logout()
        except Exception:
            pass


def mark_seen(imap_host: str, imap_port: int, username: str, password: str, folder: str, msg_ids: List[bytes]) -> None:
    """Set the ``\\Seen`` flag on the given messages over a new connection.

    Args:
        imap_host: IMAP server host name.
        imap_port: IMAP-over-TLS port.
        username: Login name.
        password: Login password.
        folder: Mailbox containing the messages.
        msg_ids: Message UIDs as returned by ``fetch_unseen_messages``.
            Nothing is done (and no connection is opened) when empty.

    Raises:
        RuntimeError: when SELECT or a STORE does not return ``OK``; messages
            after the failing one are left unflagged.
    """
    if not msg_ids:
        return
    ctx = ssl.create_default_context()
    log_stage("imap_mark_seen_connect_start", host=imap_host, port=imap_port)
    client = imaplib.IMAP4_SSL(imap_host, imap_port, ssl_context=ctx, timeout=IMAP_TIMEOUT_SECONDS)
    try:
        client.login(username, password)
        log_stage("imap_mark_seen_login_ok", user=username)
        status, _ = client.select(f'"{folder}"')
        if status != "OK":
            raise RuntimeError(f"imap_select_failed:{folder}:{status}")
        log_stage("imap_mark_seen_select_ok", folder=folder, count=len(msg_ids))
        for msg_id in msg_ids:
            imap_id = msg_id.decode()
            log_stage("imap_mark_seen_start", imap_id=imap_id)
            # UID STORE: the identifier stays valid across connections,
            # unlike a message sequence number.
            status, _ = client.uid("STORE", msg_id, "+FLAGS", "(\\Seen)")
            if status != "OK":
                raise RuntimeError(f"imap_store_seen_failed:{imap_id}:{status}")
            log_stage("imap_mark_seen_ok", imap_id=imap_id)
    finally:
        # Best-effort logout; a failure here must not mask the real outcome.
        try:
            client.logout()
        except Exception:
            pass


def process_once(email_address: str, secret_path: str, imap_host: str, imap_port: int, folder: str, limit: int) -> dict:
    """Run one ingestion pass: fetch unseen mail, create Notion pages, mark seen.

    Each fetched message is parsed and classified, optionally run through
    employer resolution and qualification, and written to Notion. Messages
    with neither a subject nor a sender address are skipped as malformed.
    Every message that was skipped or successfully written is flagged as
    seen, even if a later message raises; the failing message and those
    after it stay unseen so that a later run can retry them.

    Args:
        email_address: IMAP login name.
        secret_path: Path of the file holding the IMAP password.
        imap_host: IMAP server host name.
        imap_port: IMAP-over-TLS port.
        folder: Mailbox to read.
        limit: Maximum number of messages to fetch; values <= 0 mean no limit.

    Returns:
        A summary dict with ``fetched_count``, ``processed_count``,
        ``skipped_malformed`` and a ``processed`` list of per-message details.

    Raises:
        Any exception raised while processing a message propagates after the
        already-completed messages have been marked as seen.
    """
    # Project-local modules are imported lazily, at call time.
    from notion_sender_parser import parse_record
    from employer_resolution import resolution_plan
    from qualification_flow import qualify, QualificationInput

    password = load_secret(secret_path)
    fetched = fetch_unseen_messages(imap_host, imap_port, email_address, password, folder, limit)
    processed = []
    skipped = 0
    seen_ids: List[bytes] = []
    try:
        for msg_id, raw_bytes in fetched:
            imap_id = msg_id.decode()
            msg = email.message_from_bytes(raw_bytes)
            subject = decode_mime_header(msg.get("Subject"))
            from_email, _from_raw = parse_addresses(msg.get("From"))
            _to_email, to_raw = parse_addresses(msg.get("To"))
            body_text = extract_plain_text(msg)
            received_iso = message_received_iso(msg)

            # Nothing useful can be recorded without a subject or a sender;
            # mark it seen so that it is not re-fetched on every run.
            if not subject and not from_email:
                log_stage("imap_skip_malformed", imap_id=imap_id)
                seen_ids.append(msg_id)
                skipped += 1
                continue

            parsed = parse_record(subject, body_text, from_email)
            classification = parsed["classification"]
            fields = parsed["fields"]

            # Employer resolution only applies to LinkedIn alerts and recruiter InMail.
            resolution = None
            if classification in ("linkedin_alert", "recruiter_inmail"):
                resolution = resolution_plan(
                    body_text, subject,
                    fields.get("company") or "",
                    fields.get("role_title") or "",
                )

            # Qualification is skipped for non-LinkedIn mail.
            qualified = None
            if classification != "non_linkedin":
                qi = QualificationInput(
                    role_title=fields.get("role_title"),
                    company=fields.get("company"),
                    salary=fields.get("salary"),
                    location=fields.get("location"),
                    posted_date=fields.get("posted_date"),
                    recruiter_name=fields.get("recruiter_name"),
                    inbound_recruiter=(classification == "recruiter_inmail"),
                )
                qualified = qualify(qi)

            log_stage("notion_create_start", imap_id=imap_id, subject=subject[:120], classification=classification)
            page = create_pipeline_record(
                subject, body_text, from_email, to_raw, received_iso,
                parsed=parsed, resolution=resolution, qualified=qualified,
            )
            log_stage("notion_create_ok", imap_id=imap_id, page_id=page.get("id"))
            seen_ids.append(msg_id)
            processed.append({
                "imap_id": imap_id,
                "page_id": page.get("id"),
                "subject": subject,
                "from_email": from_email,
                "source": parsed.get("source") or classify_source(from_email),
                "classification": classification,
                "received_iso": received_iso,
            })
    finally:
        # Runs on failure too: pages already created must not stay unseen,
        # or the next pass would ingest them again as duplicates.
        mark_seen(imap_host, imap_port, email_address, password, folder, seen_ids)
    return {
        "fetched_count": len(fetched),
        "processed_count": len(processed),
        "skipped_malformed": skipped,
        "processed": processed,
    }


def main() -> None:
    """Command-line entry point: run a single ingestion pass with ``--once``.

    Prints the JSON summary returned by ``process_once`` to stdout. Exits
    with a usage message when ``--once`` is not given.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--once", action="store_true", help="Run one IMAP ingestion pass")
    cli.add_argument("--email", default=DEFAULT_EMAIL)
    cli.add_argument("--secret-path", default=DEFAULT_SECRET_PATH)
    cli.add_argument("--imap-host", default=DEFAULT_IMAP_HOST)
    cli.add_argument("--imap-port", type=int, default=DEFAULT_IMAP_PORT)
    cli.add_argument("--folder", default=DEFAULT_FOLDER)
    cli.add_argument("--limit", type=int, default=DEFAULT_FETCH_LIMIT)
    options = cli.parse_args()

    if options.once:
        summary = process_once(
            options.email,
            options.secret_path,
            options.imap_host,
            options.imap_port,
            options.folder,
            options.limit,
        )
        print(json.dumps(summary, ensure_ascii=False))
        return

    # Only the one-shot mode is implemented.
    raise SystemExit("Use --once for a one-shot ingestion pass.")


if __name__ == "__main__":
    main()
