#!/usr/bin/env python3
"""
OpenTrack Co. — Sales Engineer Candidate Sourcing Pipeline

Three stages:
  1. Source candidates from Exa people search
  2. Rank by keyword signal scoring
  3. Output ranked CSV

Usage:
  python3 pipeline.py [--config config.yaml] [--output PATH]

Environment:
  EXA_API_KEY — required, injected by OpenClaw config
"""

import argparse
import csv
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

import requests
import yaml


def load_config(path: str) -> dict:
    """Load the pipeline configuration from a YAML file.

    Args:
        path: Filesystem path of the YAML config file.

    Returns:
        The parsed top-level mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the document is empty or its top level is not a
            mapping (every later stage indexes the config by key).
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding.
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # An empty file parses to None and a bare scalar/list parses to a
    # non-dict; fail here with a clear message instead of a TypeError later.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {path!r} must contain a YAML mapping at the top level"
        )
    return config


# ── Stage 1: Source ──────────────────────────────────────────────────────────

def search_exa(query: str, config: dict, api_key: str) -> list[dict]:
    """Send one search query to the configured Exa endpoint.

    Request options are read from ``config["exa"]``; ``endpoint`` is
    required, while ``type``, ``category``, ``num_results`` and ``contents``
    fall back to built-in defaults.

    Args:
        query: Search query text.
        config: Pipeline configuration containing an ``exa`` section.
        api_key: Value sent in the ``x-api-key`` request header.

    Returns:
        The ``results`` list of the JSON response, or an empty list when the
        response has no such key.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` on an HTTP error status.
    """
    settings = config["exa"]

    body = {
        "query": query,
        "type": settings.get("type", "auto"),
        "category": settings.get("category", "people"),
        "numResults": settings.get("num_results", 10),
        "contents": settings.get("contents", {"summary": True}),
    }

    response = requests.post(
        settings["endpoint"],
        headers={
            "x-api-key": api_key,
            "Content-Type": "application/json",
        },
        json=body,
        timeout=30,
    )
    response.raise_for_status()

    data = response.json()
    return data.get("results", [])


def extract_candidate(result: dict) -> dict | None:
    """Extract structured candidate data from an Exa result."""
    url = result.get("url", "")
    if not url:
        return None

    # Name from title or entities
    name = result.get("title", "").strip()

    # Entities contain structured data (people category)
    entities = result.get("entities", [])
    current_title = ""
    current_company = ""
    location = ""
    work_history = []

    for ent in entities:
        props = ent.get("properties", {})
        etype = ent.get("type", "").lower()

        # Person entity
        if "person" in etype or "name" in props:
            if not name:
                name = props.get("name", "")

            # Work history entries
            wh = props.get("workHistory", [])
            if isinstance(wh, list):
                for role in wh:
                    if isinstance(role, dict):
                        t = role.get("title", "")
                        c = role.get("company", role.get("organization", ""))
                        if isinstance(c, dict):
                            c = c.get("name", str(c))
                        if t or c:
                            work_history.append(f"{t} at {c}" if t and c else (t or str(c)))

            if not location:
                location = props.get("location", props.get("city", ""))

        # Organization entity
        if "organization" in etype or "company" in etype:
            org_name = props.get("name", "")
            if isinstance(org_name, dict):
                org_name = org_name.get("name", str(org_name))
            if not current_company and org_name:
                current_company = org_name

    # Try to parse current title from work history or title
    if work_history:
        current_title = current_title or work_history[0].split(" at ")[0]
        current_company = current_company or (work_history[0].split(" at ", 1)[1] if " at " in work_history[0] else "")

    # Fallback: parse from title
    if not current_title and name:
        raw_title = result.get("title", "")
        # Common pattern: "Name — Title at Company"
        for sep in [" — ", " - ", " | ", " – "]:
            if sep in raw_title:
                parts = raw_title.split(sep, 1)
                if len(parts) == 2:
                    role_part = parts[1]
                    if " at " in role_part:
                        current_title, current_company = role_part.split(" at ", 1)
                    else:
                        current_title = role_part
                    break

    summary = ""
    contents = result.get("contents", {})
    if isinstance(contents, dict):
        summary = contents.get("summary", "")
    if not summary:
        summary = result.get("summary", result.get("text", ""))

    return {
        "name": name or "Unknown",
        "profile_url": url,
        "location": location or "",
        "current_title": current_title or "",
        "current_company": current_company or "",
        "work_history": " | ".join(work_history) if work_history else "",
        "profile_description": summary or "",
    }


def source_candidates(config: dict, api_key: str) -> list[dict]:
    """Run every configured query and pool the extracted candidates.

    Each query in ``config["queries"]`` is sent through ``search_exa`` and
    every result through ``extract_candidate``. Candidates are de-duplicated
    on their exact ``profile_url`` string; the first occurrence wins and the
    order of first appearance is kept. Progress is reported on stderr.

    Args:
        config: Pipeline configuration with a ``queries`` list.
        api_key: Exa API key passed to each search call.

    Returns:
        The unique candidates in discovery order.
    """
    # Insertion-ordered dict keyed by profile URL: setdefault keeps the first
    # candidate seen for each URL.
    by_url = {}

    for query in config["queries"]:
        print(f"  Query: {query}", file=sys.stderr)
        hits = search_exa(query, config, api_key)
        print(f"    → {len(hits)} raw results", file=sys.stderr)

        for hit in hits:
            person = extract_candidate(hit)
            if person:
                by_url.setdefault(person["profile_url"], person)

    unique = list(by_url.values())
    print(f"  Total after dedup: {len(unique)}", file=sys.stderr)
    return unique


# ── Stage 2: Rank ────────────────────────────────────────────────────────────

def build_text(candidate: dict) -> str:
    """Build the lower-cased text blob that keyword scoring searches.

    Concatenates, separated by single spaces, the candidate's current title,
    current company, work history and profile description. Missing or empty
    fields are skipped.
    """
    field_names = (
        "current_title",
        "current_company",
        "work_history",
        "profile_description",
    )
    pieces = []
    for key in field_names:
        value = candidate.get(key, "")
        if value:
            pieces.append(value)
    return " ".join(pieces).lower()


def score_candidate(candidate: dict, config: dict) -> float:
    """Score a candidate by keyword hits across the weighted keyword groups.

    For every group named in ``config["scoring"]["weights"]`` the group's
    keywords (from ``config["scoring"]["keyword_groups"]``) are matched as
    case-insensitive substrings of the candidate text built by
    ``build_text``. A group's signal grows linearly with its number of
    distinct matching keywords and saturates at 1.0 after three hits, or
    after all keywords have matched when the group has fewer than three, so
    every non-empty group can earn its full weight. Groups with no keywords
    contribute nothing.

    Args:
        candidate: Candidate dict as produced by ``extract_candidate``.
        config: Pipeline configuration with a ``scoring`` section.

    Returns:
        The weighted sum of the group signals, rounded to 4 decimals. It
        stays within 0–1 when the weights are non-negative and sum to at
        most 1.
    """
    # Number of distinct keyword hits at which a group reaches full weight.
    full_signal_hits = 3

    text = build_text(candidate)
    scoring = config["scoring"]
    weights = scoring["weights"]
    groups = scoring["keyword_groups"]
    total = 0.0

    for group_name, weight in weights.items():
        # `or []` tolerates a group declared with a null keyword list.
        # Lower-casing into a set avoids double-counting case variants.
        needles = {str(kw).lower() for kw in (groups.get(group_name) or [])}
        # An empty keyword is a substring of every text; never count it.
        needles.discard("")
        if not needles:
            continue

        hits = sum(1 for kw in needles if kw in text)
        # Cap the saturation point at the group size; otherwise a group with
        # one or two keywords could never reach a signal of 1.0.
        saturation = min(full_signal_hits, len(needles))
        total += min(hits / saturation, 1.0) * weight

    return round(total, 4)


def rank_candidates(candidates: list[dict], config: dict) -> list[dict]:
    """Attach a score to each candidate and order them best-first.

    Every candidate dict gains a ``"score"`` key computed by
    ``score_candidate``. The list is sorted in place, highest score first,
    and the same list object is returned.
    """
    for entry in candidates:
        entry["score"] = score_candidate(entry, config)

    def by_score(entry: dict) -> float:
        return entry["score"]

    candidates.sort(key=by_score, reverse=True)
    return candidates


# ── Stage 3: Output ──────────────────────────────────────────────────────────

def write_csv(candidates: list[dict], config: dict) -> str:
    """Write ranked candidates to a timestamped CSV."""
    out_cfg = config["output"]
    out_dir = Path(out_cfg.get("dir", "output"))
    out_dir.mkdir(parents=True, exist_ok=True)

    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    filename = f"{out_cfg.get('filename_prefix', 'candidates')}_{ts}.csv"
    path = out_dir / filename

    columns = [
        "score",
        "name",
        "current_title",
        "current_company",
        "location",
        "work_history",
        "profile_description",
        "profile_url",
    ]

    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        for c in candidates:
            writer.writerow({col: c.get(col, "") for col in columns})

    return str(path)


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Sales Engineer candidate sourcing pipeline")
    parser.add_argument("--config", default="config.yaml", help="Path to config YAML")
    parser.add_argument("--output", default=None, help="Override output CSV path")
    args = parser.parse_args()

    api_key = os.environ.get("EXA_API_KEY")
    if not api_key:
        print("ERROR: EXA_API_KEY env var not set", file=sys.stderr)
        sys.exit(1)

    config = load_config(args.config)

    print("Stage 1: Sourcing candidates...", file=sys.stderr)
    candidates = source_candidates(config, api_key)
    if not candidates:
        print("No candidates found. Exiting.", file=sys.stderr)
        sys.exit(0)

    print(f"Stage 2: Ranking {len(candidates)} candidates...", file=sys.stderr)
    candidates = rank_candidates(candidates, config)

    print("Stage 3: Writing CSV...", file=sys.stderr)
    csv_path = args.output or write_csv(candidates, config)

    print(f"\nDone. {len(candidates)} candidates written to {csv_path}", file=sys.stderr)
    # Also print CSV path to stdout for automation
    print(csv_path)

    # Print top 5 summary to stderr
    print("\nTop 5 candidates:", file=sys.stderr)
    for c in candidates[:5]:
        print(f"  {c['score']:.3f}  {c['name']}  —  {c['current_title']} at {c['current_company']}", file=sys.stderr)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
