#!/usr/bin/env python3
"""
EA Connect 2025 - Unified Database & Search Tool

Commands:
    python ea_connect.py search "query"     # Semantic vector search (HyDE + rerank)
    python ea_connect.py deep "query"       # Deep search (multi-HyDE, RRF, graph, CoT)
    python ea_connect.py find --interest X  # Keyword search with filters
    python ea_connect.py similar --name X   # Find similar attendees
    python ea_connect.py stats              # Show statistics
    python ea_connect.py sql "SELECT ..."   # Raw SQL
    python ea_connect.py build              # Build database from JSON
    python ea_connect.py embed              # Generate embeddings
"""

import argparse
import json
import os
import sqlite3
import struct
import sys
from pathlib import Path

# Paths
SCRIPT_DIR = Path(__file__).parent
DB_PATH = SCRIPT_DIR / 'ea_connect.db'

# API key. An OPENROUTER_API_KEY environment variable takes precedence; the
# bundled key below is only the fallback (free $1 credit for easy use -
# please be respectful!).
# NOTE(review): a credential committed to source control is effectively
# public - consider rotating it and relying on the environment variable only.
OPENROUTER_API_KEY = (
    os.environ.get('OPENROUTER_API_KEY')
    or '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'
)


# ============================================================
# Database Connection & Helpers
# ============================================================

def get_conn(load_vec=False):
    """Open the SQLite database at DB_PATH with name-addressable rows.

    Args:
        load_vec: When true, also load the sqlite-vec extension into the
            connection before returning it.

    Returns:
        A sqlite3.Connection whose row_factory is sqlite3.Row.

    Exits the process with status 1 when load_vec is requested but the
    sqlite_vec package cannot be imported.
    """
    if load_vec:
        # Import before connecting so a missing dependency never leaves an
        # open connection behind.
        try:
            import sqlite_vec
        except ImportError:
            print("sqlite-vec not installed. Run: pip install sqlite-vec")
            sys.exit(1)

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row

    if load_vec:
        try:
            # Extension loading is enabled only for the duration of the load.
            conn.enable_load_extension(True)
            sqlite_vec.load(conn)
            conn.enable_load_extension(False)
        except Exception:
            # Do not leak the handle if the extension cannot be loaded.
            conn.close()
            raise

    return conn


def find_latest_json(prefix: str) -> Path:
    """Return the '<prefix>_*.json' file in SCRAPED_DIR whose path sorts last.

    "Latest" is decided purely by path ordering, so it relies on the file
    names sorting chronologically. Returns None when nothing matches.

    NOTE(review): SCRAPED_DIR is not defined in the part of this module
    visible here - confirm it is assigned elsewhere before this is called.
    """
    candidates = SCRAPED_DIR.glob(f'{prefix}_*.json')
    return max(candidates, default=None)


def serialize_f32(vec: list[float]) -> bytes:
    """Pack a list of floats into consecutive 4-byte floats for sqlite-vec."""
    layout = '%df' % len(vec)
    return struct.pack(layout, *vec)


def deserialize_f32(blob: bytes) -> list[float]:
    """Unpack a blob of consecutive 4-byte floats back into a Python list."""
    n_floats = len(blob) // 4
    values = struct.unpack('%df' % n_floats, blob)
    return list(values)


# ============================================================
# Data Parsing Helpers
# ============================================================

def slugify_to_readable(slug: str) -> str:
    """Turn a slug such as 'ai-safety-technical-research' into 'Ai Safety Technical Research'.

    Hyphens and underscores become spaces. The first word is always
    capitalized; later words are lower-cased when they are short connectives
    ('and', 'of', ...), capitalized when longer than two characters, and
    otherwise left exactly as written. Falsy input yields ''.
    """
    if not slug:
        return ''

    connectives = {'and', 'or', 'the', 'of', 'to', 'in', 'for', 'a', 'an'}

    def restyle(position, token):
        if position == 0:
            return token.capitalize()
        lowered = token.lower()
        if lowered in connectives:
            return lowered
        # Short tokens are left untouched.
        return token.capitalize() if len(token) > 2 else token

    tokens = slug.replace('-', ' ').replace('_', ' ').split()
    return ' '.join(restyle(pos, tok) for pos, tok in enumerate(tokens))


def extract_readable_values(items: list) -> list[str]:
    """Map every truthy slug in items through slugify_to_readable.

    A falsy items value (None, empty list) yields an empty list; falsy
    entries inside the list are skipped.
    """
    readable = []
    for raw in items or ():
        if raw:
            readable.append(slugify_to_readable(raw))
    return readable


# Country code -> display name lookup used by parse_country.
_COUNTRY_NAMES = {
    'US': 'United States', 'GB': 'United Kingdom', 'UK': 'United Kingdom',
    'CA': 'Canada', 'DE': 'Germany', 'FR': 'France', 'NL': 'Netherlands',
    'AU': 'Australia', 'NZ': 'New Zealand', 'IN': 'India', 'CN': 'China',
    'JP': 'Japan', 'KR': 'South Korea', 'SG': 'Singapore', 'HK': 'Hong Kong',
    'CH': 'Switzerland', 'AT': 'Austria', 'BE': 'Belgium', 'ES': 'Spain',
    'IT': 'Italy', 'PT': 'Portugal', 'SE': 'Sweden', 'NO': 'Norway',
    'DK': 'Denmark', 'FI': 'Finland', 'IE': 'Ireland', 'PL': 'Poland',
    'CZ': 'Czech Republic', 'MX': 'Mexico', 'BR': 'Brazil', 'AR': 'Argentina',
    'CL': 'Chile', 'CO': 'Colombia', 'ZA': 'South Africa', 'NG': 'Nigeria',
    'KE': 'Kenya', 'GH': 'Ghana', 'EG': 'Egypt', 'IL': 'Israel',
    'AE': 'United Arab Emirates', 'PH': 'Philippines', 'ID': 'Indonesia',
    'TH': 'Thailand', 'VN': 'Vietnam', 'MY': 'Malaysia', 'PK': 'Pakistan',
}


def parse_country(country_code: str) -> str:
    """Translate a country code into its display name.

    The lookup ignores case and surrounding whitespace. Codes missing from
    the table are returned exactly as given; falsy input yields None.
    """
    if not country_code:
        return None
    normalized = country_code.upper().strip()
    return _COUNTRY_NAMES.get(normalized, country_code)


def parse_job_seeking(items: list) -> str:
    """Classify the first job-seeking entry into a display label.

    Only items[0] is inspected (case-insensitively). The first rule whose
    marker occurs in it wins; an unrecognised value is prettified with
    slugify_to_readable. Falsy input yields None.
    """
    if not items:
        return None

    first = items[0]
    lowered = first.lower()

    # Ordered (markers, label) rules - earlier rules take precedence.
    rules = (
        (('actively-looking', 'actively looking'), 'Actively Looking'),
        (('interested-in-changing', 'open to'), 'Open to Change'),
        (('pitch-me', 'pitch me'), 'Open to Pitches'),
        (('not-interested', 'not interested'), 'Not Looking'),
    )
    for markers, label in rules:
        if any(marker in lowered for marker in markers):
            return label

    return slugify_to_readable(first)


def parse_career_stage(items: list) -> tuple[str, str]:
    """Split the first career-stage entry into (years bucket, employment type).

    Only items[0] is inspected (case-insensitively). Each half is decided
    independently by the first rule whose marker occurs in the value, and is
    None when no rule matches. Falsy input yields (None, None).
    """
    if not items:
        return None, None

    lowered = items[0].lower()

    def first_match(rules):
        # Rules are ordered; the earliest matching label wins.
        for label, markers in rules:
            if any(marker in lowered for marker in markers):
                return label
        return None

    years = first_match((
        ('0-2', ('0-2', '02')),
        ('3-5', ('3-5', '35')),
        ('6-15', ('6-15', '615')),
        ('15+', ('15+', '15-', '15plus')),
    ))
    emp_type = first_match((
        ('Student', ('student',)),
        ('Self-Employed', ('self-employed', 'self employed')),
        ('Employed', ('working',)),
    ))
    return years, emp_type


def parse_collab_prefs(items: list) -> dict:
    """Derive boolean collaboration flags from free-text preference entries.

    All entries are joined and lower-cased, then each flag is set by simple
    substring checks. Falsy input yields every flag as False.
    """
    haystack = ' '.join(items).lower() if items else ''

    def mentions(*markers):
        return any(marker in haystack for marker in markers)

    return {
        'hiring_senior': mentions('senior'),
        'hiring_mid': mentions('mid-level', 'mid level'),
        'hiring_junior': mentions('junior', 'entry'),
        'hiring_soon': mentions('3-6', 'next few months'),
        'seeking_collaborators': mentions('collaborator'),
        'seeking_cofounders': mentions('co-founder', 'cofounder'),
        'has_funding': mentions('funding', 'resources'),
    }


def build_embedding_text(role: str, org: str, about: str, event_goals: str,
                         can_help: str, interests: list, expertise: list) -> str:
    """Join the available profile fields into one space-separated text.

    Order is: role/organization headline, interests, expertise, bio, goals,
    then what the person can help with. Falsy fields are skipped. Returns
    None when no field contributed anything.
    """
    if role and org:
        headline = f"{role} at {org}."
    elif role:
        headline = f"{role}."
    elif org:
        headline = f"Works at {org}."
    else:
        headline = None

    # None marks a section that is absent; sections keep the order above.
    sections = [
        headline,
        f"Interests: {', '.join(interests)}." if interests else None,
        f"Expertise: {', '.join(expertise)}." if expertise else None,
        about.strip() if about else None,
        f"Goals: {event_goals.strip()}" if event_goals else None,
        f"Can help with: {can_help.strip()}" if can_help else None,
    ]
    present = [section for section in sections if section is not None]
    if not present:
        return None
    return ' '.join(present)


# ============================================================
# OpenRouter API
# ============================================================

def call_openrouter(messages: list, model: str = "openai/gpt-4o-mini", max_tokens: int = 500,
                    timeout: float = 60.0) -> str:
    """Send a chat-completion request to OpenRouter and return the reply text.

    Args:
        messages: Chat messages, passed through as the request's "messages".
        model: OpenRouter model identifier.
        max_tokens: Completion length cap sent with the request.
        timeout: Seconds to wait on the HTTP request before requests raises;
            without it a stalled connection would block the CLI forever.

    Returns:
        The "content" of the first choice's message.

    Raises:
        Exception: On any non-200 response (status and the first 200
            characters of the body are included).

    Exits the process with status 1 when OPENROUTER_API_KEY is empty.
    """
    import requests

    if not OPENROUTER_API_KEY:
        print("Error: OPENROUTER_API_KEY not set in .env")
        sys.exit(1)

    resp = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        },
        timeout=timeout,
    )

    if resp.status_code != 200:
        raise Exception(f"OpenRouter error: {resp.status_code} {resp.text[:200]}")

    return resp.json()["choices"][0]["message"]["content"]


def get_embedding(text: str, timeout: float = 60.0) -> list[float]:
    """Embed one text with openai/text-embedding-3-large via OpenRouter.

    Args:
        text: The text to embed.
        timeout: Seconds to wait on the HTTP request before requests raises;
            without it a stalled connection would block the CLI forever.

    Returns:
        The embedding vector of the first (only) item in the response.

    Raises:
        Exception: On any non-200 response (status and the first 200
            characters of the body are included).

    Exits the process with status 1 when OPENROUTER_API_KEY is empty.
    """
    import requests

    if not OPENROUTER_API_KEY:
        print("Error: OPENROUTER_API_KEY not set in .env")
        sys.exit(1)

    resp = requests.post(
        "https://openrouter.ai/api/v1/embeddings",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "openai/text-embedding-3-large",
            "input": text,
        },
        timeout=timeout,
    )

    if resp.status_code != 200:
        raise Exception(f"Embedding error: {resp.status_code} {resp.text[:200]}")

    return resp.json()["data"][0]["embedding"]


def get_embeddings_batch(texts: list[str], batch_size: int = 100,
                         timeout: float = 120.0) -> list[list[float]]:
    """Embed many texts with openai/text-embedding-3-large, batch by batch.

    Args:
        texts: Texts to embed; the result is in the same order.
        batch_size: Number of texts sent per HTTP request.
        timeout: Seconds to wait on each HTTP request before requests raises.

    Returns:
        One embedding per input text. An empty input yields an empty list
        without any request being made.

    Raises:
        Exception: On a non-200 response, or when a batch response does not
            contain exactly one embedding per input text.

    Exits the process with status 1 when OPENROUTER_API_KEY is empty.
    """
    import requests

    if not OPENROUTER_API_KEY:
        print("Error: OPENROUTER_API_KEY not set in .env")
        sys.exit(1)

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        print(f"  Embedding batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}...", end=" ", flush=True)

        resp = requests.post(
            "https://openrouter.ai/api/v1/embeddings",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": "openai/text-embedding-3-large",
                "input": batch,
            },
            timeout=timeout,
        )

        if resp.status_code != 200:
            raise Exception(f"Embedding error: {resp.status_code} {resp.text[:200]}")

        data = resp.json()["data"]
        # Callers pair results with their inputs positionally, so a short or
        # long response must fail loudly instead of shifting every later row.
        if len(data) != len(batch):
            raise Exception(f"Embedding error: expected {len(batch)} embeddings, got {len(data)}")

        # Sort by index to maintain order
        data.sort(key=lambda x: x["index"])
        all_embeddings.extend([d["embedding"] for d in data])
        print(f"done ({len(batch)} texts)")

    return all_embeddings


# ============================================================
# BUILD Command
# ============================================================

def _build_create_schema(cur):
    """Create the attendee tables and their lookup indexes on an empty database."""
    cur.executescript('''
        CREATE TABLE attendees (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            first_name TEXT,
            last_name TEXT,
            role TEXT,
            organization TEXT,
            profile_url TEXT,
            photo_url TEXT,
            about TEXT,
            event_goals TEXT,
            can_help_with TEXT,
            embedding_text TEXT,
            job_seeking TEXT,
            country TEXT,
            years_experience TEXT,
            employment_type TEXT,
            hiring_senior BOOLEAN DEFAULT 0,
            hiring_mid BOOLEAN DEFAULT 0,
            hiring_junior BOOLEAN DEFAULT 0,
            hiring_soon BOOLEAN DEFAULT 0,
            seeking_collaborators BOOLEAN DEFAULT 0,
            seeking_cofounders BOOLEAN DEFAULT 0,
            has_funding BOOLEAN DEFAULT 0,
            has_linkedin BOOLEAN DEFAULT 0,
            has_twitter BOOLEAN DEFAULT 0,
            is_mentor BOOLEAN DEFAULT 0,
            is_speaker BOOLEAN DEFAULT 0
        );

        CREATE TABLE interests (
            attendee_id TEXT,
            interest TEXT,
            PRIMARY KEY (attendee_id, interest),
            FOREIGN KEY (attendee_id) REFERENCES attendees(id)
        );

        CREATE TABLE expertise (
            attendee_id TEXT,
            skill TEXT,
            PRIMARY KEY (attendee_id, skill),
            FOREIGN KEY (attendee_id) REFERENCES attendees(id)
        );

        CREATE TABLE affiliations (
            attendee_id TEXT,
            affiliation TEXT,
            PRIMARY KEY (attendee_id, affiliation),
            FOREIGN KEY (attendee_id) REFERENCES attendees(id)
        );

        CREATE INDEX idx_interests_interest ON interests(interest);
        CREATE INDEX idx_expertise_skill ON expertise(skill);
        CREATE INDEX idx_attendees_country ON attendees(country);
        CREATE INDEX idx_attendees_org ON attendees(organization);
        CREATE INDEX idx_attendees_job_seeking ON attendees(job_seeking);
        CREATE INDEX idx_attendees_mentor ON attendees(is_mentor);
        CREATE INDEX idx_attendees_speaker ON attendees(is_speaker);
    ''')


def _build_load_json(path):
    """Parse one JSON file, decoding it as UTF-8 regardless of the platform locale."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _build_load_id_set(path, label):
    """Return the set of 'id' values in a JSON list file; empty set when path is None."""
    if not path:
        return set()
    print(f"Reading {path.name}...")
    ids = {entry['id'] for entry in _build_load_json(path)}
    print(f"  {len(ids)} {label}")
    return ids


def _build_insert_attendee(cur, a, mentor_ids, speaker_ids):
    """Insert one attendee dict plus its interest, expertise and affiliation rows."""
    aid = a.get('id')
    if not aid:
        # Records without an id cannot be keyed; skip them.
        return

    years, emp_type = parse_career_stage(a.get('career_stage', []))
    collab = parse_collab_prefs(a.get('collaboration_preferences', []))

    # `or []` also covers an explicit null in the JSON.
    social = a.get('social_networks') or []
    has_linkedin = 'LINKEDIN' in social
    has_twitter = 'TWITTER' in social or 'X' in social

    interests = extract_readable_values(a.get('areas_of_interest', []))
    expertise = extract_readable_values(a.get('areas_of_expertise', []))

    embedding_text = build_embedding_text(
        role=a.get('job_title'),
        org=a.get('organization'),
        about=a.get('biography'),
        event_goals=a.get('hoping_to_get'),
        can_help=a.get('how_i_can_help'),
        interests=interests,
        expertise=expertise,
    )

    # Positional insert: the order below must match the attendees table columns.
    values = (
        aid,
        # name is NOT NULL, so a missing/null full_name becomes ''.
        (a.get('full_name') or '').strip(),
        a.get('first_name'),
        a.get('last_name'),
        a.get('job_title'),
        a.get('organization'),
        a.get('swapcard_url'),
        a.get('photo_url'),
        a.get('biography'),
        a.get('hoping_to_get'),
        a.get('how_i_can_help'),
        embedding_text,
        parse_job_seeking(a.get('job_seeking', [])),
        parse_country(a.get('country')),
        years,
        emp_type,
        collab['hiring_senior'],
        collab['hiring_mid'],
        collab['hiring_junior'],
        collab['hiring_soon'],
        collab['seeking_collaborators'],
        collab['seeking_cofounders'],
        collab['has_funding'],
        has_linkedin,
        has_twitter,
        aid in mentor_ids,
        aid in speaker_ids,
    )
    placeholders = ','.join('?' * len(values))
    cur.execute(f'INSERT OR REPLACE INTO attendees VALUES ({placeholders})', values)

    for interest in interests:
        cur.execute('INSERT OR IGNORE INTO interests VALUES (?,?)', (aid, interest))

    for skill in expertise:
        cur.execute('INSERT OR IGNORE INTO expertise VALUES (?,?)', (aid, skill))

    for aff in extract_readable_values(a.get('affiliation_groups', [])):
        cur.execute('INSERT OR IGNORE INTO affiliations VALUES (?,?)', (aid, aff))


def _build_print_summary(cur, total):
    """Print per-table row counts and field coverage relative to total input records."""
    print("\n" + "=" * 60)
    print("DATABASE BUILT: ea_connect.db")
    print("=" * 60)

    for table in ['attendees', 'interests', 'expertise', 'affiliations']:
        count = cur.execute(f'SELECT COUNT(*) FROM {table}').fetchone()[0]
        print(f"  {table}: {count}")

    stats = {
        'With embedding text': 'embedding_text IS NOT NULL',
        'With bio': 'about IS NOT NULL',
        'With interests': 'id IN (SELECT DISTINCT attendee_id FROM interests)',
        'With expertise': 'id IN (SELECT DISTINCT attendee_id FROM expertise)',
        'With country': 'country IS NOT NULL',
        'Mentors': 'is_mentor = 1',
        'Speakers': 'is_speaker = 1',
    }

    print("\nCoverage:")
    for label, condition in stats.items():
        count = cur.execute(f'SELECT COUNT(*) FROM attendees WHERE {condition}').fetchone()[0]
        # An empty input file must not crash the summary with a division by zero.
        percent = 100 * count // total if total else 0
        print(f"  {label}: {count} ({percent}%)")


def cmd_build(args):
    """Build the SQLite database from the latest scraped JSON files.

    Reads the newest all_attendees_*.json (required) plus mentors_*.json and
    speakers_*.json (optional), recreates the database at DB_PATH from
    scratch, inserts every attendee that has an id, and prints a summary.

    Args:
        args: Parsed CLI arguments (not used by this command).

    Exits the process with status 1 when no attendees file is found; in that
    case any existing database is left untouched.
    """
    # Resolve and parse every input BEFORE deleting the old database, so a
    # missing or malformed file cannot destroy a working database.
    attendees_file = find_latest_json('all_attendees')
    mentors_file = find_latest_json('mentors')
    speakers_file = find_latest_json('speakers')

    if attendees_file is None:
        print("Error: no all_attendees_*.json file found; database left unchanged.")
        sys.exit(1)

    print(f"Reading {attendees_file.name}...")
    attendees = _build_load_json(attendees_file)
    print(f"  {len(attendees)} attendees")

    mentor_ids = _build_load_id_set(mentors_file, 'mentors')
    speaker_ids = _build_load_id_set(speakers_file, 'speakers')

    if DB_PATH.exists():
        DB_PATH.unlink()

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        _build_create_schema(cur)

        print("\nProcessing attendees...")
        for a in attendees:
            _build_insert_attendee(cur, a, mentor_ids, speaker_ids)
        conn.commit()

        _build_print_summary(cur, len(attendees))
    finally:
        # Release the handle even when an insert or query fails.
        conn.close()

    print("\nDone!")


# ============================================================
# EMBED Command
# ============================================================

def cmd_embed(args):
    """Generate and store embeddings for attendees that do not have one yet.

    Creates the vec_embeddings virtual table on first use, embeds the
    embedding_text of every attendee missing from it, and inserts the
    vectors in a single transaction.

    Args:
        args: Parsed CLI arguments (not used by this command).

    Raises:
        RuntimeError: If the number of embeddings returned does not match the
            number of attendees submitted.
    """
    conn = get_conn(load_vec=True)
    try:
        cur = conn.cursor()

        # Create embeddings table if not exists
        cur.execute('''
            CREATE VIRTUAL TABLE IF NOT EXISTS vec_embeddings USING vec0(
                attendee_id TEXT PRIMARY KEY,
                embedding FLOAT[3072]
            )
        ''')

        # Attendees that already have a vector are skipped, so the command
        # can be re-run after a database update.
        existing = {r[0] for r in cur.execute('SELECT attendee_id FROM vec_embeddings').fetchall()}

        # Blank text carries nothing to embed, so it is excluded up front
        # rather than sent to the embeddings API.
        rows = cur.execute('''
            SELECT id, embedding_text FROM attendees
            WHERE embedding_text IS NOT NULL AND TRIM(embedding_text) != ''
        ''').fetchall()

        to_embed = [(r['id'], r['embedding_text']) for r in rows if r['id'] not in existing]

        if not to_embed:
            print(f"All {len(existing)} attendees already have embeddings.")
            return

        print(f"Generating embeddings for {len(to_embed)} attendees...")
        print(f"  ({len(existing)} already exist)")

        ids = [attendee_id for attendee_id, _ in to_embed]
        texts = [text for _, text in to_embed]

        embeddings = get_embeddings_batch(texts)
        # zip() would silently truncate and pair vectors with wrong ids.
        if len(embeddings) != len(ids):
            raise RuntimeError(
                f"Expected {len(ids)} embeddings but received {len(embeddings)}"
            )

        print("\nInserting into database...")
        cur.executemany(
            'INSERT INTO vec_embeddings (attendee_id, embedding) VALUES (?, ?)',
            ((aid, serialize_f32(emb)) for aid, emb in zip(ids, embeddings)),
        )

        conn.commit()

        total = cur.execute('SELECT COUNT(*) FROM vec_embeddings').fetchone()[0]
        print(f"\nDone! {total} embeddings in database.")
    finally:
        # Close on every path, including the early return and API failures.
        conn.close()


# ============================================================
# SEARCH Command (Semantic Vector Search with HyDE + Reranking)
# ============================================================

# HyDE: ask the LLM to write a hypothetical profile that WOULD match the
# query; cmd_search then embeds that profile instead of the raw query.
# This solves the asymmetry problem (query space ≠ document space).
# The {query} placeholder is filled in with str.format().
HYDE_PROMPT = """You are helping search a conference attendee database. The user wants to find specific people.

Your task: Write a HYPOTHETICAL attendee profile (as if you ARE that ideal person) that would perfectly match what the user is looking for.

Write in first person. Include:
- Job title and organization type
- Areas of expertise and interests
- What you're working on / your background
- What you can offer or help with
- Your goals at the conference

Be specific and concrete. Write 3-4 sentences as if this is YOUR actual conference profile.

User is looking for: {query}

Hypothetical matching profile:"""

# For "I want X from someone" queries, flip to the "I offer X" perspective:
# the generated profile describes the provider, not the requester.
# cmd_search picks this template when detect_query_intent returns 'seeking'.
# The {query} placeholder is filled in with str.format().
INVERSE_HYDE_PROMPT = """You are helping search a conference attendee database. The user wants to find someone who can PROVIDE something to them.

Your task: Write a hypothetical profile FROM THE PERSPECTIVE of someone who would be a great match - someone who OFFERS what the user NEEDS.

The user needs: {query}

Write a first-person profile (3-4 sentences) as someone who:
- Has the resources/connections/expertise the user is seeking
- Would be interested in supporting this kind of work
- Describes what they offer and their background

Hypothetical profile of someone who could help:"""

# Reranking prompt - asks the LLM to score numbered candidates against the
# original query and answer with a JSON object of scores.
# Placeholders: {query} and {candidates}. The template is rendered with
# str.format(), so the literal braces of the JSON example are doubled.
RERANK_PROMPT = """You are evaluating how well conference attendees match a search query.

The user is looking for: {query}

Rate each candidate on a scale of 0-100 based on:
- Direct relevance to what the user needs
- Ability to actually help (not just topical overlap)
- Seniority/influence if relevant to the query
- Explicit signals (mentions funding, hiring, collaborating, etc.)

Return ONLY a JSON object mapping candidate numbers to scores, like:
{{"1": 85, "2": 45, "3": 92, ...}}

Candidates:
{candidates}

JSON scores:"""


def detect_query_intent(query: str) -> str:
    """Classify a query as 'seeking' or 'similar'.

    Returns 'seeking' when the lower-cased query contains any of the trigger
    phrases as a substring (the user wants something FROM the matches), and
    'similar' otherwise (the user wants similar/related people).
    """
    triggers = ('fund', 'funding', 'invest', 'grant', 'support', 'help me',
                'looking for', 'need', 'want', 'hire me', 'connect me',
                'introduce', 'advice', 'mentor')
    lowered = query.lower()
    wants_something = any(trigger in lowered for trigger in triggers)
    return 'seeking' if wants_something else 'similar'


def _search_rerank(query, results):
    """Reorder results by LLM relevance score.

    Best effort: on any failure (API error, no JSON in the reply) a notice is
    printed and the incoming order is returned unchanged.
    """
    import re

    # Build candidate summaries for reranking
    candidate_texts = []
    for i, r in enumerate(results, 1):
        summary = f"{i}. {r['name']}"
        if r['role']:
            summary += f", {r['role']}"
        if r['organization']:
            summary += f" at {r['organization']}"

        extras = []
        if r['is_mentor']:
            extras.append("mentor")
        if r['is_speaker']:
            extras.append("speaker")
        if r['has_funding']:
            extras.append("has funding")
        if r['seeking_collaborators']:
            extras.append("seeking collaborators")
        if extras:
            summary += f" [{', '.join(extras)}]"

        # Add snippet of what they can help with or about
        snippet = r['can_help_with'] or r['about'] or ''
        if snippet:
            summary += f"\n   \"{snippet[:200]}...\""

        candidate_texts.append(summary)

    candidates_str = "\n".join(candidate_texts)

    try:
        rerank_response = call_openrouter(
            [{"role": "user", "content": RERANK_PROMPT.format(
                query=query,
                candidates=candidates_str
            )}],
            model="openai/gpt-4o-mini",
            max_tokens=500
        )

        # The reply may wrap the JSON in prose; take the first flat object.
        json_match = re.search(r'\{[^{}]+\}', rerank_response)
        if not json_match:
            print("  (Reranking returned no scores, using vector order)")
            return results

        scores = json.loads(json_match.group())

        scored_results = []
        for i, r in enumerate(results, 1):
            raw_score = scores.get(str(i), 50)
            # One malformed score should not discard the whole reranking;
            # treat it like a missing score (neutral 50).
            try:
                score = float(raw_score)
            except (TypeError, ValueError):
                score = 50.0
            scored_results.append((score, r))

        # Stable sort: equal scores keep their vector-search order.
        scored_results.sort(key=lambda pair: -pair[0])
        print("  Reranked by relevance scores")
        return [r for _, r in scored_results]
    except Exception as e:
        print(f"  (Reranking failed: {e}, using vector order)")
        return results


def _search_run(cur, args):
    """Run the HyDE -> vector search -> filter -> rerank -> print pipeline on cur."""
    # Check if embeddings exist
    try:
        count = cur.execute('SELECT COUNT(*) FROM vec_embeddings').fetchone()[0]
    except sqlite3.Error:
        # Only database errors (e.g. the table does not exist yet) mean
        # "not embedded"; anything else should surface normally.
        print("Embeddings table not found. Run 'python ea_connect.py embed' first.")
        return
    if count == 0:
        print("No embeddings found. Run 'python ea_connect.py embed' first.")
        return

    query = args.query
    intent = detect_query_intent(query)

    # Step 1: HyDE - generate hypothetical matching profile
    if not args.raw:
        print(f"Query: \"{query}\"")
        print(f"Intent: {intent} (looking for people who can provide something)" if intent == 'seeking' else f"Intent: {intent} (finding similar/related people)")

        prompt = INVERSE_HYDE_PROMPT if intent == 'seeking' else HYDE_PROMPT

        print("\nGenerating hypothetical matching profile (HyDE)...")
        try:
            hyde_profile = call_openrouter(
                [{"role": "user", "content": prompt.format(query=query)}],
                model="openai/gpt-4o-mini",
                max_tokens=300
            )
            print(f"  → \"{hyde_profile[:120]}...\"")
            search_text = hyde_profile
        except Exception as e:
            print(f"  (HyDE failed: {e}, using raw query)")
            search_text = query
    else:
        search_text = query
        print(f"Raw search: \"{query}\"")

    # Step 2: Vector search for candidates (get more than we need for reranking)
    print("\nVector search...")
    search_embedding = get_embedding(search_text)

    # Get 3x candidates for reranking
    k_candidates = args.limit * 3 if not args.raw else args.limit

    results = cur.execute('''
        SELECT
            v.attendee_id,
            v.distance,
            a.name,
            a.role,
            a.organization,
            a.country,
            a.is_mentor,
            a.is_speaker,
            a.has_funding,
            a.seeking_collaborators,
            a.seeking_cofounders,
            a.profile_url,
            a.embedding_text,
            a.about,
            a.can_help_with
        FROM vec_embeddings v
        JOIN attendees a ON v.attendee_id = a.id
        WHERE v.embedding MATCH ? AND k = ?
        ORDER BY distance
    ''', (serialize_f32(search_embedding), k_candidates)).fetchall()

    # Apply hard filters (after the nearest-neighbour cut, so they can only
    # shrink the candidate list).
    filtered = []
    for r in results:
        if args.mentor and not r['is_mentor']:
            continue
        if args.speaker and not r['is_speaker']:
            continue
        if args.funding and not r['has_funding']:
            continue
        if args.country and args.country.lower() not in (r['country'] or '').lower():
            continue
        filtered.append(r)

    results = filtered

    if not results:
        print("No matches found.")
        return

    # Step 3: LLM Reranking (unless --raw or --no-rerank)
    if not args.raw and not args.no_rerank and len(results) > 5:
        print(f"Reranking {len(results)} candidates...")
        results = _search_rerank(query, results)

    # Display results
    results = results[:args.limit]
    print(f"\nTop {len(results)} matches:\n")

    for i, r in enumerate(results, 1):
        flags = ''
        if r['is_mentor']:
            flags += '🎓'
        if r['is_speaker']:
            flags += '🎤'
        if r['has_funding']:
            flags += '💰'
        if r['seeking_collaborators'] or r['seeking_cofounders']:
            flags += '🤝'

        header = f"{i:2d}. {r['name']}"
        if r['role'] and r['organization']:
            header += f" - {r['role']} @ {r['organization']}"
        elif r['organization']:
            header += f" @ {r['organization']}"
        if flags:
            header += f" {flags}"

        print(header)

        # Show distance for debugging if verbose
        if args.verbose:
            print(f"    [distance: {r['distance']:.4f}]")

        # Show relevant snippet
        snippet = r['can_help_with'] or r['about'] or r['embedding_text'] or ''
        if snippet:
            snippet = snippet[:150].replace('\n', ' ')
            print(f"    {snippet}...")

        if args.verbose:
            print(f"    {r['profile_url']}")
        print()


def cmd_search(args):
    """Semantic vector search with HyDE + LLM reranking.

    Args:
        args: Parsed CLI arguments. Reads query, raw, limit, mentor, speaker,
            funding, country, no_rerank and verbose.

    Results are printed to stdout; nothing is returned. The database
    connection is closed on every exit path.
    """
    conn = get_conn(load_vec=True)
    try:
        _search_run(conn.cursor(), args)
    finally:
        conn.close()



# ============================================================
# DEEP SEARCH Command (Multi-HyDE + RRF + Graph + CoT Rerank)
# ============================================================

# Query decomposition prompt - asks the LLM to split one query into 2-4
# facets and reply with {"aspects": [...]} (parsed by decompose_query).
# Rendered with str.format(): {query} is the placeholder, and the literal
# braces of the JSON examples are doubled.
DECOMPOSE_PROMPT = """Break this search query into 2-4 independent aspects/facets.
Each aspect should capture a different dimension of what the user wants.

Query: {query}

Return JSON only: {{"aspects": ["aspect1", "aspect2", ...]}}
Example: "AI safety funders who support career changers" -> {{"aspects": ["people who fund AI safety research", "people who support career transitions", "people open to funding newcomers"]}}

JSON:"""

# Multi-perspective HyDE prompts: generate_hyde_profiles renders each
# template with str.format(query=...) and requests one profile per entry.
HYDE_PERSPECTIVES = [
    # Self perspective (original): the matching person describes themselves
    """Write a first-person conference profile (3-4 sentences) as someone who perfectly matches: {query}
Include: role, organization type, expertise, what you offer, conference goals.""",

    # Colleague perspective: a third party recommends the matching person
    """A colleague is describing someone at a conference who matches: {query}
Write what they'd say: "You should meet [name], they're a... [3-4 sentences about role, expertise, what they can help with]" """,

    # Bio/LinkedIn perspective: formal third-person biography
    """Write a professional bio (3-4 sentences) for someone who matches: {query}
Format: "[Name] is a [role] at [org type]. They [expertise/work]. They're particularly interested in [goals]." """,
]

# Negative profile prompt (what we DON'T want): describes a plausible false
# positive. Used by generate_negative_profile; {query} is filled in with
# str.format().
NEGATIVE_HYDE_PROMPT = """Write a profile of someone who SEEMS relevant but would NOT actually be a good match for: {query}

This person might share some keywords or surface-level similarity, but lacks what the user actually needs.
Write 2-3 sentences as their profile.

Profile of a FALSE POSITIVE:"""

# Chain-of-thought reranking prompt: asks the LLM to reason about each
# candidate before returning a JSON object of 0-100 scores.
# Placeholders: {query} and {candidates}; the doubled braces are literal
# braces escaped for str.format().
# NOTE(review): its caller is not visible in this part of the file -
# presumably the deep-search reranking step.
COT_RERANK_PROMPT = """Evaluate how well each candidate matches this search query. Think step by step.

Query: {query}

For each candidate:
1. What is the user actually looking for?
2. What does this person specifically offer?
3. Is there a GENUINE match or just keyword/topic overlap?
4. Rate their ability to actually help (seniority, resources, access, relevance)

Score 0-100. Return JSON mapping candidate number to score.

Candidates:
{candidates}

Think through each, then return JSON like {{"1": 85, "2": 45, ...}}:"""


def reciprocal_rank_fusion(ranked_lists: list[list], k: int = 60) -> list:
    """Fuse several rankings into one with reciprocal rank fusion.

    Every appearance of an item at 0-based position `rank` contributes
    1 / (k + rank + 1) to its score. Items are returned by descending total
    score; ties keep the order in which the items were first seen.
    """
    fused = {}
    for ranking in ranked_lists:
        for rank, item_id in enumerate(ranking):
            fused[item_id] = fused.get(item_id, 0.0) + 1.0 / (k + rank + 1)

    # sorted() is stable even with reverse=True, so first-seen order breaks ties.
    return sorted(fused, key=fused.__getitem__, reverse=True)


def decompose_query(query: str) -> list[str]:
    """Ask the LLM to split a query into up to four independent aspects.

    Best effort: when the call fails, or the reply does not contain a JSON
    object with a non-empty "aspects" list of strings, the original query is
    returned as the only aspect.

    Returns:
        Between one and four non-empty aspect strings.
    """
    import re

    try:
        response = call_openrouter(
            [{"role": "user", "content": DECOMPOSE_PROMPT.format(query=query)}],
            model="openai/gpt-4o-mini",
            max_tokens=200
        )
        # The reply may wrap the JSON in prose; take the first flat object.
        match = re.search(r'\{[^{}]+\}', response)
        if match:
            aspects = json.loads(match.group()).get('aspects')
            if isinstance(aspects, list):
                # Keep only usable strings; each one is embedded downstream.
                cleaned = [a.strip() for a in aspects if isinstance(a, str) and a.strip()]
                if cleaned:
                    return cleaned[:4]
    except Exception:
        # Deliberate best-effort fallback. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate.
        pass
    return [query]


def generate_hyde_profiles(query: str, intent: str) -> list[str]:
    """Generate hypothetical matching profiles, one per HYDE_PERSPECTIVES entry.

    Args:
        query: The user's search query.
        intent: 'seeking' flips the query to the provider's point of view;
            any other value uses the query as is.

    Returns:
        The non-empty profiles that were generated. Perspectives whose call
        fails are skipped; when none succeeds the raw query is returned as
        the only entry.
    """
    # Use inverse perspective for 'seeking' intent
    base_query = query
    if intent == 'seeking':
        # Flip the perspective: user wants X -> generate profile of someone who offers X
        base_query = f"someone who can provide/offer: {query}"

    profiles = []
    for perspective_prompt in HYDE_PERSPECTIVES:
        try:
            profile = call_openrouter(
                [{"role": "user", "content": perspective_prompt.format(query=base_query)}],
                model="openai/gpt-4o-mini",
                max_tokens=250
            )
        except Exception:
            # Best effort per perspective. Catching Exception (not a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            continue
        # An empty completion gives nothing to embed.
        if profile:
            profiles.append(profile)

    return profiles if profiles else [query]


def generate_negative_profile(query: str) -> str | None:
    """Generate a profile of what we DON'T want (a likely false positive).

    Returns the LLM-generated text, or ``None`` when the call fails so the
    caller can skip the contrastive step.
    """
    try:
        return call_openrouter(
            [{"role": "user", "content": NEGATIVE_HYDE_PROMPT.format(query=query)}],
            model="openai/gpt-4o-mini",
            max_tokens=200
        )
    except Exception:
        # Optional step: swallow API errors, but not KeyboardInterrupt/SystemExit.
        return None


def vector_search_ids(cur, embedding: list[float], k: int = 50) -> list[str]:
    """Return the attendee IDs of the ``k`` nearest embeddings, closest first.

    The query vector is serialized with ``serialize_f32`` and matched against
    the ``vec_embeddings`` table; only the ID column is returned.
    """
    query_blob = serialize_f32(embedding)
    nearest = cur.execute('''
        SELECT v.attendee_id
        FROM vec_embeddings v
        WHERE v.embedding MATCH ? AND k = ?
        ORDER BY distance
    ''', (query_blob, k))
    return [row[0] for row in nearest.fetchall()]


def contrastive_search(cur, positive_embs: list, negative_emb: list | None, k: int = 100) -> list[str]:
    """Fuse vector searches for several positive embeddings, demoting negative hits.

    Each positive embedding is searched independently and the rankings are
    merged with reciprocal rank fusion. When a negative embedding is given,
    IDs that also appear among its ``k // 2`` nearest neighbours are moved to
    the end of the list (kept, not removed), preserving relative order.
    """
    per_embedding_rankings = [vector_search_ids(cur, emb, k=k) for emb in positive_embs]
    merged = reciprocal_rank_fusion(per_embedding_rankings)

    if not negative_emb:
        return merged

    penalized = set(vector_search_ids(cur, negative_emb, k=k//2))

    # Stable partition: unpenalized IDs first, penalized ones after.
    kept, demoted = [], []
    for attendee_id in merged:
        bucket = demoted if attendee_id in penalized else kept
        bucket.append(attendee_id)

    return kept + demoted


def graph_expand(cur, seed_ids: list[str], max_expand: int = 30) -> set[str]:
    """Expand results by finding colleagues and interest-similar people.

    Only the first five seeds drive the expansion:

    * colleagues: up to 5 other attendees from each of (at most) 3 seed
      organizations;
    * interest-similar: up to ``max_expand`` attendees sharing at least two of
      (at most) 5 seed interests.

    Returns a set containing every seed ID plus the discovered IDs. With no
    seeds an empty set is returned without touching the database.
    """
    expanded = set(seed_ids)

    top_ids = seed_ids[:5]
    if not top_ids:
        return expanded

    # Only '?' markers are interpolated into the SQL text; all values are bound.
    placeholders = ','.join('?' * len(top_ids))

    # Find colleagues (same org)
    orgs = cur.execute(f'''
        SELECT DISTINCT organization FROM attendees
        WHERE id IN ({placeholders}) AND organization IS NOT NULL
    ''', top_ids).fetchall()

    for org_row in orgs[:3]:
        colleagues = cur.execute(f'''
            SELECT id FROM attendees
            WHERE organization = ? AND id NOT IN ({placeholders})
            LIMIT 5
        ''', (org_row[0], *top_ids)).fetchall()
        expanded.update(r[0] for r in colleagues)

    # Find interest-similar people
    interests = cur.execute(f'''
        SELECT DISTINCT interest FROM interests
        WHERE attendee_id IN ({placeholders})
    ''', top_ids).fetchall()

    if interests:
        interest_list = [r[0] for r in interests[:5]]
        int_placeholders = ','.join('?' * len(interest_list))
        # max_expand is bound as a parameter rather than formatted into the SQL.
        similar = cur.execute(f'''
            SELECT attendee_id, COUNT(*) as overlap
            FROM interests
            WHERE interest IN ({int_placeholders})
            GROUP BY attendee_id
            HAVING overlap >= 2
            ORDER BY overlap DESC
            LIMIT ?
        ''', (*interest_list, max_expand)).fetchall()
        expanded.update(r[0] for r in similar)

    return expanded


def cot_rerank(query: str, candidates: list[dict], limit: int = 15) -> list[dict]:
    """Chain-of-thought reranking with reasoning.

    Sends summaries of the first 40 candidates to the LLM, parses the JSON
    score map it returns (candidate number -> score) and returns the
    candidates sorted by descending score, truncated to ``limit``.

    With five or fewer candidates no LLM call is made. If the call or the
    parsing fails, the input order is kept. In every case at most ``limit``
    candidates are returned.
    """
    import re

    if len(candidates) <= 5:
        # Too few to be worth an LLM call, but still honour the limit.
        return candidates[:limit]

    pool = candidates[:40]  # Cap at 40 for context limits

    # Build candidate summaries
    candidate_texts = []
    for i, c in enumerate(pool, 1):
        summary = f"{i}. {c['name']}"
        if c['role']:
            summary += f", {c['role']}"
        if c['organization']:
            summary += f" at {c['organization']}"

        extras = []
        if c['is_mentor']:
            extras.append("mentor")
        if c['is_speaker']:
            extras.append("speaker")
        if c['has_funding']:
            extras.append("has funding")
        if c['seeking_collaborators']:
            extras.append("seeking collaborators")
        if extras:
            summary += f" [{', '.join(extras)}]"

        snippet = c.get('can_help_with') or c.get('about') or ''
        if snippet:
            summary += f"\n   \"{snippet[:180]}...\""

        candidate_texts.append(summary)

    def as_score(value) -> float:
        """Coerce an LLM-provided score to a number; unusable values count as 50."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 50.0

    try:
        response = call_openrouter(
            [{"role": "user", "content": COT_RERANK_PROMPT.format(
                query=query,
                candidates="\n".join(candidate_texts)
            )}],
            model="openai/gpt-4o-mini",
            max_tokens=800
        )

        # Find JSON in response (might have reasoning before it)
        json_match = re.search(r'\{[^{}]+\}', response)
        if json_match:
            scores = json.loads(json_match.group())
            # JSON object keys are always strings, so look up by str(i) only.
            # Candidates the model did not score get a neutral 50.
            scored = [(as_score(scores.get(str(i), 50)), c)
                      for i, c in enumerate(pool, 1)]
            scored.sort(key=lambda pair: -pair[0])  # stable: ties keep input order
            return [c for _, c in scored][:limit]
    except Exception as e:
        print(f"  (CoT rerank failed: {e})")

    return candidates[:limit]


def cmd_deep(args):
    """Deep semantic search with multi-HyDE, RRF fusion, graph expansion, and CoT reranking.

    Pipeline (progress is printed for each step):
      1. detect the query intent;
      2. decompose the query into aspects;
      3. generate and embed HyDE profiles for every aspect;
      4. generate and embed a negative (contrastive) profile;
      5. run the fused vector search, optionally expand via the graph, fetch
         attendee rows and apply the --funding/--mentor/--speaker/--country filters;
      6. rerank with the chain-of-thought prompt and print the results.

    The database connection is closed on every exit path.
    """
    import time
    start = time.time()

    conn = get_conn(load_vec=True)
    try:
        cur = conn.cursor()

        # Check embeddings exist
        try:
            count = cur.execute('SELECT COUNT(*) FROM vec_embeddings').fetchone()[0]
        except sqlite3.Error:
            print("Embeddings table not found. Run 'python ea_connect.py embed' first.")
            return
        if count == 0:
            print("No embeddings found. Run 'python ea_connect.py embed' first.")
            return

        query = args.query
        print(f"{'='*60}")
        print(f"DEEP SEARCH: \"{query}\"")
        print(f"{'='*60}\n")

        # Step 1: Detect intent
        intent = detect_query_intent(query)
        print(f"[1/6] Intent: {intent}")

        # Step 2: Decompose query into aspects
        print(f"[2/6] Decomposing query...")
        aspects = decompose_query(query)
        print(f"      → {len(aspects)} aspects: {aspects}")

        # Step 3: Generate multi-perspective HyDE profiles for each aspect
        print(f"[3/6] Generating HyDE profiles ({len(aspects)} aspects × {len(HYDE_PERSPECTIVES)} perspectives)...")
        all_positive_embeddings = []

        for aspect in aspects:
            profiles = generate_hyde_profiles(aspect, intent)
            print(f"      → {len(profiles)} profiles for '{aspect[:50]}...'")

            # Embed all profiles; a profile that fails to embed is skipped.
            for profile in profiles:
                try:
                    emb = get_embedding(profile)
                except Exception:
                    continue
                all_positive_embeddings.append(emb)

        print(f"      → {len(all_positive_embeddings)} total embeddings")

        # Step 4: Generate negative profile and embed
        print(f"[4/6] Generating contrastive (negative) profile...")
        negative_emb = None
        neg_profile = generate_negative_profile(query)
        if neg_profile:
            try:
                negative_emb = get_embedding(neg_profile)
                print(f"      → \"{neg_profile[:80]}...\"")
            except Exception:
                # The negative profile is optional; continue without it.
                pass

        # Step 5: Contrastive search with RRF fusion
        print(f"[5/6] Running contrastive vector search + RRF fusion...")
        candidate_ids = contrastive_search(cur, all_positive_embeddings, negative_emb, k=100)
        print(f"      → {len(candidate_ids)} candidates from vector search")

        # Graph expansion
        if not args.no_expand:
            print(f"      Expanding via graph (colleagues, shared interests)...")
            expanded_ids = graph_expand(cur, candidate_ids[:10])
            # Merge: original order first, then expanded. graph_expand returns a
            # set, so the additions are sorted to keep the output reproducible.
            already_ranked = set(candidate_ids)
            additions = sorted((i for i in expanded_ids if i not in already_ranked), key=str)
            candidate_ids = candidate_ids + additions
            print(f"      → {len(candidate_ids)} candidates after expansion")

        # Fetch full candidate data
        # NOTE(review): only the first 60 IDs are fetched, and expanded IDs are
        # appended after the vector hits, so when vector search alone yields 60+
        # candidates the graph-expanded ones never reach reranking — confirm
        # whether that is intended.
        fetch_ids = candidate_ids[:60]
        placeholders = ','.join('?' * len(fetch_ids))
        rows = cur.execute(f'''
            SELECT
                a.id, a.name, a.role, a.organization, a.country,
                a.is_mentor, a.is_speaker, a.has_funding,
                a.seeking_collaborators, a.seeking_cofounders,
                a.profile_url, a.about, a.can_help_with
            FROM attendees a
            WHERE a.id IN ({placeholders})
        ''', fetch_ids).fetchall()

        # Convert to dicts and maintain order
        id_to_row = {r['id']: dict(r) for r in rows}
        candidates = [id_to_row[i] for i in candidate_ids if i in id_to_row]

        # Apply hard filters
        if args.funding:
            candidates = [c for c in candidates if c['has_funding']]
        if args.mentor:
            candidates = [c for c in candidates if c['is_mentor']]
        if args.speaker:
            candidates = [c for c in candidates if c['is_speaker']]
        if args.country:
            wanted_country = args.country.lower()
            candidates = [c for c in candidates if wanted_country in (c['country'] or '').lower()]

        print(f"      → {len(candidates)} after filters")

        # Step 6: CoT Reranking
        print(f"[6/6] Chain-of-thought reranking...")
        results = cot_rerank(query, candidates, limit=args.limit)

        elapsed = time.time() - start
        print(f"\n{'='*60}")
        print(f"TOP {len(results)} RESULTS ({elapsed:.1f}s)")
        print(f"{'='*60}\n")

        # Display results
        for i, r in enumerate(results, 1):
            flags = ''
            if r['is_mentor']:
                flags += '🎓'
            if r['is_speaker']:
                flags += '🎤'
            if r['has_funding']:
                flags += '💰'
            if r['seeking_collaborators'] or r['seeking_cofounders']:
                flags += '🤝'

            header = f"{i:2d}. {r['name']}"
            if r['role'] and r['organization']:
                header += f" - {r['role']} @ {r['organization']}"
            elif r['organization']:
                header += f" @ {r['organization']}"
            if flags:
                header += f" {flags}"

            print(header)

            snippet = r['can_help_with'] or r['about'] or ''
            if snippet:
                snippet = snippet[:150].replace('\n', ' ')
                print(f"    {snippet}...")

            if args.verbose:
                print(f"    {r['profile_url']}")
            print()
    finally:
        conn.close()


# ============================================================
# FIND Command (Keyword Search)
# ============================================================

def cmd_find(args):
    """Keyword-based search with filters.

    Builds a WHERE clause from the supplied CLI filters (all combined with
    AND), prints up to ``args.limit`` matching attendees ordered by name, and
    reports how many more matches exist. At least one filter is required.
    """
    conditions = []
    params = []

    # Columns are qualified with the attendees alias so they stay unambiguous
    # in the query that also joins the interests and expertise tables.
    if args.interest:
        conditions.append('''a.id IN (SELECT attendee_id FROM interests
                            WHERE LOWER(interest) LIKE ?)''')
        params.append(f'%{args.interest.lower()}%')

    if args.expertise:
        conditions.append('''a.id IN (SELECT attendee_id FROM expertise
                            WHERE LOWER(skill) LIKE ?)''')
        params.append(f'%{args.expertise.lower()}%')

    if args.country:
        conditions.append('LOWER(a.country) LIKE ?')
        params.append(f'%{args.country.lower()}%')

    if args.org:
        conditions.append('LOWER(a.organization) LIKE ?')
        params.append(f'%{args.org.lower()}%')

    if args.keyword:
        kw = f'%{args.keyword.lower()}%'
        conditions.append('''(LOWER(a.about) LIKE ? OR LOWER(a.event_goals) LIKE ?
                            OR LOWER(a.can_help_with) LIKE ? OR LOWER(a.role) LIKE ?
                            OR LOWER(a.organization) LIKE ?)''')
        params.extend([kw, kw, kw, kw, kw])

    if args.mentor:
        conditions.append('a.is_mentor = 1')
    if args.speaker:
        conditions.append('a.is_speaker = 1')
    if args.funding:
        conditions.append('a.has_funding = 1')
    if args.hiring:
        conditions.append('(a.hiring_senior OR a.hiring_mid OR a.hiring_junior OR a.hiring_soon)')
    if args.collaborators:
        conditions.append('a.seeking_collaborators = 1')

    # Validate before opening the database so no connection is left open.
    if not conditions:
        print("Specify at least one filter. Use --help for options.")
        return

    where_clause = ' AND '.join(conditions)
    query = f'''
        SELECT a.*,
               GROUP_CONCAT(DISTINCT i.interest) as interests,
               GROUP_CONCAT(DISTINCT e.skill) as expertise_list
        FROM attendees a
        LEFT JOIN interests i ON a.id = i.attendee_id
        LEFT JOIN expertise e ON a.id = e.attendee_id
        WHERE {where_clause}
        GROUP BY a.id
        ORDER BY a.name
        LIMIT ?
    '''
    count_query = f'''SELECT COUNT(DISTINCT a.id) FROM attendees a
                             WHERE {where_clause}'''

    conn = get_conn()
    try:
        rows = conn.execute(query, [*params, args.limit]).fetchall()
        total = conn.execute(count_query, params).fetchone()[0]
    finally:
        conn.close()

    print(f"\nFound {total} attendees:\n")

    for r in rows:
        flags = ''
        if r['is_mentor']:
            flags += '🎓'
        if r['is_speaker']:
            flags += '🎤'
        if r['has_funding']:
            flags += '💰'

        header = f"  {r['name']}"
        if r['role'] and r['organization']:
            header += f" - {r['role']} @ {r['organization']}"
        elif r['organization']:
            header += f" @ {r['organization']}"
        if flags:
            header += f" {flags}"
        print(header)

        if args.verbose:
            if r['country']:
                print(f"    Country: {r['country']}")
            if r['interests']:
                interests_text = r['interests']
                if len(interests_text) > 80:
                    interests_text = interests_text[:80] + '...'
                print(f"    Interests: {interests_text}")
            if r['about']:
                about = r['about'][:100].replace('\n', ' ')
                print(f"    About: {about}...")
            print(f"    Profile: {r['profile_url']}")
            print()

    if total > args.limit:
        print(f"\n... and {total - args.limit} more (use --limit to see more)")


# ============================================================
# SIMILAR Command
# ============================================================

def cmd_similar(args):
    """Find similar attendees by name or interests.

    With ``--name`` the target is the attendee whose name equals the argument
    (case-insensitive), falling back to the first substring match. With
    ``--interests`` the comma-separated list replaces the target interests.

    Every other attendee is scored as
    ``2 * shared interests + 1.5 * shared expertise + 3 * (target interests
    that are their expertise)`` and the top ``args.limit`` are printed.
    """
    from collections import defaultdict

    conn = get_conn()
    try:
        exclude_id = None

        if args.name:
            needle = args.name.lower()
            # Prefer an exact name match over an arbitrary substring match.
            row = conn.execute('SELECT id FROM attendees WHERE LOWER(name) = ?',
                               (needle,)).fetchone()
            if not row:
                row = conn.execute('SELECT id FROM attendees WHERE LOWER(name) LIKE ?',
                                   (f'%{needle}%',)).fetchone()
            if not row:
                print(f"No attendee found matching '{args.name}'")
                return
            exclude_id = row['id']

        # Load every attendee's interests/expertise with two queries instead
        # of two queries per attendee.
        interests_by_id = defaultdict(set)
        for r in conn.execute('SELECT attendee_id, interest FROM interests '
                              'WHERE interest IS NOT NULL'):
            interests_by_id[r['attendee_id']].add(r['interest'].lower())

        expertise_by_id = defaultdict(set)
        for r in conn.execute('SELECT attendee_id, skill FROM expertise '
                              'WHERE skill IS NOT NULL'):
            expertise_by_id[r['attendee_id']].add(r['skill'].lower())

        target_interests = set()
        target_expertise = set()

        if exclude_id is not None:
            target_interests = set(interests_by_id.get(exclude_id, ()))
            target_expertise = set(expertise_by_id.get(exclude_id, ()))
            print(f"Finding people similar to attendee with interests: {', '.join(list(target_interests)[:5])}...")

        if args.interests:
            # Explicit --interests override the interests of the named attendee.
            target_interests = {i.strip().lower() for i in args.interests.split(',')}

        if not target_interests and not target_expertise:
            print("Specify --name or --interests")
            return

        all_attendees = conn.execute('''
            SELECT a.id, a.name, a.role, a.organization, a.profile_url, a.about
            FROM attendees a
        ''').fetchall()

        no_tags = frozenset()
        scores = []
        for a in all_attendees:
            if a['id'] == exclude_id:
                continue

            their_interests = interests_by_id.get(a['id'], no_tags)
            their_expertise = expertise_by_id.get(a['id'], no_tags)

            shared_int = target_interests & their_interests
            shared_exp = target_expertise & their_expertise
            # Their expertise covers something the target is interested in.
            complement = target_interests & their_expertise

            if not shared_int and not shared_exp and not complement:
                continue

            score = len(shared_int) * 2 + len(shared_exp) * 1.5 + len(complement) * 3
            scores.append({
                'attendee': a,
                'score': score,
                'shared_interests': shared_int,
                'complementary': complement,
            })

        scores.sort(key=lambda x: -x['score'])

        print(f"\nTop {min(len(scores), args.limit)} Similar Attendees:\n")

        for i, s in enumerate(scores[:args.limit], 1):
            a = s['attendee']
            header = f"{i:2d}. {a['name']}"
            if a['role'] and a['organization']:
                header += f" - {a['role']} @ {a['organization']}"
            print(header)
            print(f"    Score: {s['score']:.1f}")
            if s['shared_interests']:
                print(f"    Shared: {', '.join(sorted(s['shared_interests']))[:60]}")
            if s['complementary']:
                print(f"    Expert in your interests: {', '.join(sorted(s['complementary']))[:60]}")
            print()
    finally:
        conn.close()


# ============================================================
# STATS Command
# ============================================================

def cmd_stats(args):
    """Show database statistics.

    Prints the attendee count, embedding count (when available), profile
    completeness percentages, the ten most common interests and countries,
    and counts for the special role flags. Percentages are reported as 0 when
    the attendees table is empty.
    """
    conn = get_conn()
    try:
        print("\n" + "=" * 60)
        print("EA CONNECT 2025 - STATISTICS")
        print("=" * 60)

        total = conn.execute('SELECT COUNT(*) FROM attendees').fetchone()[0]
        print(f"\nTotal attendees: {total}")

        def pct(count: int) -> int:
            """Whole-number percentage of ``total``; 0 for an empty table."""
            return 100 * count // total if total else 0

        # Check embeddings
        try:
            conn_vec = get_conn(load_vec=True)
            try:
                emb_count = conn_vec.execute('SELECT COUNT(*) FROM vec_embeddings').fetchone()[0]
            finally:
                conn_vec.close()
            print(f"With embeddings: {emb_count}")
        except (Exception, SystemExit):
            # get_conn(load_vec=True) calls sys.exit() when sqlite-vec is not
            # installed; stats should still be shown in that case.
            print("Embeddings: not generated")

        print("\nProfile Completeness:")
        for field, label in [
            ('embedding_text', 'Has embedding text'),
            ('about', 'Has bio'),
            ('country', 'Has country'),
        ]:
            # `field` comes from the fixed list above, never from user input.
            count = conn.execute(f'SELECT COUNT(*) FROM attendees WHERE {field} IS NOT NULL').fetchone()[0]
            print(f"  {pct(count):3d}% ({count:4d}) - {label}")

        has_interests = conn.execute('SELECT COUNT(DISTINCT attendee_id) FROM interests').fetchone()[0]
        print(f"  {pct(has_interests):3d}% ({has_interests:4d}) - Has interests")

        print("\nTop 10 Interests:")
        rows = conn.execute('''SELECT interest, COUNT(*) as c FROM interests
                              GROUP BY interest ORDER BY c DESC LIMIT 10''').fetchall()
        for r in rows:
            print(f"  {r['c']:3d} - {r['interest']}")

        print("\nTop 10 Countries:")
        rows = conn.execute('''SELECT country, COUNT(*) as c FROM attendees
                              WHERE country IS NOT NULL GROUP BY country
                              ORDER BY c DESC LIMIT 10''').fetchall()
        for r in rows:
            print(f"  {r['c']:3d} - {r['country']}")

        print("\nSpecial Roles:")
        for field, label in [('is_mentor', 'Mentors'), ('is_speaker', 'Speakers'),
                             ('has_funding', 'Has funding'), ('seeking_collaborators', 'Seeking collaborators')]:
            count = conn.execute(f'SELECT COUNT(*) FROM attendees WHERE {field} = 1').fetchone()[0]
            print(f"  {count:3d} - {label}")
    finally:
        conn.close()


# ============================================================
# SQL Command
# ============================================================

def cmd_sql(args):
    """Run custom SQL query.

    Executes ``args.query`` as given and prints a tab-separated header plus up
    to ``args.limit`` rows, each value truncated to 40 characters. NULL is
    shown as an empty cell; errors are printed rather than raised.
    """
    conn = get_conn()

    try:
        rows = conn.execute(args.query).fetchall()
        if rows:
            print('\t'.join(rows[0].keys()))
            print('-' * 60)
            for r in rows[:args.limit]:
                # Only NULL becomes blank; falsy values such as 0 are still shown.
                print('\t'.join('' if v is None else str(v)[:40] for v in r))
            if len(rows) > args.limit:
                print(f"\n... {len(rows) - args.limit} more rows")
        else:
            print("No results.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        conn.close()


# ============================================================
# Main
# ============================================================

def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Prints the top-level help when no subcommand is given.
    """
    parser = argparse.ArgumentParser(description='EA Connect 2025 - Database & Search Tool')
    subcommands = parser.add_subparsers(dest='command', help='Commands')

    # Commands without options: build and embed
    subcommands.add_parser('build', help='Build database from JSON files')
    subcommands.add_parser('embed', help='Generate embeddings for attendees')

    # search: semantic vector search
    search_p = subcommands.add_parser('search', help='Semantic vector search with HyDE + reranking')
    search_p.add_argument('query', help='Natural language search query')
    search_p.add_argument('--raw', action='store_true', help='Skip HyDE and reranking (raw embedding search)')
    search_p.add_argument('--no-rerank', action='store_true', help='Skip LLM reranking step')
    search_p.add_argument('--mentor', '-m', action='store_true', help='Only mentors')
    search_p.add_argument('--speaker', '-s', action='store_true', help='Only speakers')
    search_p.add_argument('--funding', '-f', action='store_true', help='Has funding')
    search_p.add_argument('--hiring', action='store_true', help='Is hiring')
    search_p.add_argument('--country', '-c', help='Filter by country')
    search_p.add_argument('--verbose', '-v', action='store_true')
    search_p.add_argument('--limit', '-l', type=int, default=15)

    # deep: advanced multi-HyDE search
    deep_p = subcommands.add_parser('deep', help='Deep search: multi-HyDE, RRF fusion, graph expansion, CoT reranking')
    deep_p.add_argument('query', help='Natural language search query')
    deep_p.add_argument('--no-expand', action='store_true', help='Skip graph expansion')
    deep_p.add_argument('--mentor', '-m', action='store_true', help='Only mentors')
    deep_p.add_argument('--speaker', '-s', action='store_true', help='Only speakers')
    deep_p.add_argument('--funding', '-f', action='store_true', help='Has funding')
    deep_p.add_argument('--country', '-c', help='Filter by country')
    deep_p.add_argument('--verbose', '-v', action='store_true')
    deep_p.add_argument('--limit', '-l', type=int, default=15)

    # find: keyword search
    find_p = subcommands.add_parser('find', help='Keyword search with filters')
    find_p.add_argument('--interest', '-i', help='Filter by interest')
    find_p.add_argument('--expertise', '-e', help='Filter by expertise')
    find_p.add_argument('--country', '-c', help='Filter by country')
    find_p.add_argument('--org', '-o', help='Filter by organization')
    find_p.add_argument('--keyword', '-k', help='Keyword search')
    find_p.add_argument('--mentor', '-m', action='store_true')
    find_p.add_argument('--speaker', '-s', action='store_true')
    find_p.add_argument('--funding', '-f', action='store_true')
    find_p.add_argument('--hiring', action='store_true')
    find_p.add_argument('--collaborators', action='store_true')
    find_p.add_argument('--verbose', '-v', action='store_true')
    find_p.add_argument('--limit', '-l', type=int, default=20)

    # similar: overlap-based recommendations
    similar_p = subcommands.add_parser('similar', help='Find similar attendees')
    similar_p.add_argument('--name', '-n', help='Find people similar to this attendee')
    similar_p.add_argument('--interests', '-i', help='Comma-separated interests')
    similar_p.add_argument('--verbose', '-v', action='store_true')
    similar_p.add_argument('--limit', '-l', type=int, default=15)

    # stats takes no options
    subcommands.add_parser('stats', help='Show statistics')

    # sql: raw queries
    sql_p = subcommands.add_parser('sql', help='Run custom SQL')
    sql_p.add_argument('query', help='SQL query')
    sql_p.add_argument('--limit', '-l', type=int, default=50)

    cli_args = parser.parse_args()

    if cli_args.command:
        # Subcommand name -> handler function.
        handlers = {
            'build': cmd_build,
            'embed': cmd_embed,
            'search': cmd_search,
            'deep': cmd_deep,
            'find': cmd_find,
            'similar': cmd_similar,
            'stats': cmd_stats,
            'sql': cmd_sql,
        }
        run = handlers[cli_args.command]
        run(cli_args)
    else:
        parser.print_help()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
