kicad-lib/scripts/extract_symbols.py

#!/usr/bin/env python3
"""
KiCad 9 Symbol Metadata Extractor
==================================
Walks every .kicad_sch file in the project directory and extracts
metadata for every placed symbol (component instance), correctly
expanding hierarchical sheet instances so that each unique reference
in the final design becomes its own record.

KiCad stores multi-instance sheets by embedding an `(instances ...)`
block in each symbol.  That block contains one `(path ...)` entry per
sheet instantiation, each with the authoritative reference for that
copy.  This script reads those paths so a sheet used N times produces
N distinct records per symbol.

Output: extract_symbols.json  (written to the project directory)

Usage:
    python3 extract_symbols.py [project_dir]

If project_dir is omitted, the directory containing this script is used.
"""

import json
import sys
from pathlib import Path


# ---------------------------------------------------------------------------
# S-expression parser
# ---------------------------------------------------------------------------

def _tokenize(text: str) -> list:
    """
    Convert raw KiCad S-expression text into a flat list of tokens.
    Token forms:
        ('OPEN',)          – opening paren
        ('CLOSE',)         – closing paren
        ('ATOM', value)    – unquoted word / number / bool
        ('STR',  value)    – double-quoted string (escapes resolved)
    """
    tokens = []
    i, n = 0, len(text)
    while i < n:
        c = text[i]
        if c in ' \t\r\n':
            i += 1
        elif c == '(':
            tokens.append(('OPEN',))
            i += 1
        elif c == ')':
            tokens.append(('CLOSE',))
            i += 1
        elif c == '"':
            j = i + 1
            buf = []
            while j < n:
                if text[j] == '\\' and j + 1 < n:
                    buf.append(text[j + 1])
                    j += 2
                elif text[j] == '"':
                    j += 1
                    break
                else:
                    buf.append(text[j])
                    j += 1
            tokens.append(('STR', ''.join(buf)))
            i = j
        else:
            j = i
            while j < n and text[j] not in ' \t\r\n()':
                j += 1
            tokens.append(('ATOM', text[i:j]))
            i = j
    return tokens


def _parse(tokens: list, pos: int) -> tuple:
    """
    Recursively parse one S-expression value starting at *pos*.
    Returns (parsed_value, next_pos).
    A list/node becomes a Python list; atoms and strings become strings.
    """
    tok = tokens[pos]
    kind = tok[0]
    if kind == 'OPEN':
        pos += 1
        items = []
        while tokens[pos][0] != 'CLOSE':
            item, pos = _parse(tokens, pos)
            items.append(item)
        return items, pos + 1          # consume CLOSE
    elif kind in ('ATOM', 'STR'):
        return tok[1], pos + 1
    else:
        raise ValueError(f"Unexpected token at pos {pos}: {tok}")


def parse_sexp(text: str):
    """
    Tokenize *text* and parse the S-expression that starts at its first token.

    Returns the parsed root value (a nested list for a normal KiCad file).
    Any tokens following that first expression are ignored.
    """
    parsed = _parse(_tokenize(text), 0)
    return parsed[0]


# ---------------------------------------------------------------------------
# Helpers to navigate parsed S-expressions
# ---------------------------------------------------------------------------

def tag(node) -> str:
    """
    Return the leading keyword of a parsed node.

    Yields '' when *node* is not a list, is empty, or its first element is
    not a string.
    """
    if not isinstance(node, list) or not node:
        return ''
    head = node[0]
    return head if isinstance(head, str) else ''


def children(node: list) -> list:
    """Return everything after the first element of *node*; [] for non-lists."""
    if not isinstance(node, list):
        return []
    return node[1:]


def first_child_with_tag(node: list, name: str):
    """Return the first list-valued child of *node* tagged *name*, else None."""
    matching = (
        candidate
        for candidate in children(node)
        if isinstance(candidate, list) and tag(candidate) == name
    )
    return next(matching, None)


def all_children_with_tag(node: list, name: str) -> list:
    """Return every list-valued child of *node* tagged *name*, in order."""
    found = []
    for candidate in children(node):
        if isinstance(candidate, list) and tag(candidate) == name:
            found.append(candidate)
    return found


def scalar(node, index: int = 1, default=None):
    """
    Return ``node[index]`` when *node* is a list longer than *index*.

    Otherwise (non-list or too short) return *default*.
    """
    if not isinstance(node, list):
        return default
    if len(node) <= index:
        return default
    return node[index]


# ---------------------------------------------------------------------------
# Instance path extraction
# ---------------------------------------------------------------------------

def extract_instances(sym_node: list) -> list[dict]:
    """
    Flatten a symbol's (instances ...) block into one dict per (path ...)
    entry, across every (project ...) child.  Keys of each dict:
        path      – second element of the path node ('' if absent)
        reference – value of the path's (reference ...) child, or None
        unit      – value of the path's (unit ...) child, or None
        project   – second element of the enclosing project node ('' if absent)

    Returns an empty list when the symbol has no instances block.
    """
    block = first_child_with_tag(sym_node, 'instances')
    if block is None:
        return []

    collected = []
    for project in all_children_with_tag(block, 'project'):
        owner = scalar(project, 1, '')
        # scalar() already yields None for a missing child node, so no
        # explicit presence check is needed for reference / unit.
        collected.extend(
            {
                'path': scalar(entry, 1, ''),
                'reference': scalar(first_child_with_tag(entry, 'reference'), 1),
                'unit': scalar(first_child_with_tag(entry, 'unit'), 1),
                'project': owner,
            }
            for entry in all_children_with_tag(project, 'path')
        )
    return collected


# ---------------------------------------------------------------------------
# Symbol extraction
# ---------------------------------------------------------------------------

def extract_symbol_records(sym_node: list, sheet_file: str) -> list[dict]:
    """
    Extract metadata from a placed-symbol node and return one record per
    hierarchical instance (i.e. one record per path in the instances block).

    For a sheet used only once, this produces a single record.
    For a sheet instantiated N times, this produces N records — each with
    its own reference designator taken from the instances block.

    If the symbol has no instances block, a single record is returned whose
    reference comes from the "Reference" property, whose instance_unit comes
    from the symbol's own (unit ...) child, and whose instance_path and
    instance_project are None.

    Every record owns independent copies of its 'properties' and 'at' dicts,
    so mutating one record never affects another.
    """
    # --- Shared fields (same for all instances of this symbol placement) ---
    shared = {
        'sheet_file': sheet_file,
        'lib_id': None,
        'at': None,
        'exclude_from_sim': None,
        'in_bom': None,
        'on_board': None,
        'dnp': None,
        'uuid': None,
        'properties': {},
    }
    # Symbol-level (unit N); kept out of `shared` so the record schema is
    # unchanged, and used only as a fallback for instance_unit.
    symbol_unit = None

    for child in children(sym_node):
        if not isinstance(child, list):
            continue
        t = tag(child)
        if t in ('lib_id', 'uuid'):
            shared[t] = scalar(child, 1)
        elif t == 'at':
            shared['at'] = {
                'x': scalar(child, 1),
                'y': scalar(child, 2),
                'angle': scalar(child, 3, 0),
            }
        elif t in ('exclude_from_sim', 'in_bom', 'on_board', 'dnp'):
            # Flags are stored as the atoms yes / no.
            shared[t] = scalar(child, 1) == 'yes'
        elif t == 'unit':
            symbol_unit = scalar(child, 1)
        elif t == 'property':
            prop_name = scalar(child, 1)
            if prop_name is not None:
                shared['properties'][prop_name] = scalar(child, 2)

    # Promote standard properties for convenient access
    props = shared['properties']
    shared['value']       = props.get('Value')
    shared['footprint']   = props.get('Footprint')
    shared['datasheet']   = props.get('Datasheet')
    shared['description'] = props.get('Description')

    # --- Per-instance fields (one record per path in instances block) ---
    # With no instances block, synthesise a single pseudo-instance from the
    # symbol-level data so both cases share one code path.
    instances = extract_instances(sym_node) or [{
        'path': None,
        'reference': props.get('Reference'),
        'unit': symbol_unit,
        'project': None,
    }]

    records = []
    for inst in instances:
        record = dict(shared)
        # Copy nested dicts so each record is independent of its siblings.
        record['properties'] = dict(props)
        if shared['at'] is not None:
            record['at'] = dict(shared['at'])
        unit = inst['unit']
        record['reference']        = inst['reference']
        record['instance_path']    = inst['path']
        record['instance_unit']    = unit if unit is not None else symbol_unit
        record['instance_project'] = inst['project']
        records.append(record)

    return records


# ---------------------------------------------------------------------------
# Hierarchy walker
# ---------------------------------------------------------------------------

def find_reachable_sheets(root_sch: Path) -> list[Path]:
    """
    Walk the sheet hierarchy breadth-first starting from *root_sch* and
    return an ordered list of every .kicad_sch file that is reachable
    (referenced directly or transitively through a sheet's "Sheetfile"
    property).  *root_sch* itself is always the first entry.

    Each file is visited once even when it is referenced N times.  Files are
    identified by their resolved path, so two different sheets that merely
    share a file name in different sub-directories are both visited.

    A file that cannot be read (OSError) is still listed but its sub-sheets
    are not followed.  Referenced files that do not exist are skipped.
    """
    from collections import deque

    def _identity(path: Path) -> Path:
        # Canonical key for "same file"; fall back to an absolute path if
        # resolution itself fails (e.g. a symlink loop).
        try:
            return path.resolve()
        except (OSError, RuntimeError):
            return path.absolute()

    reachable: list[Path] = []
    seen: set[Path] = {_identity(root_sch)}
    queue: deque[Path] = deque([root_sch])

    while queue:
        sch = queue.popleft()
        reachable.append(sch)

        try:
            text = sch.read_text(encoding='utf-8')
        except OSError:
            continue

        for sheet in all_children_with_tag(parse_sexp(text), 'sheet'):
            for prop in all_children_with_tag(sheet, 'property'):
                if scalar(prop, 1) != 'Sheetfile':
                    continue
                child_filename = scalar(prop, 2)
                if not child_filename or not isinstance(child_filename, str):
                    continue
                # Sheetfile is interpreted relative to the referencing sheet.
                child_path = sch.parent / child_filename
                key = _identity(child_path)
                if key in seen or not child_path.exists():
                    continue
                seen.add(key)
                queue.append(child_path)

    return reachable


# ---------------------------------------------------------------------------
# Per-file parsing
# ---------------------------------------------------------------------------

def extract_from_schematic(sch_path: Path) -> list[dict]:
    """
    Read and parse a single .kicad_sch file and collect the records of every
    placed symbol found directly under the root node.

    Only root-level (symbol ...) nodes that carry a lib_id child count as
    placed symbols; the lib_symbols section and every other node are ignored.
    """
    root = parse_sexp(sch_path.read_text(encoding='utf-8'))
    sheet_name = sch_path.name

    collected = []
    for node in children(root):
        # tag() is '' for atoms and differs for lib_symbols, so one test
        # rejects everything that is not a symbol node.
        if tag(node) != 'symbol':
            continue
        if first_child_with_tag(node, 'lib_id') is None:
            continue
        collected.extend(extract_symbol_records(node, sheet_name))

    return collected


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def get_root_uuid(project_dir: Path) -> str | None:
    """
    Find the UUID of the root schematic by reading the .kicad_pro file
    (which names the root sheet) or by scanning for the top-level sheet.
    Returns the UUID string, or None if it cannot be determined.
    """
    # The .kicad_pro file tells us the root schematic filename
    pro_files = list(project_dir.glob('*.kicad_pro'))
    root_sch: Path | None = None

    if pro_files:
        import json as _json
        try:
            pro = _json.loads(pro_files[0].read_text(encoding='utf-8'))
            root_name = pro.get('sheets', [{}])[0] if pro.get('sheets') else None
            # Fall back: just find a .kicad_sch with the same stem as the .pro
            root_sch = project_dir / (pro_files[0].stem + '.kicad_sch')
        except Exception:
            pass

    if root_sch is None or not root_sch.exists():
        # Guess: the .kicad_sch whose stem matches the .kicad_pro
        if pro_files:
            candidate = project_dir / (pro_files[0].stem + '.kicad_sch')
            if candidate.exists():
                root_sch = candidate

    if root_sch is None or not root_sch.exists():
        return None

    # Extract the first (uuid ...) at the root level of the file
    import re
    text = root_sch.read_text(encoding='utf-8')
    m = re.search(r'\(uuid\s+"([^"]+)"', text)
    return m.group(1) if m else None


def _is_active_path(instance_path, root_uuid: str) -> bool:
    """Tell whether *instance_path* belongs to the hierarchy rooted at *root_uuid*."""
    if instance_path is None:
        # Record built without an instances block: there is no path that
        # could be stale, so keep it.
        return True
    root_path = f'/{root_uuid}'
    # Symbols on the root sheet carry exactly '/<root_uuid>'; symbols on
    # sub-sheets carry '/<root_uuid>/<sheet_uuid>[/...]'.
    return instance_path == root_path or instance_path.startswith(root_path + '/')


def _relative_display(path: Path, base: Path) -> str:
    """Render *path* relative to *base*, or unchanged if it lies outside *base*."""
    try:
        return str(path.relative_to(base))
    except ValueError:
        return str(path)


def _cell(value) -> str:
    """Render a possibly-missing field for the fixed-width summary table."""
    return '' if value is None else str(value)


def main(project_dir: Path):
    """
    Extract every placed symbol of the project in *project_dir*, write the
    result to ``extract_symbols.json`` inside that directory and print a
    progress log plus a summary table to stdout.

    Steps:
      1. Locate the root schematic (same stem as the first *.kicad_pro) and
         walk its sheet hierarchy; if there is none, scan every .kicad_sch
         below *project_dir* instead.
      2. Parse each file into per-instance symbol records.
      3. If the root UUID is known, drop records whose instance path does
         not belong to that root (stale instance data).
      4. De-duplicate by (instance_path, uuid), then drop records whose uuid
         was already seen in a different sheet file (UUID collisions).
      5. Report multi-unit references, write the JSON, print the summary.

    Exits the process with status 1 when no schematic files are found.
    """
    from collections import defaultdict

    # Determine root schematic and walk the real hierarchy
    root_uuid = get_root_uuid(project_dir)

    pro_files = list(project_dir.glob('*.kicad_pro'))
    root_sch = project_dir / (pro_files[0].stem + '.kicad_sch') if pro_files else None

    if root_sch is not None and root_sch.exists():
        sch_files = find_reachable_sheets(root_sch)
        print(f"Root sheet: {root_sch.name}")
        print(f"Found {len(sch_files)} reachable schematic file(s) in hierarchy:")
    else:
        # Fallback: glob everything except KiCad autosave copies.
        sch_files = sorted(
            p for p in project_dir.rglob('*.kicad_sch')
            if not p.name.startswith('_autosave')
        )
        print(f"Warning: could not find root schematic; scanning all {len(sch_files)} files.\n")

    if not sch_files:
        print(f"No .kicad_sch files found in {project_dir}", file=sys.stderr)
        sys.exit(1)

    # A sheet may live outside project_dir (e.g. '../shared/x.kicad_sch').
    display_names = [_relative_display(f, project_dir) for f in sch_files]
    for name in display_names:
        print(f"  {name}")

    all_records: list[dict] = []

    for sch_path in sch_files:
        print(f"\nParsing {sch_path.name} ...", end=' ', flush=True)
        records = extract_from_schematic(sch_path)
        print(f"{len(records)} instance record(s)")
        all_records.extend(records)

    # All records come from reachable sheets, so no orphan filtering needed.
    # Still filter by root UUID to catch stale instance paths.
    if root_uuid:
        active = [r for r in all_records
                  if _is_active_path(r.get('instance_path'), root_uuid)]
        stale = len(all_records) - len(active)
        print(f"\nTotal records : {len(all_records)}")
        if stale:
            print(f"Stale paths dropped: {stale}")
    else:
        active = all_records
        print(f"\nTotal records: {len(all_records)}")

    # ---- Stage 1: dedup by (instance_path, uuid) ----
    # Collapses records that were seen from multiple sheet scans into one.
    seen: set = set()
    stage1: list[dict] = []
    for r in active:
        key = (r.get('instance_path'), r.get('uuid'))
        if key not in seen:
            seen.add(key)
            stage1.append(r)

    # ---- Stage 2: dedup by uuid across different sheet files ----
    # If the SAME uuid appears in two *different* .kicad_sch files, that is a
    # UUID collision in the design (copy-paste without UUID regeneration).
    # The same uuid appearing in the same sheet file with different instance
    # paths is *correct* — it is how multi-instance sheets work, so those are
    # left alone.
    uuid_sheets: dict = {}         # uuid -> set of sheet_files seen
    uuid_collisions: dict = {}     # uuid -> list of colliding records
    unique: list[dict] = []
    for r in stage1:
        u = r.get('uuid')
        if u is None:
            # Without a uuid there is nothing to collide on; keep the record.
            unique.append(r)
            continue
        sf = r.get('sheet_file', '')
        sheets_so_far = uuid_sheets.setdefault(u, set())
        if not sheets_so_far or sf in sheets_so_far:
            # First time seeing this uuid, OR it's from the same sheet file
            # (legitimate multi-instance expansion) — keep it.
            sheets_so_far.add(sf)
            unique.append(r)
        else:
            # Same uuid, but from a DIFFERENT sheet file → UUID collision.
            # Not appended to `unique`: the duplicate is dropped.
            uuid_collisions.setdefault(u, []).append(r)

    if uuid_collisions:
        print(f"\nNote: {len(uuid_collisions)} UUID collision(s) detected "
              f"(same symbol UUID in multiple sheet files — likely copy-paste artifacts).")
        print("  Only the first occurrence is kept in the output.")
        for u, recs in list(uuid_collisions.items())[:10]:
            refs  = [r.get('reference') for r in recs]
            files = [r.get('sheet_file') for r in recs]
            print(f"  uuid={str(u)[:8]}...  refs={refs}  sheets={files}")

    print(f"\nUnique instances after dedup: {len(unique)}")

    # Separate power symbols from real parts
    real, power = [], []
    for r in unique:
        is_power = (r.get('lib_id') or '').startswith('power:')
        (power if is_power else real).append(r)
    print(f"  Non-power parts : {len(real)}")
    print(f"  Power symbols   : {len(power)}")

    # Check for true reference duplicates (same ref, different uuid = multi-unit)
    by_ref: dict[str, list] = defaultdict(list)
    for r in unique:
        # The key exists with value None when no reference was found, so
        # normalise with `or` rather than a .get() default.
        by_ref[r.get('reference') or ''].append(r)

    multi_unit = {ref: recs for ref, recs in by_ref.items()
                  if len(recs) > 1 and len({r.get('uuid') for r in recs}) > 1}
    if multi_unit:
        # References starting with '#' (e.g. power symbols) are not reported.
        refs = [ref for ref in multi_unit if not ref.startswith('#')]
        if refs:
            print(f"\nMulti-unit components ({len(refs)} references, expected for split-unit symbols):")
            for ref in sorted(refs):
                units = [r['instance_unit'] for r in multi_unit[ref]]
                print(f"  {ref}: units {units}")

    output = {
        "project_dir": str(project_dir),
        "root_uuid": root_uuid,
        "schematic_files": display_names,
        "total_instances": len(unique),
        "non_power_count": len(real),
        "symbols": unique,
    }

    out_path = project_dir / 'extract_symbols.json'
    out_path.write_text(json.dumps(output, indent=2, ensure_ascii=False), encoding='utf-8')
    print(f"\nOutput written to: {out_path}")

    # Print a summary table.  Any field may be None (missing property or
    # unit), and None does not accept a width format spec, so every field
    # goes through _cell() first.
    print("\n--- Summary (non-power parts, sorted by reference) ---")
    for r in sorted(real, key=lambda x: x.get('reference') or ''):
        ref   = _cell(r.get('reference'))
        value = _cell(r.get('value'))
        lib   = _cell(r.get('lib_id'))
        mpn   = _cell(r['properties'].get('MPN'))
        sheet = _cell(r.get('sheet_file'))
        unit  = _cell(r.get('instance_unit'))
        print(f"  {ref:<12}  u{unit:<2}  {value:<30}  {lib:<40}  MPN={mpn:<25}  [{sheet}]")


if __name__ == '__main__':
    # An explicit command-line argument wins; otherwise use the directory
    # that contains this script.
    project_dir = (
        Path(sys.argv[1]) if len(sys.argv) > 1 else Path(__file__).parent
    ).resolve()

    if project_dir.is_dir():
        main(project_dir)
    else:
        print(f"Error: {project_dir} is not a directory", file=sys.stderr)
        sys.exit(1)