initial commit

2026-02-22 08:16:48 -06:00
commit 3f0aff923d
15 changed files with 4364 additions and 0 deletions
--- a/extract_symbols.py
+++ b/extract_symbols.py
@@ -0,0 +1,517 @@
+#!/usr/bin/env python3
+"""
+KiCad 9 Symbol Metadata Extractor
+==================================
+Walks every .kicad_sch file in the project directory and extracts
+metadata for every placed symbol (component instance), correctly
+expanding hierarchical sheet instances so that each unique reference
+in the final design becomes its own record.
+
+KiCad stores multi-instance sheets by embedding an `(instances ...)`
+block in each symbol.  That block contains one `(path ...)` entry per
+sheet instantiation, each with the authoritative reference for that
+copy.  This script reads those paths so a sheet used N times produces
+N distinct records per symbol.
+
+Output: extract_symbols.json  (written to the project directory)
+
+Usage:
+    python3 extract_symbols.py [project_dir]
+
+If project_dir is omitted, the directory containing this script is used.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# S-expression parser
+# ---------------------------------------------------------------------------
+
+def _tokenize(text: str) -> list:
+    """
+    Convert raw KiCad S-expression text into a flat list of tokens.
+
+    Token forms:
+        ('OPEN',)          – opening paren
+        ('CLOSE',)         – closing paren
+        ('ATOM', value)    – unquoted word / number / bool
+        ('STR',  value)    – double-quoted string (escapes resolved)
+
+    Inside a quoted string a backslash escapes the character that follows
+    it.  Backslash + n, r or t is decoded to the matching control character
+    (newline, carriage return, tab); any other escaped character, such as
+    a double quote or another backslash, stands for itself.  A string that
+    is never closed simply runs to the end of the input.
+    """
+    # Escapes that denote a control character rather than the literal letter.
+    control_escapes = {'n': '\n', 'r': '\r', 't': '\t'}
+    whitespace = ' \t\r\n'
+    atom_terminators = ' \t\r\n()'
+
+    tokens = []
+    i, n = 0, len(text)
+    while i < n:
+        c = text[i]
+        if c in whitespace:
+            i += 1
+        elif c == '(':
+            tokens.append(('OPEN',))
+            i += 1
+        elif c == ')':
+            tokens.append(('CLOSE',))
+            i += 1
+        elif c == '"':
+            j = i + 1
+            buf = []
+            while j < n:
+                ch = text[j]
+                if ch == '\\' and j + 1 < n:
+                    escaped = text[j + 1]
+                    buf.append(control_escapes.get(escaped, escaped))
+                    j += 2
+                elif ch == '"':
+                    j += 1          # consume the closing quote
+                    break
+                else:
+                    # Includes a lone backslash that is the very last character.
+                    buf.append(ch)
+                    j += 1
+            tokens.append(('STR', ''.join(buf)))
+            i = j
+        else:
+            # Bare atom: runs until whitespace or a parenthesis.
+            j = i
+            while j < n and text[j] not in atom_terminators:
+                j += 1
+            tokens.append(('ATOM', text[i:j]))
+            i = j
+    return tokens
+
+
+def _parse(tokens: list, pos: int) -> tuple:
+    """
+    Recursively parse one S-expression value starting at *pos*.
+
+    Returns (parsed_value, next_pos).
+    A list/node becomes a Python list; atoms and strings become strings.
+
+    Raises ValueError when the token stream ends before the value is
+    complete (empty input or an unclosed parenthesis) or when *pos* points
+    at a token that cannot start a value (a stray closing paren).
+    """
+    total = len(tokens)
+    if pos >= total:
+        raise ValueError(f"Unexpected end of input at pos {pos}")
+
+    tok = tokens[pos]
+    kind = tok[0]
+    if kind == 'OPEN':
+        pos += 1
+        items = []
+        while True:
+            if pos >= total:
+                # Ran off the end while still inside a list: the input is
+                # truncated or has more '(' than ')'.
+                raise ValueError(
+                    f"Unexpected end of input at pos {pos}: unbalanced '('")
+            if tokens[pos][0] == 'CLOSE':
+                return items, pos + 1          # consume CLOSE
+            item, pos = _parse(tokens, pos)
+            items.append(item)
+    elif kind in ('ATOM', 'STR'):
+        return tok[1], pos + 1
+    else:
+        raise ValueError(f"Unexpected token at pos {pos}: {tok}")
+
+
+def parse_sexp(text: str):
+    """
+    Tokenize *text* and parse the first top-level S-expression in it.
+
+    Returns the parsed root value (a nested Python list for a normal KiCad
+    file).  Any tokens that follow the first complete value are ignored.
+    """
+    parsed_root, _next_pos = _parse(_tokenize(text), 0)
+    return parsed_root
+
+
+# ---------------------------------------------------------------------------
+# Helpers to navigate parsed S-expressions
+# ---------------------------------------------------------------------------
+
+def tag(node) -> str:
+    """
+    Return the leading keyword of a parsed node.
+
+    The result is the first element of *node* when *node* is a non-empty
+    list whose first element is a string; otherwise the empty string.
+    """
+    if not isinstance(node, list) or not node:
+        return ''
+    head = node[0]
+    return head if isinstance(head, str) else ''
+
+
+def children(node: list) -> list:
+    """Return every element of *node* after the first; [] if *node* is not a list."""
+    if not isinstance(node, list):
+        return []
+    return node[1:]
+
+
+def first_child_with_tag(node: list, name: str):
+    """
+    Return the first list-valued child of *node* whose tag equals *name*,
+    or None when no such child exists.
+    """
+    matching = (
+        candidate
+        for candidate in children(node)
+        if isinstance(candidate, list) and tag(candidate) == name
+    )
+    return next(matching, None)
+
+
+def all_children_with_tag(node: list, name: str) -> list:
+    """
+    Return, in document order, every list-valued child of *node* whose tag
+    equals *name*.  The result is empty when nothing matches.
+    """
+    matches = []
+    for candidate in children(node):
+        if not isinstance(candidate, list):
+            continue
+        if tag(candidate) == name:
+            matches.append(candidate)
+    return matches
+
+
+def scalar(node, index: int = 1, default=None):
+    """
+    Return ``node[index]`` when *node* is a list longer than *index*;
+    otherwise return *default*.
+    """
+    if not isinstance(node, list) or len(node) <= index:
+        return default
+    return node[index]
+
+
+# ---------------------------------------------------------------------------
+# Instance path extraction
+# ---------------------------------------------------------------------------
+
+def extract_instances(sym_node: list) -> list[dict]:
+    """
+    Flatten the (instances ...) block of a symbol into one dict per
+    (project, path) pair, in document order.
+
+    Keys of each dict:
+        path      – second element of the (path ...) node, '' if absent
+        reference – value of the path's (reference ...) child, or None
+        unit      – value of the path's (unit ...) child, or None
+        project   – second element of the enclosing (project ...) node,
+                    '' if absent
+
+    Returns an empty list when the symbol has no instances block.
+    """
+    block = first_child_with_tag(sym_node, 'instances')
+    if block is None:
+        return []
+
+    def _field(path_entry, field_name):
+        # Value of a named child of a (path ...) node, None when missing.
+        found = first_child_with_tag(path_entry, field_name)
+        return scalar(found, 1) if found else None
+
+    return [
+        {
+            'path': scalar(entry, 1, ''),
+            'reference': _field(entry, 'reference'),
+            'unit': _field(entry, 'unit'),
+            'project': scalar(proj, 1, ''),
+        }
+        for proj in all_children_with_tag(block, 'project')
+        for entry in all_children_with_tag(proj, 'path')
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Symbol extraction
+# ---------------------------------------------------------------------------
+
+def extract_symbol_records(sym_node: list, sheet_file: str) -> list[dict]:
+    """
+    Extract metadata from a placed-symbol node and return one record per
+    hierarchical instance (i.e. one record per path in the instances block).
+
+    For a sheet used only once, this produces a single record.
+    For a sheet instantiated N times, this produces N records — each with
+    its own unique reference designator from the instances block.
+
+    Every record carries the fields shared by all instances (sheet_file,
+    lib_id, at, the yes/no flags as booleans, uuid, the full properties
+    dict and the promoted value / footprint / datasheet / description)
+    plus reference, instance_path, instance_unit and instance_project.
+
+    When the symbol has no instances block a single fallback record is
+    returned: its reference comes from the Reference property, its
+    instance_unit from the symbol's own (unit ...) child, and its
+    instance_path / instance_project are None.
+
+    All values are kept as the strings produced by the parser; only the
+    yes/no flags are converted (to bool).
+    """
+    # Tags whose single argument is 'yes' or 'no'.
+    flag_tags = ('exclude_from_sim', 'in_bom', 'on_board', 'dnp')
+
+    # --- Shared fields (same for all instances of this symbol placement) ---
+    shared = {
+        'sheet_file': sheet_file,
+        'lib_id': None,
+        'at': None,
+        'exclude_from_sim': None,
+        'in_bom': None,
+        'on_board': None,
+        'dnp': None,
+        'uuid': None,
+        'properties': {},
+    }
+    # Unit stored directly on the symbol node.  It is kept out of `shared`
+    # so the set of output keys stays unchanged; it is only needed for the
+    # no-instances fallback below.
+    placed_unit = None
+
+    for child in children(sym_node):
+        if not isinstance(child, list):
+            continue
+        t = tag(child)
+        if t == 'lib_id':
+            shared['lib_id'] = scalar(child, 1)
+        elif t == 'at':
+            shared['at'] = {
+                'x': scalar(child, 1),
+                'y': scalar(child, 2),
+                'angle': scalar(child, 3, 0),
+            }
+        elif t in flag_tags:
+            shared[t] = scalar(child, 1) == 'yes'
+        elif t == 'unit':
+            placed_unit = scalar(child, 1)
+        elif t == 'uuid':
+            shared['uuid'] = scalar(child, 1)
+        elif t == 'property':
+            prop_name = scalar(child, 1)
+            prop_val = scalar(child, 2)
+            if prop_name is not None:
+                shared['properties'][prop_name] = prop_val
+
+    # Promote standard properties for convenient access
+    props = shared['properties']
+    shared['value']       = props.get('Value')
+    shared['footprint']   = props.get('Footprint')
+    shared['datasheet']   = props.get('Datasheet')
+    shared['description'] = props.get('Description')
+
+    # --- Per-instance fields (one record per path in instances block) ---
+    instances = extract_instances(sym_node)
+
+    if not instances:
+        # Fallback: no instances block — use top-level Reference property
+        # and the unit recorded on the symbol itself.
+        record = dict(shared)
+        record['reference']        = props.get('Reference')
+        record['instance_path']    = None
+        record['instance_unit']    = placed_unit
+        record['instance_project'] = None
+        return [record]
+
+    records = []
+    for inst in instances:
+        record = dict(shared)
+        record['properties']       = dict(props)  # copy so each is independent
+        record['reference']        = inst['reference']
+        record['instance_path']    = inst['path']
+        record['instance_unit']    = inst['unit']
+        record['instance_project'] = inst['project']
+        records.append(record)
+
+    return records
+
+
+# ---------------------------------------------------------------------------
+# Hierarchy walker
+# ---------------------------------------------------------------------------
+
+def find_reachable_sheets(root_sch: Path) -> list[Path]:
+    """
+    Walk the sheet hierarchy breadth-first starting from *root_sch* and
+    return an ordered list of every .kicad_sch file that is actually
+    reachable (i.e. referenced directly or transitively as a sub-sheet).
+
+    A file used by several (sheet ...) nodes is visited only once.  Files
+    are identified by their resolved path, so two different sheets that
+    happen to share a file name in different directories are both visited,
+    while one file reached through different relative spellings is not
+    visited twice.
+
+    Sub-sheet file names are taken from each sheet's "Sheetfile" property
+    and resolved relative to the directory of the referencing schematic.
+    Referenced files that do not exist are skipped; a file that cannot be
+    read is still listed but its children are not explored.
+
+    The returned paths are the paths as constructed during the walk (not
+    the resolved form), with *root_sch* first.
+    """
+    from collections import deque
+
+    reachable: list[Path] = []
+    visited: set[Path] = set()              # resolved paths already dequeued
+    queue: deque = deque([root_sch])
+
+    while queue:
+        sch = queue.popleft()
+        identity = sch.resolve()
+        if identity in visited:
+            continue
+        visited.add(identity)
+        reachable.append(sch)
+
+        try:
+            text = sch.read_text(encoding='utf-8')
+        except OSError:
+            continue
+
+        root_node = parse_sexp(text)
+        for child in children(root_node):
+            if tag(child) != 'sheet':
+                continue
+            for prop in all_children_with_tag(child, 'property'):
+                if scalar(prop, 1) != 'Sheetfile':
+                    continue
+                child_filename = scalar(prop, 2)
+                if not child_filename:
+                    continue
+                child_path = sch.parent / child_filename
+                if child_path.exists() and child_path.resolve() not in visited:
+                    queue.append(child_path)
+
+    return reachable
+
+
+# ---------------------------------------------------------------------------
+# Per-file parsing
+# ---------------------------------------------------------------------------
+
+def extract_from_schematic(sch_path: Path) -> list[dict]:
+    """
+    Read and parse one .kicad_sch file and collect the records of every
+    placed symbol found directly under the root node.
+
+    A top-level node counts as a placed symbol when its tag is 'symbol'
+    and it has a (lib_id ...) child.  The (lib_symbols ...) section has a
+    different tag and therefore never matches, so library definitions are
+    left out.  Each record's sheet_file is the file's base name.
+    """
+    root = parse_sexp(sch_path.read_text(encoding='utf-8'))
+
+    collected = []
+    for node in children(root):
+        is_placed_symbol = (
+            isinstance(node, list)
+            and tag(node) == 'symbol'
+            and first_child_with_tag(node, 'lib_id') is not None
+        )
+        if is_placed_symbol:
+            collected += extract_symbol_records(node, sch_path.name)
+
+    return collected
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def get_root_uuid(project_dir: Path) -> str | None:
+    """
+    Find the UUID of the root schematic of the project in *project_dir*.
+
+    The root schematic is taken to be the .kicad_sch file that shares its
+    stem with the first *.kicad_pro file found in *project_dir*.  The
+    project file's contents are not consulted; only its name is used.
+
+    Returns the text of the first quoted ``(uuid "...")`` occurrence in
+    the root schematic, or None when there is no project file, no matching
+    schematic, or no such uuid in it.
+    """
+    import re
+
+    pro_files = list(project_dir.glob('*.kicad_pro'))
+    if not pro_files:
+        return None
+
+    root_sch = project_dir / (pro_files[0].stem + '.kicad_sch')
+    if not root_sch.exists():
+        return None
+
+    # Plain text search rather than a full parse: the first (uuid ...) in
+    # the file is used as the root sheet's own uuid.
+    text = root_sch.read_text(encoding='utf-8')
+    m = re.search(r'\(uuid\s+"([^"]+)"', text)
+    return m.group(1) if m else None
+
+
+def main(project_dir: Path):
+    """
+    Extract every symbol instance of the project in *project_dir*, print a
+    report to stdout and write the result to
+    ``project_dir / 'extract_symbols.json'``.
+
+    Steps:
+      1. Locate the root schematic (same stem as the first *.kicad_pro) and
+         walk its sheet hierarchy; if no root is found, fall back to every
+         *.kicad_sch under *project_dir*.
+      2. Parse each schematic into per-instance records.
+      3. If the root UUID is known, keep only records whose instance path
+         is the root sheet itself (``/<root_uuid>``) or lies below it
+         (``/<root_uuid>/...``); everything else is counted as stale.
+      4. De-duplicate by (instance_path, uuid), then drop records whose
+         uuid was already seen in a different sheet file.
+      5. Report power / non-power counts and multi-unit references, write
+         the JSON file and print a summary table.
+
+    Exits with status 1 when no schematic files are found.
+    """
+    from collections import defaultdict
+
+    # Determine root schematic and walk the real hierarchy
+    root_uuid = get_root_uuid(project_dir)
+
+    pro_files = list(project_dir.glob('*.kicad_pro'))
+    root_sch = project_dir / (pro_files[0].stem + '.kicad_sch') if pro_files else None
+
+    if root_sch and root_sch.exists():
+        sch_files = find_reachable_sheets(root_sch)
+        print(f"Root sheet: {root_sch.name}")
+        print(f"Found {len(sch_files)} reachable schematic file(s) in hierarchy:")
+    else:
+        # Fallback: glob everything
+        sch_files = sorted(
+            p for p in project_dir.rglob('*.kicad_sch')
+            if not p.name.startswith('_autosave')
+            and not p.suffix.endswith('.bak')
+        )
+        print(f"Warning: could not find root schematic; scanning all {len(sch_files)} files.\n")
+
+    if not sch_files:
+        print(f"No .kicad_sch files found in {project_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    for f in sch_files:
+        print(f"  {f.relative_to(project_dir)}")
+
+    all_records: list[dict] = []
+
+    for sch_path in sch_files:
+        print(f"\nParsing {sch_path.name} ...", end=' ', flush=True)
+        records = extract_from_schematic(sch_path)
+        print(f"{len(records)} instance record(s)")
+        all_records.extend(records)
+
+    # All records come from reachable sheets, so no orphan filtering needed.
+    # Optionally still filter by root UUID to catch stale instance paths.
+    if root_uuid:
+        # A symbol placed on the root sheet has the path "/<root_uuid>"
+        # with no trailing slash; symbols in sub-sheets have
+        # "/<root_uuid>/<sheet_uuid>/...".  Both forms are active.
+        root_path = f'/{root_uuid}'
+        sub_prefix = root_path + '/'
+
+        def _is_active(record: dict) -> bool:
+            path = record.get('instance_path') or ''
+            return path == root_path or path.startswith(sub_prefix)
+
+        active   = [r for r in all_records if _is_active(r)]
+        stale    = len(all_records) - len(active)
+        print(f"\nTotal records : {len(all_records)}")
+        if stale:
+            print(f"Stale paths dropped: {stale}")
+    else:
+        active = all_records
+        print(f"\nTotal records: {len(all_records)}")
+
+    # ---- Stage 1: dedup by (instance_path, uuid) ----
+    # Collapses records that were seen from multiple sheet scans into one.
+    seen: set = set()
+    stage1: list[dict] = []
+    for r in active:
+        key = (r.get('instance_path'), r.get('uuid'))
+        if key not in seen:
+            seen.add(key)
+            stage1.append(r)
+
+    # ---- Stage 2: dedup by uuid across different sheet files ----
+    # If the SAME uuid appears in two *different* .kicad_sch files, that is a
+    # UUID collision in the design (copy-paste without UUID regeneration).
+    # The same uuid appearing in the same sheet file with different instance
+    # paths is *correct* — it is how multi-instance sheets work, so those are
+    # left alone.
+    uuid_sheets: dict = {}         # uuid -> set of sheet_files seen
+    uuid_collisions: dict = {}     # uuid -> list of colliding records
+    unique: list[dict] = []
+    for r in stage1:
+        u = r.get('uuid')
+        sf = r.get('sheet_file', '')
+        sheets_so_far = uuid_sheets.setdefault(u, set())
+        if not sheets_so_far or sf in sheets_so_far:
+            # First time seeing this uuid, OR it's from the same sheet file
+            # (legitimate multi-instance expansion) — keep it.
+            sheets_so_far.add(sf)
+            unique.append(r)
+        else:
+            # Same uuid, but from a DIFFERENT sheet file → UUID collision.
+            uuid_collisions.setdefault(u, []).append(r)
+            # Don't append to unique — drop the duplicate.
+
+    if uuid_collisions:
+        print(f"\nNote: {len(uuid_collisions)} UUID collision(s) detected "
+              f"(same symbol UUID in multiple sheet files — likely copy-paste artifacts).")
+        print("  Only the first occurrence is kept in the output.")
+        for u, recs in list(uuid_collisions.items())[:10]:
+            refs  = [r.get('reference') for r in recs]
+            files = [r.get('sheet_file') for r in recs]
+            # A symbol without a (uuid ...) child has uuid None.
+            print(f"  uuid={(u or '')[:8]}...  refs={refs}  sheets={files}")
+
+    print(f"\nUnique instances after dedup: {len(unique)}")
+
+    # Separate power symbols from real parts
+    real  = [r for r in unique if not (r.get('lib_id') or '').startswith('power:')]
+    power = [r for r in unique if     (r.get('lib_id') or '').startswith('power:')]
+    print(f"  Non-power parts : {len(real)}")
+    print(f"  Power symbols   : {len(power)}")
+
+    # Check for true reference duplicates (same ref, different uuid = multi-unit)
+    by_ref: dict[str, list] = defaultdict(list)
+    for r in unique:
+        # 'reference' is always present but may be None; group those under ''.
+        by_ref[r.get('reference') or ''].append(r)
+
+    multi_unit = {ref: recs for ref, recs in by_ref.items()
+                  if len(recs) > 1 and len({r['uuid'] for r in recs}) > 1}
+    if multi_unit:
+        refs = [r for r in multi_unit if not r.startswith('#')]
+        if refs:
+            print(f"\nMulti-unit components ({len(refs)} references, expected for split-unit symbols):")
+            for ref in sorted(refs):
+                units = [r['instance_unit'] for r in multi_unit[ref]]
+                print(f"  {ref}: units {units}")
+
+    output = {
+        "project_dir": str(project_dir),
+        "root_uuid": root_uuid,
+        "schematic_files": [str(f.relative_to(project_dir)) for f in sch_files],
+        "total_instances": len(unique),
+        "non_power_count": len(real),
+        "symbols": unique,
+    }
+
+    out_path = project_dir / 'extract_symbols.json'
+    out_path.write_text(json.dumps(output, indent=2, ensure_ascii=False), encoding='utf-8')
+    print(f"\nOutput written to: {out_path}")
+
+    # Print a summary table
+    print("\n--- Summary (non-power parts, sorted by reference) ---")
+    for r in sorted(real, key=lambda x: x.get('reference') or ''):
+        # These keys exist on every record but may hold None, and None does
+        # not accept a width format spec — substitute '' before formatting.
+        ref   = r.get('reference') or ''
+        value = r.get('value') or ''
+        lib   = r.get('lib_id') or ''
+        mpn   = r['properties'].get('MPN') or ''
+        sheet = r.get('sheet_file') or ''
+        unit  = r.get('instance_unit') or ''
+        print(f"  {ref:<12}  u{unit:<2}  {value:<30}  {lib:<40}  MPN={mpn:<25}  [{sheet}]")
+
+
+if __name__ == '__main__':
+    # Project directory: first CLI argument if given, else this script's folder.
+    project_dir = (
+        Path(sys.argv[1]) if len(sys.argv) > 1 else Path(__file__).parent
+    ).resolve()
+
+    # Guard clause: refuse anything that is not an existing directory.
+    if not project_dir.is_dir():
+        print(f"Error: {project_dir} is not a directory", file=sys.stderr)
+        sys.exit(1)
+
+    main(project_dir)