HumanBean17 · HumanBean17 · May 5, 2026 · May 5, 2026 · May 5, 2026
@@ -142,6 +142,7 @@ The DB is dropped and rebuilt from scratch on each run (Phase 1 is a full rebuil
 | `graph_neighbors` | Generic BFS over `EXTENDS|IMPLEMENTS|INJECTS|DECLARES|CALLS`, directional. |
 | `impact_analysis` | Reverse closure: what breaks if this changes. |
 | `analyze_pr` | Map a unified diff (`diff_unified`) to overlapping indexed symbols, sum type-level `impact_analysis` blast, count cross-microservice `CALLS`, list touched `Route` ids (`EXPOSES`), and return a v1 `risk_score` / `risk_band` plus `notes` (binary hunks and renames are skipped for symbol mapping). |
+| `diagnose_ignore` | Explain whether a path is excluded from indexing / graph walks and which rule layer won (`builtin_default`, `project_root`, `nested`, `gitignore`). |
 | `graph_meta` | Counts, ontology version, build timestamp, parse errors; route totals / `routes_by_framework` / `routes_resolved_pct` (v5+); `routes_from_brownfield_pct` / `routes_by_layer` (v6+). |
 | `list_routes` | Filterable listing of `Route` nodes (`microservice`, `framework`, `path_prefix`, `method`). |
 | `find_route_handlers` | Symbols that `EXPOSES` a route id (confidence + resolution strategy on the edge). |
@@ -346,8 +347,8 @@ then `route_overrides.fqn`. Rebuild Lance + Kuzu (`refresh_code_index` or
 
 **Kuzu vs Lance (Layer A consistency):** both the Kuzu graph writer and Lance
 chunk enrichment call **one** function, `graph_enrich.collect_annotation_meta_chain`,
-which scans the project with sorted `*.java` paths, the same exclude rules as
-`build_ast_graph` / `iter_java_source_files`, parse-error warnings on stderr, and
+which scans the project with sorted `*.java` paths, the same layered ignore rules as
+`build_ast_graph` / `path_filtering.iter_java_source_files`, parse-error warnings on stderr, and
 deterministic “first wins” for duplicate annotation simple names. Kuzu and Lance
 **should** agree; they can still diverge if the same file is handled differently
 elsewhere in the pipeline (e.g. parse edge cases). If graph tools and
@@ -396,6 +397,40 @@ Combined, these pull `processClientMessage` / `pickEligibleOperator` /
 only enqueue or configure. Like role weights, the bonus is **skipped when the
 caller locks `role=`**.
 
+### Ignore patterns
+
+Java file discovery for the Kuzu graph, annotation meta-chain collection, and
+the CocoIndex Lance pipeline share the same layered ignore model
+(`path_filtering.LayeredIgnore`):
+
+1. **Builtin default** — the historical hardcoded list (build dirs, `*.class`,
+   `src/test/java`, dot-directories, …).
+2. **Project root** — optional `<project>/.lancedb-mcp/ignore` (gitignore syntax,
+   including negation with `!`).
+3. **Nested** — any `<subdir>/.lancedb-mcp/ignore` on the path from the project
+   root to the file; ignore files closer to the file override those farther up.
+4. **Git** — every `.gitignore` from the project root down to the file’s
+   directory, merged in order, using `pathspec.GitIgnoreSpec` (same semantics as
+   git). Disable with `LayeredIgnore(..., use_gitignore=False)` (used where the
+   legacy walker did not consult git).
+
+If no `.lancedb-mcp/ignore` exists anywhere under the project, behaviour matches
+the pre-B5 builtin list alone (plus git when enabled). When a negation rule
+could un-ignore paths under directories the CocoIndex walk used to prune
+globally, the walk switches to a permissive exclude list and each candidate
+path is filtered again with the full layered rules.
+
+Use the `diagnose_ignore` MCP tool (or `LayeredIgnore.diagnose_dict`) to see
+which file and line decided for a given path.
+
+**Monorepo note:** negation detection runs two full-tree `rglob` passes when
+constructing a `LayeredIgnore` (one for `.lancedb-mcp/ignore` files, one for
+`.gitignore` files). That is usually cheap to amortise, but on extremely large
+trees expect that fixed cost for every new instance.
+
+**Dependencies:** `pathspec` is pinned in `requirements.txt` (loose bundle install)
+and constrained the same way in `pyproject.toml` (wheel metadata).
+
 ### Debugging empty `context_before` / `context_after`
 
 If `context_neighbors=1` returns empty context strings, set

@@ -59,11 +59,7 @@
     resolve_routes_for_method,
     symbol_id,
 )
-from java_index_v1_common import (
-    COMMON_EXCLUDED_PATH_PATTERNS,
-    compile_excluded_glob_patterns,
-    iter_java_source_files,
-)
+from path_filtering import LayeredIgnore, iter_java_source_files
 
 log = logging.getLogger(__name__)
 
@@ -212,7 +208,7 @@ class GraphTables:
     skipped_files: int = 0
 
 
-# ---------- file walk (see `java_index_v1_common.iter_java_source_files`) ----------
+# ---------- file walk (see `path_filtering.iter_java_source_files`) ----------
 
 
 # ---------- pass 1 ----------
@@ -275,10 +271,10 @@ def _register_type(
 def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str, JavaFileAst]:
     """Walk files, parse them, populate node indexes. Returns path -> AST."""
     asts: dict[str, JavaFileAst] = {}
-    excludes = compile_excluded_glob_patterns(COMMON_EXCLUDED_PATH_PATTERNS)
+    ignore = LayeredIgnore(root)
     t0 = time.time()
     n_files = 0
-    for p in iter_java_source_files(root, excludes):
+    for p in iter_java_source_files(root, ignore=ignore):
         n_files += 1
         try:
             content = p.read_bytes()

@@ -46,11 +46,7 @@
     VALID_ROUTE_FRAMEWORKS,
     VALID_ROUTE_KINDS,
 )
-from java_index_v1_common import (
-    COMMON_EXCLUDED_PATH_PATTERNS,
-    compile_excluded_glob_patterns,
-    iter_java_source_files,
-)
+from path_filtering import LayeredIgnore, iter_java_source_files
 
 __all__ = [
     "AnnotationDecl",
@@ -242,9 +238,9 @@ def _collect_annotation_decl_index(project_root_str: str) -> dict[str, Annotatio
     root = Path(project_root_str)
     if not root.is_dir():
         return {}
-    excludes = compile_excluded_glob_patterns(COMMON_EXCLUDED_PATH_PATTERNS)
+    ignore = LayeredIgnore(root)
     decls: dict[str, AnnotationDecl] = {}
-    for p in sorted(iter_java_source_files(root, excludes), key=str):
+    for p in sorted(iter_java_source_files(root, ignore=ignore), key=str):
         try:
             content = p.read_bytes()
         except OSError as exc:

@@ -37,14 +37,14 @@
 from cocoindex.resources.file import PatternFilePathMatcher
 
 from java_index_v1_common import (
-    COMMON_EXCLUDED_PATH_PATTERNS,
     JAVA_CHUNK,
     SBERT_MODEL,
     SQL_CHUNK,
     YAML_CHUNK,
     chunk_key_range,
     position_to_json,
 )
+from path_filtering import LayeredIgnore
 from ast_java import ONTOLOGY_VERSION, parse_java
 from graph_enrich import enrich_chunk
 
@@ -157,6 +157,8 @@ async def process_java_file(
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
     project_root = coco.use_context(PROJECT_ROOT)
+    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
+        return
     try:
         content = await file.read_text()
     except UnicodeDecodeError:
@@ -218,6 +220,9 @@ async def process_sql_file(
     table: lancedb.TableTarget[SqlLanceChunk],
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
+    project_root = coco.use_context(PROJECT_ROOT)
+    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
+        return
     try:
         content = await file.read_text()
     except UnicodeDecodeError:
@@ -259,6 +264,9 @@ async def process_yaml_file(
     table: lancedb.TableTarget[YamlLanceChunk],
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
+    project_root = coco.use_context(PROJECT_ROOT)
+    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
+        return
     try:
         content = await file.read_text()
     except UnicodeDecodeError:
@@ -327,20 +335,23 @@ async def app_main() -> None:
         yaml_schema,
     )
 
+    project_root = coco.use_context(PROJECT_ROOT)
+    _ignore = LayeredIgnore(project_root)
+    _walk_excludes = _ignore.cocoindex_excluded_patterns()
     java_files = localfs.walk_dir(
         PROJECT_ROOT,
         recursive=True,
         path_matcher=PatternFilePathMatcher(
             included_patterns=["**/*.java"],
-            excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
+            excluded_patterns=_walk_excludes,
         ),
     )
     sql_files = localfs.walk_dir(
         PROJECT_ROOT,
         recursive=True,
         path_matcher=PatternFilePathMatcher(
             included_patterns=["**/src/main/resources/db/migration/*.sql"],
-            excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
+            excluded_patterns=_walk_excludes,
         ),
     )
     yaml_files = localfs.walk_dir(
@@ -351,7 +362,7 @@ async def app_main() -> None:
                 "**/src/main/resources/application*.yml",
                 "**/src/main/resources/application*.yaml",
             ],
-            excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
+            excluded_patterns=_walk_excludes,
         ),
     )
 

@@ -2,9 +2,7 @@
 
 from __future__ import annotations
 
-import fnmatch
 import os
-from collections.abc import Iterable, Iterator
 from pathlib import Path
 from typing import Any
 
@@ -14,23 +12,6 @@
 _DEFAULT_HUB = "sentence-transformers/all-MiniLM-L6-v2"
 SBERT_MODEL = os.path.expandvars(os.path.expanduser(os.environ.get("SBERT_MODEL", _DEFAULT_HUB)))
 
-# Pruning for LocalFile sources: skip VCS, build outputs, dependency trees, and
-# test sources (we currently index prod Java only to keep the semantic index clean).
-# Also avoids EMFILE under default ulimits when the engine traverses in parallel.
-COMMON_EXCLUDED_PATH_PATTERNS: list[str] = [
-    "**/.*",
-    "**/.git/**",
-    "**/.idea/**",
-    "**/.venv/**",
-    "**/node_modules/**",
-    "**/target/**",
-    "**/build/**",
-    "**/out/**",
-    "**/*.class",
-    "**/src/test/java/**",
-    "**/src/test/resources/**",
-]
-
 # Larger window + overlap so chunks carry more behavioural context (method bodies
 # rarely split mid-statement, fewer "orphan" import-only hits at chunk edges).
 # Requires re-index to apply.
@@ -51,56 +32,3 @@ def position_to_json(pos: TextPosition) -> dict[str, Any]:
 def chunk_key_range(chunk: Chunk) -> tuple[int, int]:
     """Byte range for stable primary keys (start inclusive, end exclusive)."""
     return chunk.start.byte_offset, chunk.end.byte_offset
-
-
-# ---------- shared Java source tree walk (graph index + meta-annotation pass) ----------
-
-def compile_excluded_glob_patterns(
-    patterns: Iterable[str] | tuple[str, ...],
-) -> list[str]:
-    """Store exclude patterns in list form; same as ast-graph `index` compile step."""
-    return list(patterns)
-
-
-def is_relative_path_excluded(
-    rel_posix: str, exclude_globs: list[str],
-) -> bool:
-    """True if a project-relative path matches an exclude glob (incl. `**/<path>`)."""
-    for pat in exclude_globs:
-        if fnmatch.fnmatch(rel_posix, pat):
-            return True
-        if fnmatch.fnmatch(f"**/{rel_posix}", pat):
-            return True
-    return False
-
-
-def iter_java_source_files(
-    root: Path, exclude_globs: list[str],
-) -> Iterator[Path]:
-    """Walk `root` for `*.java`, honouring the same prunes and globs as `build_ast_graph`."""
-    for dirpath, dirnames, filenames in os.walk(root):
-        dirnames[:] = [
-            d
-            for d in dirnames
-            if d
-            not in (
-                ".git",
-                "target",
-                "build",
-                "out",
-                "node_modules",
-                ".venv",
-                ".idea",
-            )
-        ]
-        for fn in filenames:
-            if not fn.endswith(".java"):
-                continue
-            p = Path(dirpath) / fn
-            try:
-                rel = p.resolve().relative_to(root.resolve()).as_posix()
-            except ValueError:
-                rel = p.as_posix()
-            if is_relative_path_excluded(rel, exclude_globs):
-                continue
-            yield p