Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ The DB is dropped and rebuilt from scratch on each run (Phase 1 is a full rebuil
| `graph_neighbors` | Generic BFS over `EXTENDS\|IMPLEMENTS\|INJECTS\|DECLARES\|CALLS`, directional. |
| `impact_analysis` | Reverse closure: what breaks if this changes. |
| `analyze_pr` | Map a unified diff (`diff_unified`) to overlapping indexed symbols, sum type-level `impact_analysis` blast, count cross-microservice `CALLS`, list touched `Route` ids (`EXPOSES`), and return a v1 `risk_score` / `risk_band` plus `notes` (binary hunks and renames are skipped for symbol mapping). |
| `diagnose_ignore` | Explain whether a path is excluded for indexing / graph walks and which rule layer won (`builtin_default`, `project_root`, `nested`, `gitignore`). |
| `graph_meta` | Counts, ontology version, build timestamp, parse errors; route totals / `routes_by_framework` / `routes_resolved_pct` (v5+); `routes_from_brownfield_pct` / `routes_by_layer` (v6+). |
| `list_routes` | Filterable listing of `Route` nodes (`microservice`, `framework`, `path_prefix`, `method`). |
| `find_route_handlers` | Symbols that `EXPOSES` a route id (confidence + resolution strategy on the edge). |
Expand Down Expand Up @@ -346,8 +347,8 @@ then `route_overrides.fqn`. Rebuild Lance + Kuzu (`refresh_code_index` or

**Kuzu vs Lance (Layer A consistency):** both the Kuzu graph writer and Lance
chunk enrichment call **one** function, `graph_enrich.collect_annotation_meta_chain`,
which scans the project with sorted `*.java` paths, the same exclude rules as
`build_ast_graph` / `iter_java_source_files`, parse-error warnings on stderr, and
which scans the project with sorted `*.java` paths, the same layered ignore rules as
`build_ast_graph` / `path_filtering.iter_java_source_files`, parse-error warnings on stderr, and
deterministic “first wins” for duplicate annotation simple names. Kuzu and Lance
**should** agree; they can still diverge if the same file is handled differently
elsewhere in the pipeline (e.g. parse edge cases). If graph tools and
Expand Down Expand Up @@ -396,6 +397,40 @@ Combined, these pull `processClientMessage` / `pickEligibleOperator` /
only enqueue or configure. Like role weights, the bonus is **skipped when the
caller locks `role=`**.

### Ignore patterns

Java file discovery for the Kuzu graph, annotation meta-chain collection, and
the CocoIndex Lance pipeline share the same layered ignore model
(`path_filtering.LayeredIgnore`):

1. **Builtin default** — the historical hardcoded list (build dirs, `*.class`,
`src/test/java`, dot-directories, …).
2. **Project root** — optional `<project>/.lancedb-mcp/ignore` (gitignore syntax,
including negation with `!`).
3. **Nested** — any `<subdir>/.lancedb-mcp/ignore` on the path from the project
root to the target file; ignore files closer to the target override those farther up.
4. **Git** — every `.gitignore` from the project root down to the file’s
directory, merged in order, using `pathspec.GitIgnoreSpec` (same semantics as
git). Disable with `LayeredIgnore(..., use_gitignore=False)` (used where the
legacy walker did not consult git).

If no `.lancedb-mcp/ignore` exists anywhere under the project, behaviour matches
the pre-B5 builtin list alone (plus git when enabled). When a negation rule
could un-ignore paths under directories the CocoIndex walk used to prune
globally, the walk switches to a permissive exclude list and each candidate
path is filtered again with the full layered rules.

Use the `diagnose_ignore` MCP tool (or `LayeredIgnore.diagnose_dict`) to see
which file and line decided for a given path.

**Monorepo note:** constructing a `LayeredIgnore` runs two full-tree `rglob`
passes for negation detection (one over ignore files, one over `.gitignore`
files). That fixed cost is usually cheap to amortise, but on extremely large
trees expect to pay it every time a new instance is created.

**Dependencies:** `pathspec` is pinned in `requirements.txt` and constrained
the same way in `pyproject.toml` (loose bundle install vs. wheel metadata).

### Debugging empty `context_before` / `context_after`

If `context_neighbors=1` returns empty context strings, set
Expand Down
12 changes: 4 additions & 8 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,7 @@
resolve_routes_for_method,
symbol_id,
)
from java_index_v1_common import (
COMMON_EXCLUDED_PATH_PATTERNS,
compile_excluded_glob_patterns,
iter_java_source_files,
)
from path_filtering import LayeredIgnore, iter_java_source_files

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -212,7 +208,7 @@ class GraphTables:
skipped_files: int = 0


# ---------- file walk (see `java_index_v1_common.iter_java_source_files`) ----------
# ---------- file walk (see `path_filtering.iter_java_source_files`) ----------


# ---------- pass 1 ----------
Expand Down Expand Up @@ -275,10 +271,10 @@ def _register_type(
def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str, JavaFileAst]:
"""Walk files, parse them, populate node indexes. Returns path -> AST."""
asts: dict[str, JavaFileAst] = {}
excludes = compile_excluded_glob_patterns(COMMON_EXCLUDED_PATH_PATTERNS)
ignore = LayeredIgnore(root)
t0 = time.time()
n_files = 0
for p in iter_java_source_files(root, excludes):
for p in iter_java_source_files(root, ignore=ignore):
n_files += 1
try:
content = p.read_bytes()
Expand Down
10 changes: 3 additions & 7 deletions graph_enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,7 @@
VALID_ROUTE_FRAMEWORKS,
VALID_ROUTE_KINDS,
)
from java_index_v1_common import (
COMMON_EXCLUDED_PATH_PATTERNS,
compile_excluded_glob_patterns,
iter_java_source_files,
)
from path_filtering import LayeredIgnore, iter_java_source_files

__all__ = [
"AnnotationDecl",
Expand Down Expand Up @@ -242,9 +238,9 @@ def _collect_annotation_decl_index(project_root_str: str) -> dict[str, Annotatio
root = Path(project_root_str)
if not root.is_dir():
return {}
excludes = compile_excluded_glob_patterns(COMMON_EXCLUDED_PATH_PATTERNS)
ignore = LayeredIgnore(root)
decls: dict[str, AnnotationDecl] = {}
for p in sorted(iter_java_source_files(root, excludes), key=str):
for p in sorted(iter_java_source_files(root, ignore=ignore), key=str):
try:
content = p.read_bytes()
except OSError as exc:
Expand Down
19 changes: 15 additions & 4 deletions java_index_flow_lancedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@
from cocoindex.resources.file import PatternFilePathMatcher

from java_index_v1_common import (
COMMON_EXCLUDED_PATH_PATTERNS,
JAVA_CHUNK,
SBERT_MODEL,
SQL_CHUNK,
YAML_CHUNK,
chunk_key_range,
position_to_json,
)
from path_filtering import LayeredIgnore
from ast_java import ONTOLOGY_VERSION, parse_java
from graph_enrich import enrich_chunk

Expand Down Expand Up @@ -157,6 +157,8 @@ async def process_java_file(
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
return
try:
content = await file.read_text()
except UnicodeDecodeError:
Expand Down Expand Up @@ -218,6 +220,9 @@ async def process_sql_file(
table: lancedb.TableTarget[SqlLanceChunk],
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
return
try:
content = await file.read_text()
except UnicodeDecodeError:
Expand Down Expand Up @@ -259,6 +264,9 @@ async def process_yaml_file(
table: lancedb.TableTarget[YamlLanceChunk],
) -> None:
embedder = coco.use_context(EMBEDDER)
project_root = coco.use_context(PROJECT_ROOT)
if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]:
return
try:
content = await file.read_text()
except UnicodeDecodeError:
Expand Down Expand Up @@ -327,20 +335,23 @@ async def app_main() -> None:
yaml_schema,
)

project_root = coco.use_context(PROJECT_ROOT)
_ignore = LayeredIgnore(project_root)
_walk_excludes = _ignore.cocoindex_excluded_patterns()
java_files = localfs.walk_dir(
PROJECT_ROOT,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["**/*.java"],
excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
excluded_patterns=_walk_excludes,
),
)
sql_files = localfs.walk_dir(
PROJECT_ROOT,
recursive=True,
path_matcher=PatternFilePathMatcher(
included_patterns=["**/src/main/resources/db/migration/*.sql"],
excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
excluded_patterns=_walk_excludes,
),
)
yaml_files = localfs.walk_dir(
Expand All @@ -351,7 +362,7 @@ async def app_main() -> None:
"**/src/main/resources/application*.yml",
"**/src/main/resources/application*.yaml",
],
excluded_patterns=COMMON_EXCLUDED_PATH_PATTERNS,
excluded_patterns=_walk_excludes,
),
)

Expand Down
72 changes: 0 additions & 72 deletions java_index_v1_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

from __future__ import annotations

import fnmatch
import os
from collections.abc import Iterable, Iterator
from pathlib import Path
from typing import Any

Expand All @@ -14,23 +12,6 @@
_DEFAULT_HUB = "sentence-transformers/all-MiniLM-L6-v2"
SBERT_MODEL = os.path.expandvars(os.path.expanduser(os.environ.get("SBERT_MODEL", _DEFAULT_HUB)))

# Pruning for LocalFile sources: skip VCS, build outputs, dependency trees, and
# test sources (we currently index prod Java only to keep the semantic index clean).
# Also avoids EMFILE under default ulimits when the engine traverses in parallel.
# Each entry is a glob string; the inline notes group them by purpose.
COMMON_EXCLUDED_PATH_PATTERNS: list[str] = [
"**/.*",  # any dot-prefixed (hidden) entry below the top level
"**/.git/**",  # VCS metadata
"**/.idea/**",  # IDE project directory
"**/.venv/**",  # virtualenv directory
"**/node_modules/**",  # dependency tree
"**/target/**",  # build output
"**/build/**",  # build output
"**/out/**",  # build output
"**/*.class",  # compiled class files
"**/src/test/java/**",  # test sources (prod Java only is indexed)
"**/src/test/resources/**",  # test resources
]

# Larger window + overlap so chunks carry more behavioural context (method bodies
# rarely split mid-statement, fewer "orphan" import-only hits at chunk edges).
# Requires re-index to apply.
Expand All @@ -51,56 +32,3 @@ def position_to_json(pos: TextPosition) -> dict[str, Any]:
def chunk_key_range(chunk: Chunk) -> tuple[int, int]:
    """Return the ``(start, end)`` byte offsets of *chunk* for stable primary keys.

    The start offset is inclusive and the end offset is exclusive.
    """
    first_byte = chunk.start.byte_offset
    past_last_byte = chunk.end.byte_offset
    return (first_byte, past_last_byte)


# ---------- shared Java source tree walk (graph index + meta-annotation pass) ----------

def compile_excluded_glob_patterns(
    patterns: Iterable[str] | tuple[str, ...],
) -> list[str]:
    """Store exclude patterns in list form; same as ast-graph `index` compile step.

    Args:
        patterns: Glob patterns to keep. Any iterable of strings is accepted;
            a single bare string is treated as one pattern.

    Returns:
        A new list holding the patterns in their original order. The input
        is never returned as-is, so callers may mutate the result freely.
    """
    if isinstance(patterns, str):
        # A str is itself an iterable of characters; without this guard a
        # lone pattern would be exploded into one-character globs.
        return [patterns]
    return list(patterns)


def is_relative_path_excluded(
    rel_posix: str, exclude_globs: list[str],
) -> bool:
    """True if a project-relative path matches an exclude glob (incl. `**/<path>`).

    Each pattern is tried twice with `fnmatch.fnmatch`: once against the path
    itself and once against the path prefixed with ``**/``. The prefixed probe
    lets a pattern written as ``**/target/**`` also match a top-level
    ``target/...`` path, which has no leading ``/`` for the pattern's first
    separator to match.

    Args:
        rel_posix: Path relative to the project root, with ``/`` separators.
        exclude_globs: Glob patterns to test against.

    Returns:
        True when any pattern matches either form of the path; False when
        nothing matches or `exclude_globs` is empty.
    """
    # Loop-invariant: build the prefixed candidate once, not once per pattern.
    anchored = f"**/{rel_posix}"
    return any(
        fnmatch.fnmatch(rel_posix, pat) or fnmatch.fnmatch(anchored, pat)
        for pat in exclude_globs
    )


def iter_java_source_files(
    root: Path, exclude_globs: list[str],
) -> Iterator[Path]:
    """Walk `root` for `*.java`, honouring the same prunes and globs as `build_ast_graph`.

    Directories named ``.git``, ``target``, ``build``, ``out``,
    ``node_modules``, ``.venv`` or ``.idea`` are never descended into. Every
    remaining ``*.java`` file is then checked against `exclude_globs` via
    `is_relative_path_excluded`, using its path relative to the resolved root.

    Args:
        root: Directory to walk.
        exclude_globs: Glob patterns; a file whose relative path matches any
            of them is skipped.

    Yields:
        Paths of the surviving ``*.java`` files, built as ``Path(dirpath) / name``
        (not resolved), in `os.walk` order.
    """
    pruned_dirs = frozenset(
        (".git", "target", "build", "out", "node_modules", ".venv", ".idea")
    )
    # Loop-invariant: resolve the root once instead of once per candidate file.
    resolved_root = root.resolve()
    for dirpath, dirnames, filenames in os.walk(root):
        # Slice-assign in place so os.walk skips the pruned directories.
        dirnames[:] = [d for d in dirnames if d not in pruned_dirs]
        for fn in filenames:
            if not fn.endswith(".java"):
                continue
            p = Path(dirpath) / fn
            try:
                rel = p.resolve().relative_to(resolved_root).as_posix()
            except ValueError:
                # The resolved file is not under the resolved root (e.g. it is
                # reached through a symlink); match on the unresolved path.
                rel = p.as_posix()
            if is_relative_path_excluded(rel, exclude_globs):
                continue
            yield p
Loading