rrbutani · November 28, 2024 21:37 · rrbutani · Nov 28, 2024
diff --git a/..bazel-dynamic-input-subsetting-with-tree-artifacts.md b/..bazel-dynamic-input-subsetting-with-tree-artifacts.md
diff --git a/.bazelrc b/.bazelrc
 # TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
 common --action_env=PATH
 # NOTE(nixos): pass through the `NIX_LD` that `.envrc` exports; it is needed to
 # run the python3 interpreter that rules_python fetches
 common --action_env=NIX_LD
diff --git a/.envrc b/.envrc
 # shellcheck shell=bash
 use nix -p bazel_7 python3

 # NOTE(nixos): necessary to use the python3 interpreter that rules_python
 # fetches...
 #
 # The value is the contents of the `nix-support/dynamic-linker` file found
 # under the output path of nixpkgs' `stdenv.cc`.
 export NIX_LD="$(
    cat "$(
        nix eval --expr "((import <nixpkgs> {}).stdenv.cc).outPath" --impure --raw
    )/nix-support/dynamic-linker"
 )"
diff --git a/.gitignore b/.gitignore
 /.direnv
 /bazel-*
 /MODULE.bazel.lock
diff --git a/a.header b/a.header

 "0: hello from header a"

 include(b)
 include(c)

 "3: end of header a"

 include(c)
diff --git a/a.source b/a.source

 include(a)

 "4: hello from a"

 include(c)
 include(c)
diff --git a/b.header b/b.header

 "1: header b"

 include(c)

 "2: header b"

 include(c)
 include(c)
 include(c)

diff --git a/BUILD.bazel b/BUILD.bazel

 load("@rules_python//python:defs.bzl", "py_binary")
 load(":defs.bzl", "library", "binary")

 # The toy "compiler"; the `binary` rule in `defs.bzl` runs it through its
 # `_compiler` attribute, which defaults to `//:compiler`.
 py_binary(
    name = "compiler",
    srcs = ["compiler.py"],
 )

 ################################################################################

 # Header-only libraries. Each one's `deps` name the libraries providing the
 # headers that its own header includes.
 library(name = "b", hdrs = ["b.header"], deps = [":c"])
 library(name = "c", hdrs = ["c.header"])
 library(name = "d", hdrs = ["d.header"], deps = [":e", ":b", ":c"])
 library(name = "e", hdrs = ["e.header"], deps = [":c"])

 # `a.source` includes `a.header`, which in turn includes `b` and `c`. The
 # `:d` and `:e` deps are declared but never included, so they exercise the
 # header-pruning step.
 binary(
    name = "a",
    src = "a.source",
    hdrs = ["a.header"],
    deps = [
        ":b",
        ":c",

        # actually unused:
        ":d",
        ":e",
    ],
 )
diff --git a/c.header b/c.header

 "##############################################################################"
diff --git a/compiler.py b/compiler.py
 #!/usr/bin/env python3

 # Strawman example "compiler" for a very simple "language":
 #   - source files end in `.source` and have only 1 syntactic construct:
 #     + `include(<name>)` lines that result in the "compiler" looking for a file
 #       named `<name>.header` and substituting it in
 #     + all other lines are left as is
 #     + `<name>` must be a single path component (i.e. no directories)
 #   - header files end in `.header` and have the same syntax
 #
 # Recursive includes are disallowed though this is not enforced anywhere.
 #
 # A particular header can be included multiple times.

 import argparse
 import functools
 import os
 from pathlib import Path
 import sys
 from typing import Iterable

 def arg_parser():
    """Build the command-line parser for the toy compiler.

    Two subcommands are registered, `compile` and `scan-deps`. Each stores its
    handler under `func`, so the caller dispatches with `args.func(args)`.
    When no subcommand is given, `func` is a fallback that raises
    `ValueError`.
    """
    root = argparse.ArgumentParser()
    subcommands = root.add_subparsers()

    def register(command, handler, destination_flag):
        # Both subcommands take the same three options; only the flag naming
        # where the result goes differs.
        sub = subcommands.add_parser(command)
        sub.add_argument("--input", type=Path, required=True)
        sub.add_argument(destination_flag, type=Path, required=True)
        sub.add_argument("--available-headers", type=Path, nargs="*", default=[])
        sub.set_defaults(func=handler)

    register("compile", compile, "--output")
    register("scan-deps", scan_deps, "--pruned-header-out-dir")

    def subcmd_error(_args):
        raise ValueError("must provide a subcommand")

    root.set_defaults(func=subcmd_error)
    return root

 def p(*a, **kw):
    """Print to stderr; takes the same arguments as `print` except `file`."""
    return print(*a, **kw, file=sys.stderr)

 class Include(str):
    """Header name taken from an `include(<name>)` line."""

 class Line(str):
    """A line of text that is passed through as-is."""

 def parse_file(file_path: Path) -> Iterable[Include | Line]:
    """Lazily tokenize a source or header file.

    For every line that, once surrounding whitespace is stripped, has the form
    `include(<name>)`, yields an `Include` holding `<name>`. Every other line
    is yielded unchanged (trailing newline included) as a `Line`. The file
    name is logged through `p` when iteration starts.
    """
    opener, closer = "include(", ")"

    p(f"Reading '{file_path}'")
    with open(file_path, "r") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not (stripped.startswith(opener) and stripped.endswith(closer)):
                yield Line(raw)
                continue
            yield Include(stripped.removeprefix(opener).removesuffix(closer))

 def _make_header_map(available_headers: frozenset[Path]) -> dict[str, Path]:
    out = {}
    for h in available_headers:
        base = h.name
        assert base.endswith(".header")
        name = base.removesuffix(".header")

        assert name
        assert name not in out
        out[name] = h

    return out

 @functools.cache
 def _header_map_for(available_headers: frozenset[Path]) -> dict[str, Path]:
    """Return the name -> path table, built once per distinct header set."""
    return _make_header_map(available_headers)

 @functools.cache
 def resolve_header_path(available_headers: frozenset[Path], name: str) -> Path:
    """Look up the path of the header called `name`.

    Args:
        available_headers: the header paths to search; must be hashable
            because calls are memoized.
        name: header name as written inside `include(<name>)`.

    Returns:
        The path in `available_headers` whose file name is `<name>.header`.

    Raises:
        ValueError: if no available header has that name, or (from
            `_make_header_map`) if the header set itself is malformed.
    """
    # The table is cached per header set so that resolving many different
    # names does not rebuild it from scratch each time.
    header_map = _header_map_for(available_headers)
    if name not in header_map:
        raise ValueError(f"no header found for `{name}`")
    return header_map[name]

 ################################################################################

 def scan_deps(args):
    """Find the headers `args.input` transitively includes and symlink them.

    Reads from `args`:
        input: the file whose includes are scanned.
        available_headers: candidate header paths.
        pruned_header_out_dir: directory to populate; it must be missing or
            empty.

    Side effects: logs progress and every unused header to stderr via `p`,
    creates the output directory if needed, and creates one `<name>.header`
    symlink inside it per referenced header.

    Raises:
        ValueError: from `resolve_header_path`, when an included header is not
            among the available ones.
        FileExistsError: if the output directory already contains entries.
    """
    hdrs = frozenset(args.available_headers)
    out: Path = args.pruned_header_out_dir

    p(f"Scanning inputs of {args.input}; pruning into → {out}")
    p(f"{len(args.available_headers)} headers provided.")

    # Name -> path of every header reached so far, in first-seen order.
    referenced_headers: dict[str, Path] = {}

    def collect_references(file_path: Path) -> None:
        # Read the whole file before recursing so it is closed first.
        include_names = [
            str(entry)
            for entry in parse_file(file_path)
            if isinstance(entry, Include)
        ]
        for include_name in include_names:
            # Each header is expanded only the first time it is met; without
            # this, diamond-shaped include graphs are re-walked once per
            # occurrence and an include cycle would never terminate.
            if include_name in referenced_headers:
                continue
            include_path = resolve_header_path(hdrs, include_name)
            referenced_headers[include_name] = include_path
            collect_references(include_path)

    collect_references(args.input)
    referenced_header_paths: set[Path] = set(referenced_headers.values())

    p()
    unused = [h for h in args.available_headers if h not in referenced_header_paths]
    for h in unused:
        p(f"Header {h} was not used.")
    if not unused:
        p("No unused headers.")

    # Explicit raise rather than `assert` so the check survives `python -O`.
    if out.exists() and os.listdir(out):
        raise FileExistsError(f"output directory `{out}` is not empty")
    os.makedirs(out, exist_ok=True)
    for name, path in referenced_headers.items():
        # NOTE: we need to make a new output symlink that references the header
        # file in question relative to the output directory's path
        #
        # if we just create a symlink pointing at the input path (i.e. a staged
        # input symlink) we get this error:
        # "error while validating output tree artifact <tree>: <file> (Too many levels of symbolic links)"
        #
        # if we point at the (one level deep) resolved path of the staged input
        # symlink we get the sandbox-only absolute path for the file (i.e.
        # `/tmp/bazel-source-roots/0/a.header`) which Bazel considers dangling
        # when validating the TreeArtifact
        #
        # if we resolve the symlink all the way, we get a host filesystem
        # absolute path which is not hermetic and which (I believe) Bazel will
        # not track in SkyFrame (not certain) and will not interact with RBE
        # well
        #
        # See: https://github.com/bazelbuild/bazel/issues/20891
        # See: https://github.com/bazel-contrib/rules_oci/pull/559/files

        relative_header_path = path.relative_to(out, walk_up=True)
        os.symlink(relative_header_path, out.joinpath(name + ".header"))

    p(f"\n{len(referenced_header_paths)} headers used.")

 def compile(args):
    """Write `args.input` to `args.output` with every include expanded.

    Each `include(<name>)` line is replaced, recursively, by the contents of
    the header that `resolve_header_path` finds for `<name>` among
    `args.available_headers`; all other lines are copied verbatim. The parent
    directory of the output is created if it does not exist.
    """
    available = frozenset(args.available_headers)

    p(f"Compiling: {args.input} → {args.output}")
    p(f"{len(available)} headers provided.")

    # A header may be included many times; parse each file only once.
    @functools.cache
    def parsed(file_path: Path) -> list[Include | Line]:
        return list(parse_file(file_path))

    def expand(file_path: Path) -> Iterable[str]:
        for item in parsed(file_path):
            if isinstance(item, Include):
                yield from expand(resolve_header_path(available, item))
            elif isinstance(item, Line):
                yield item
            else:
                raise ValueError(f"unreachable: {item}")

    os.makedirs(args.output.parent, exist_ok=True)
    with open(args.output, "w") as sink:
        for text in expand(args.input):
            sink.write(text)

 if __name__ == "__main__":
    # Parse the command line, then hand off to the chosen subcommand.
    args = arg_parser().parse_args()
    args.func(args)

    # Timestamp on stdout makes it obvious whether the action actually ran or
    # was served from the cache.
    import datetime
    print(f"\nFinished at {datetime.datetime.now()}")
diff --git a/d.header b/d.header

 "header d: you should not see this!"

 include(e)
 include(b)
 include(c)
diff --git a/defs.bzl b/defs.bzl

 # Provider through which `library` targets hand their headers to dependents.
 MyInfo = provider(
    fields = {
        # `_library_impl` fills this with the target's own `hdrs` plus the
        # `headers` of all of its deps.
        "headers": "depset[File]",
    },
 )


 def _library_impl(ctx):
    """Implementation of `library`.

    Collects this target's `hdrs` together with the `MyInfo.headers` of every
    dep and exposes the resulting depset both as the default outputs and
    through `MyInfo`.
    """
    headers = depset(
        direct = ctx.files.hdrs,
        transitive = [dep[MyInfo].headers for dep in ctx.attr.deps],
    )
    return [
        DefaultInfo(files = headers),
        MyInfo(headers = headers),
    ]

 # A set of `.header` files plus the libraries they depend on.
 library = rule(
    implementation = _library_impl,
    provides = [MyInfo],
    attrs = {
        # This library's own header files.
        "hdrs": attr.label_list(allow_files = [".header"]),
        # Other targets providing `MyInfo`; their headers are propagated.
        "deps": attr.label_list(providers = [MyInfo]),
    },
 )


 def _binary_impl(ctx):
    """Implementation of `binary`.

    Registers two actions that both run the `_compiler` tool:
      1. `ScanDeps` runs `scan-deps` over the source with every header from
         `hdrs` and `deps` as input, and fills a declared directory (a
         `TreeArtifact`) with the subset of headers actually used.
      2. `Compile` runs `compile` with only the source and that directory as
         inputs, and produces `<name>.out`.
    """
    name = ctx.attr.name
    src = ctx.file.src
    compiler = ctx.executable._compiler
    available_headers = depset(
        direct = ctx.files.hdrs,
        transitive = [dep[MyInfo].headers for dep in ctx.attr.deps],
    )

    out = ctx.actions.declare_file(name + ".out")
    pruned_headers = ctx.actions.declare_directory("_" + name + ".headers")

    # First, run `scan-deps` to winnow the set of headers:
    scan_args = ctx.actions.args()
    scan_args.add("scan-deps")
    scan_args.add("--input", src)
    scan_args.add("--pruned-header-out-dir", pruned_headers.path)
    scan_args.add_all("--available-headers", available_headers)

    ctx.actions.run(
        mnemonic = "ScanDeps",
        progress_message = "Scanning %{input} for deps (%{label})",
        executable = compiler,
        arguments = [scan_args],
        inputs = depset(direct = [src], transitive = [available_headers]),
        outputs = [pruned_headers],

        # NOTE: in a "real" use case we might tag this action as "local" to cut
        # down on the number of files that need to be copied to RBE workers.
        # execution_requirements = { ... },

        # TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
        use_default_shell_env = True,
    )

    # Then run `compile` with the narrowed set of headers (symlinks):
    compile_args = ctx.actions.args()
    compile_args.add("compile")
    compile_args.add("--input", src)
    compile_args.add("--output", out)
    compile_args.add_all("--available-headers", [pruned_headers])

    ctx.actions.run(
        mnemonic = "Compile",
        progress_message = "Compiling %{input} -> %{output} (%{label})",
        executable = compiler,
        arguments = [compile_args],
        inputs = depset(direct = [src, pruned_headers]),
        outputs = [out],

        # TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
        use_default_shell_env = True,
    )

    return [DefaultInfo(files = depset([out]))]

 # Compiles one `.source` file against the headers it actually uses.
 binary = rule(
    implementation = _binary_impl,
    attrs = {
        # The single source file to compile.
        "src": attr.label(allow_single_file = [".source"]),
        # Headers belonging to this target directly.
        "hdrs": attr.label_list(allow_files = [".header"]),
        # Targets providing `MyInfo` whose headers may be included.
        "deps": attr.label_list(providers = [MyInfo]),
        # The tool run by both actions; built for the exec configuration.
        "_compiler": attr.label(
            default = Label("//:compiler"),
            executable = True,
            cfg = "exec",
        ),
    },
 )
diff --git a/e.header b/e.header

 "header e: you should not see this either!"

 include(c)
diff --git a/MODULE.bazel b/MODULE.bazel
 module(name = "dynamic_input_subsetting_with_tree_artifacts")

 # Supplies `py_binary`, which BUILD.bazel uses for the `compiler` target.
 bazel_dep(name = "rules_python", version = "0.35.0")

 # Make Python 3.12 the default toolchain.
 # NOTE(review): compiler.py calls `Path.relative_to(..., walk_up=True)`, which
 # presumably requires 3.12 or newer; confirm before lowering this version.
 python = use_extension("@rules_python//python/extensions:python.bzl", "python")
 python.toolchain(python_version = "3.12", is_default=True)
	# TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
	common --action_env=PATH
	common --action_env=NIX_LD
	# shellcheck shell=bash
	use nix -p bazel_7 python3

	# NOTE(nixos): necessary to use the python3 interpreter that rules_python
	# fetches...
	export NIX_LD="$(
	cat "$(
	nix eval --expr "((import <nixpkgs> {}).stdenv.cc).outPath" --impure --raw
	)/nix-support/dynamic-linker"
	)"

	load("@rules_python//python:defs.bzl", "py_binary")
	load(":defs.bzl", "library", "binary")

	py_binary(
	name = "compiler",
	srcs = ["compiler.py"],
	)

	################################################################################

	library(name = "b", hdrs = ["b.header"], deps = [":c"])
	library(name = "c", hdrs = ["c.header"])
	library(name = "d", hdrs = ["d.header"], deps = [":e", ":b", ":c"])
	library(name = "e", hdrs = ["e.header"], deps = [":c"])

	binary(
	name = "a",
	src = "a.source",
	hdrs = ["a.header"],
	deps = [
	":b",
	":c",

	# actually unused:
	":d",
	":e",
	],
	)
	#!/usr/bin/env python3

	# Strawman example "compiler" for a very simple "language":
	# - source files end in `.source` and have only 1 syntactic construct:
	# + `include(<name>)` lines that result in the "compiler" looking for a file
	# named `<name>.header` and substituting it in
	# + all other lines are left as is
	# + `<name>` must be a single path component (i.e. no directories)
	# - header files end in `.header` and have the same syntax
	#
	# Recursive includes are disallowed though this is not enforced anywhere.
	#
	# A particular header can be included multiple times.

	import argparse
	import functools
	import os
	from pathlib import Path
	import sys
	from typing import Iterable

	def arg_parser():
	parser = argparse.ArgumentParser()
	subs = parser.add_subparsers()

	comp = subs.add_parser("compile")
	comp.add_argument("--input", type=Path, required=True)
	comp.add_argument("--output", type=Path, required=True)
	comp.add_argument("--available-headers", type=Path, nargs="*", default=[])
	comp.set_defaults(func=compile)

	scan = subs.add_parser("scan-deps")
	scan.add_argument("--input", type=Path, required=True)
	scan.add_argument("--pruned-header-out-dir", type=Path, required=True)
	scan.add_argument("--available-headers", type=Path, nargs="*", default=[])
	scan.set_defaults(func=scan_deps)

	def subcmd_error(_args): raise ValueError("must provide a subcommand")
	parser.set_defaults(func=subcmd_error)

	return parser

	p = lambda a, kw: print(a, **kw, file=sys.stderr)

	class Include(str): pass
	class Line(str): pass

	def parse_file(file_path: Path) -> Iterable[Include \| Line]:
	p(f"Reading '{file_path}'")
	with open(file_path, "r") as f:
	while (line := f.readline()):
	line_ = line.strip()
	if line_.startswith("include(") and line_.endswith(")"):
	yield Include(line_.removeprefix("include(").removesuffix(")"))
	else:
	yield Line(line)

	def _make_header_map(available_headers: frozenset[Path]) -> dict[str, Path]:
	out = {}
	for h in available_headers:
	base = h.name
	assert base.endswith(".header")
	name = base.removesuffix(".header")

	assert name
	assert name not in out
	out[name] = h

	return out

	@functools.cache
	def resolve_header_path(available_headers: frozenset[Path], name: str) -> Path:
	map = _make_header_map(available_headers)
	if not name in map: raise ValueError(f"no header found for `{name}`")
	return map[name]

	################################################################################

	def scan_deps(args):
	hdrs = frozenset(args.available_headers)
	out: Path = args.pruned_header_out_dir

	p(f"Scanning inputs of {args.input}; pruning into → {out}")
	p(f"{len(args.available_headers)} headers provided.")


	@functools.cache
	def direct_references(file_path: Path) -> list[str]: return [
	str(entry) for entry in parse_file(file_path) if type(entry) is Include
	]

	def recursive_references(file_path: Path) -> Iterable[tuple[str, Path]]:
	for include_name in direct_references(file_path):
	include_path = resolve_header_path(hdrs, include_name)
	yield include_name, include_path
	yield from recursive_references(include_path)

	referenced_headers = { n: p for n, p in recursive_references(args.input) }
	referenced_header_paths: set[Path] = set(referenced_headers.values())

	p()
	had_unused = False
	for h in args.available_headers:
	if h not in referenced_header_paths:
	had_unused = True
	p(f"Header {h} was not used.")
	if not had_unused:
	p("No unused headers.")

	if out.exists():
	assert not os.listdir(out)
	os.makedirs(out, exist_ok=True)
	for name, path in referenced_headers.items():
	# NOTE: we need to make a new output symlink that references the header
	# file in question relative to the output directory's path
	#
	# if we just create a symlink pointing at the input path (i.e. a staged
	# input symlink) we get this error:
	# "error while validating output tree artifact <tree>: <file> (Too many levels of symbolic links)"
	#
	# if we point at the (one level deep) resolved path of the staged input
	# symlink we get the sandbox-only absolute path for the file (i.e.
	# `/tmp/bazel-source-roots/0/a.header`) which Bazel considers dangling
	# when validating the TreeArtifact
	#
	# if we resolve the symlink all the way, we get a host filesystem
	# absolute path which is not hermetic and which (I believe) Bazel will
	# not track in SkyFrame (not certain) and will not interact with RBE
	# well
	#
	# See: https://github.com/bazelbuild/bazel/issues/20891
	# See: https://github.com/bazel-contrib/rules_oci/pull/559/files

	relative_header_path = path.relative_to(out, walk_up=True)
	os.symlink(relative_header_path, out.joinpath(name + ".header"))

	p(f"\n{len(referenced_header_paths)} headers used.")

	def compile(args):
	hdrs = frozenset(args.available_headers)

	p(f"Compiling: {args.input} → {args.output}")
	p(f"{len(hdrs)} headers provided.")

	@functools.cache
	def cache_parse(file_path: Path) -> list[Include \| Line]:
	return list(parse_file(file_path))

	def recursively_expand(file_path: Path) -> Iterable[str]:
	for item in cache_parse(file_path):
	match item:
	case Include(name):
	yield from recursively_expand(resolve_header_path(
	hdrs, name,
	))
	case Line(line): yield line
	case other: raise ValueError(f"unreachable: {other}")

	os.makedirs(args.output.parent, exist_ok=True)
	with open(args.output, "w") as out:
	out.writelines(recursively_expand(args.input))

	if __name__ == "__main__":
	(args := arg_parser().parse_args()).func(args)

	# Just so it's apparent from stdout whether we ran or hit in the cache.
	import datetime
	print(f"\nFinished at {datetime.datetime.now()}")

	MyInfo = provider(
	fields = dict(
	headers = 'depset[File]',
	),
	)


	def _library_impl(ctx):
	direct_headers = ctx.files.hdrs
	deps = ctx.attr.deps

	all_headers = depset(
	direct = direct_headers,
	transitive = [d[MyInfo].headers for d in deps],
	)

	return [
	DefaultInfo(files = all_headers),
	MyInfo(headers = all_headers),
	]

	library = rule(
	implementation = _library_impl,
	attrs = dict(
	hdrs = attr.label_list(allow_files = [".header"]),
	deps = attr.label_list(providers = [MyInfo]),
	),
	provides = [MyInfo],
	)


	def _binary_impl(ctx):
	all_headers = depset(
	direct = ctx.files.hdrs,
	transitive = [d[MyInfo].headers for d in ctx.attr.deps],
	)
	src = ctx.file.src
	compiler = ctx.executable._compiler

	out = ctx.actions.declare_file(ctx.attr.name + ".out")
	headers_for_src = ctx.actions.declare_directory("_" + ctx.attr.name + ".headers")

	# NOTE: `TreeArtifact`s

	# First, run `scan-deps` to winnow the set of headers:
	ctx.actions.run(
	outputs = [headers_for_src],
	inputs = depset(direct = [src], transitive = [all_headers]),
	executable = compiler,
	arguments = [
	ctx.actions.args()
	.add("scan-deps")
	.add("--input", src)
	.add("--pruned-header-out-dir", headers_for_src.path)
	.add_all("--available-headers", all_headers)
	],
	mnemonic = "ScanDeps",
	progress_message = "Scanning %{input} for deps (%{label})",

	# NOTE: in a "real" use case we might tag this action as "local" to cut
	# down on the number of files that need to be copied to RBE workers.
	# execution_requirements = { ... },

	# TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
	use_default_shell_env = True,
	)

	# Then run `compile` with the narrowed set of headers (symlinks):
	ctx.actions.run(
	outputs = [out],
	inputs = depset(direct = [src, headers_for_src]),
	executable = compiler,
	arguments = [
	ctx.actions.args()
	.add("compile")
	.add("--input", src)
	.add("--output", out)
	.add_all("--available-headers", [headers_for_src])
	],
	mnemonic = "Compile",
	progress_message = "Compiling %{input} -> %{output} (%{label})",

	# TODO(nixos): necessary for `$PATH` for `/usr/bin/env python3`
	use_default_shell_env = True, )

	return [DefaultInfo(files = depset([out]))]

	binary = rule(
	implementation = _binary_impl,
	attrs = dict(
	src = attr.label(allow_single_file = [".source"]),
	hdrs = attr.label_list(allow_files = [".header"]),
	deps = attr.label_list(providers = [MyInfo]),
	_compiler = attr.label(
	executable = True,
	cfg = "exec",
	default = Label("//:compiler"),
	)
	),
	)
	module(name = "dynamic_input_subsetting_with_tree_artifacts")

	bazel_dep(name = "rules_python", version = "0.35.0")

	python = use_extension("@rules_python//python/extensions:python.bzl", "python")
	python.toolchain(python_version = "3.12", is_default=True)