diff --git a/pkgs/build-support/rust/fetch-cargo-vendor-util-v2.py b/pkgs/build-support/rust/fetch-cargo-vendor-util-v2.py
new file mode 100644
index 0000000000000..5dc789c93ad95
--- /dev/null
+++ b/pkgs/build-support/rust/fetch-cargo-vendor-util-v2.py
@@ -0,0 +1,455 @@
+import functools
+import hashlib
+import json
+import multiprocessing as mp
+import re
+import shutil
+import subprocess
+import sys
+import tomllib
+from os.path import islink, realpath
+from pathlib import Path
+from typing import Any, TypedDict, cast
+from urllib.parse import unquote
+
+import requests
+import tomli_w
+from requests.adapters import HTTPAdapter, Retry
+
+eprint = functools.partial(print, file=sys.stderr)
+
+
+def load_toml(path: Path) -> dict[str, Any]:
+    """Parse the TOML file at `path` and return its contents as a dict."""
+    with open(path, "rb") as f:
+        return tomllib.load(f)
+
+
+def get_lockfile_version(cargo_lock_toml: dict[str, Any]) -> int:
+    """Return the `version` key of a parsed Cargo.lock, defaulting to 2 when it is absent."""
+    # lockfile v1 and v2 don't have the `version` key, so assume v2
+    version = cargo_lock_toml.get("version", 2)
+
+    # TODO: add logic for differentiating between v1 and v2
+
+    return version
+
+
+def create_http_session() -> requests.Session:
+    """Create a requests session with the nixpkgs User-Agent and a retry policy (total=5, also on HTTP 500/502/503/504)."""
+    retries = Retry(
+        total=5,
+        backoff_factor=0.5,
+        status_forcelist=[500, 502, 503, 504]
+    )
+    session = requests.Session()
+    session.headers["User-Agent"] = "nixpkgs-fetchCargoVendor/2 (https://github.com/NixOS/nixpkgs)"
+    session.mount('http://', HTTPAdapter(max_retries=retries))
+    session.mount('https://', HTTPAdapter(max_retries=retries))
+    return session
+
+
+def download_file_with_checksum(session: requests.Session, url: str, destination_path: Path) -> str:
+    """Stream `url` into `destination_path` and return the SHA-256 hex digest of the bytes written."""
+    sha256_hash = hashlib.sha256()
+    with session.get(url, stream=True) as response:
+        if not response.ok:
+            raise Exception(f"Failed to fetch file from {url}. Status code: {response.status_code}")
+        with open(destination_path, "wb") as file:
+            for chunk in response.iter_content(1024):  # Download in chunks
+                if chunk:  # Filter out keep-alive chunks
+                    file.write(chunk)
+                    sha256_hash.update(chunk)
+
+    # Compute the final checksum
+    checksum = sha256_hash.hexdigest()
+    return checksum
+
+
+def get_download_url_for_tarball(pkg: dict[str, Any]) -> str:
+    """Return the static.crates.io download URL for a crates.io lockfile package entry.
+
+    Raises an Exception when the package comes from any other registry.
+    """
+    # TODO: support other registries
+    # maybe fetch config.json from the registry root and get the dl key
+    # See: https://doc.rust-lang.org/cargo/reference/registry-index.html#index-configuration
+    if pkg["source"] != "registry+https://github.com/rust-lang/crates.io-index":
+        raise Exception("Only the default crates.io registry is supported.")
+
+    # Use static.crates.io (CDN) instead of crates.io/api to avoid the 1 req/sec
+    # rate limit on the API servers.
+    return f"https://static.crates.io/crates/{pkg["name"]}/{pkg["version"]}/download"
+
+
+def download_tarball(session: requests.Session, pkg: dict[str, Any], out_dir: Path) -> None:
+    """Download the crate tarball for `pkg` into `out_dir/tarballs` and raise if its SHA-256 differs from `pkg["checksum"]`."""
+
+    url = get_download_url_for_tarball(pkg)
+    filename = f"{pkg["name"]}-{pkg["version"]}.tar.gz"
+
+    # TODO: allow legacy checksum specification, see importCargoLock for example
+    # also, don't forget about the other usage of the checksum
+    expected_checksum = pkg["checksum"]
+
+    tarball_out_dir = out_dir / "tarballs" / filename
+    eprint(f"Fetching {url} -> tarballs/{filename}")
+
+    calculated_checksum = download_file_with_checksum(session, url, tarball_out_dir)
+
+    if calculated_checksum != expected_checksum:
+        raise Exception(f"Hash mismatch! File fetched from {url} had checksum {calculated_checksum}, expected {expected_checksum}.")
+
+
+def download_git_tree(url: str, git_sha_rev: str, out_dir: Path) -> None:
+    """Fetch `url` at `git_sha_rev` (with submodules) into `out_dir/git/<git_sha_rev>` using nix-prefetch-git."""
+
+    tree_out_dir = out_dir / "git" / git_sha_rev
+    eprint(f"Fetching {url}#{git_sha_rev} -> git/{git_sha_rev}")
+
+    cmd = ["nix-prefetch-git", "--builder", "--quiet", "--fetch-submodules", "--url", url, "--rev", git_sha_rev, "--out", str(tree_out_dir)]
+    subprocess.check_output(cmd)
+
+
+# Matches Cargo.lock git sources of the form: git+<url>[?(rev|tag|branch)=<value>]#<git_sha_rev>
+# The group names must stay in sync with the keys of GitSourceInfo below.
+GIT_SOURCE_REGEX = re.compile("git\\+(?P<url>[^?]+)(\\?(?P<type>rev|tag|branch)=(?P<value>.*))?#(?P<git_sha_rev>.*)")
+
+
+class GitSourceInfo(TypedDict):
+    url: str
+    type: str | None
+    value: str | None
+    git_sha_rev: str
+
+
+def parse_git_source(source: str, lockfile_version: int) -> GitSourceInfo:
+    """Split a lockfile git `source` string into url, selector type/value and commit hash; raise if it does not match."""
+    match = GIT_SOURCE_REGEX.match(source)
+    if match is None:
+        raise Exception(f"Unable to process git source: {source}.")
+
+    source_info = cast(GitSourceInfo, match.groupdict(default=None))
+
+    # the source URL is URL-encoded in lockfile_version >=4
+    # since we just used regex to parse it we have to manually decode the escaped branch/tag name
+    if lockfile_version >= 4 and source_info["value"] is not None:
+        source_info["value"] = unquote(source_info["value"])
+
+    return source_info
+
+
+def create_vendor_staging(lockfile_path: Path, out_dir: Path) -> None:
+    """Fetch every non-local dependency listed in the lockfile into `out_dir`.
+
+    Git trees are stored under `out_dir/git/<rev>`, registry tarballs under
+    `out_dir/tarballs`, and the lockfile is copied to `out_dir/Cargo.lock`.
+    """
+    cargo_lock_toml = load_toml(lockfile_path)
+    lockfile_version = get_lockfile_version(cargo_lock_toml)
+
+    git_packages: list[dict[str, Any]] = []
+    registry_packages: list[dict[str, Any]] = []
+
+    for pkg in cargo_lock_toml["package"]:
+        # ignore local dependencies
+        if "source" not in pkg.keys():
+            eprint(f"Skipping local dependency: {pkg["name"]}")
+            continue
+        source = pkg["source"]
+
+        if source.startswith("git+"):
+            git_packages.append(pkg)
+        elif source.startswith("registry+"):
+            registry_packages.append(pkg)
+        else:
+            raise Exception(f"Can't process source: {source}.")
+
+    git_sha_rev_to_url: dict[str, str] = {}
+    for pkg in git_packages:
+        source_info = parse_git_source(pkg["source"], lockfile_version)
+        git_sha_rev_to_url[source_info["git_sha_rev"]] = source_info["url"]
+
+    out_dir.mkdir(exist_ok=True)
+    shutil.copy(lockfile_path, out_dir / "Cargo.lock")
+
+    # fetch git trees sequentially, since fetching concurrently leads to flaky behaviour
+    if len(git_packages) != 0:
+        (out_dir / "git").mkdir()
+        for git_sha_rev, url in git_sha_rev_to_url.items():
+            download_git_tree(url, git_sha_rev, out_dir)
+
+    # run tarball download jobs in parallel, with at most 5 concurrent download jobs
+    with mp.Pool(min(5, mp.cpu_count())) as pool:
+        if len(registry_packages) != 0:
+            (out_dir / "tarballs").mkdir()
+            session = create_http_session()
+            tarball_args_gen = ((session, pkg, out_dir) for pkg in registry_packages)
+            pool.starmap(download_tarball, tarball_args_gen)
+
+
+def get_manifest_metadata(manifest_path: Path) -> dict[str, Any]:
+    """Return the parsed JSON output of `cargo metadata --no-deps` for `manifest_path`."""
+    cmd = ["cargo", "metadata", "--format-version", "1", "--no-deps", "--manifest-path", str(manifest_path)]
+    output = subprocess.check_output(cmd)
+    return json.loads(output)
+
+
+def try_get_crate_manifest_path_from_manifest_path(manifest_path: Path, crate_name: str) -> Path | None:
+    """Return the manifest path cargo metadata reports for `crate_name` when run on `manifest_path`.
+
+    Returns None when cargo metadata fails or does not list a package with that name.
+    """
+    try:
+        metadata = get_manifest_metadata(manifest_path)
+    except subprocess.CalledProcessError:
+        eprint(f"Warning: cargo metadata failed for {manifest_path}, skipping")
+        return None
+
+    for pkg in metadata["packages"]:
+        if pkg["name"] == crate_name:
+            return Path(pkg["manifest_path"])
+
+    return None
+
+
+def find_crate_manifest_in_tree(tree: Path, crate_name: str) -> Path:
+    """Locate the Cargo.toml that defines `crate_name` somewhere inside `tree`; raise if none is found."""
+    # Scan all Cargo.toml files; sort by depth/path to make ordering deterministic
+    # and prefer less-nested manifests first.
+    manifest_paths = sorted(
+        tree.glob("**/Cargo.toml"),
+        key=lambda path: (len(path.parts), str(path)),
+    )
+
+    for manifest_path in manifest_paths:
+        res = try_get_crate_manifest_path_from_manifest_path(manifest_path, crate_name)
+        if res is not None:
+            return res
+
+    raise Exception(f"Couldn't find manifest for crate {crate_name} inside {tree}.")
+
+
+def copy_and_patch_git_crate_subtree(git_tree: Path, crate_name: str, crate_out_dir: Path) -> None:
+    """Copy the crate `crate_name` out of `git_tree` into `crate_out_dir`, skipping invalid symlinks.
+
+    If the crate manifest contains the text "workspace", the copied Cargo.toml is
+    patched by `replace-workspace-values` against the workspace root manifest.
+    """
+
+    # This function will get called by copytree to decide which entries of a directory should be copied
+    # We'll copy everything except symlinks that are invalid
+    def ignore_func(dir_str: str, path_strs: list[str]) -> list[str]:
+        ignorelist: list[str] = []
+
+        dir = Path(realpath(dir_str, strict=True))
+
+        for path_str in path_strs:
+            path = dir / path_str
+            if not islink(path):
+                continue
+
+            # Filter out cyclic symlinks and symlinks pointing at nonexistent files
+            try:
+                target_path = Path(realpath(path, strict=True))
+            except OSError:
+                ignorelist.append(path_str)
+                eprint(f"Failed to resolve symlink, ignoring: {path}")
+                continue
+
+            # Filter out symlinks that point outside of the current crate's base git tree
+            # This can be useful if the nix build sandbox is turned off and there is a symlink to a common absolute path
+            if not target_path.is_relative_to(git_tree):
+                ignorelist.append(path_str)
+                eprint(f"Symlink points outside of the crate's base git tree, ignoring: {path} -> {target_path}")
+                continue
+
+        return ignorelist
+
+    crate_manifest_path = find_crate_manifest_in_tree(git_tree, crate_name)
+    crate_tree = crate_manifest_path.parent
+
+    eprint(f"Copying to {crate_out_dir}")
+    shutil.copytree(crate_tree, crate_out_dir, ignore=ignore_func)
+    crate_out_dir.chmod(0o755)
+
+    with open(crate_manifest_path, "r") as f:
+        manifest_data = f.read()
+
+    if "workspace" in manifest_data:
+        crate_manifest_metadata = get_manifest_metadata(crate_manifest_path)
+        workspace_root = Path(crate_manifest_metadata["workspace_root"])
+
+        root_manifest_path = workspace_root / "Cargo.toml"
+        manifest_path = crate_out_dir / "Cargo.toml"
+
+        manifest_path.chmod(0o644)
+        eprint(f"Patching {manifest_path}")
+
+        cmd = ["replace-workspace-values", str(manifest_path), str(root_manifest_path)]
+        subprocess.check_output(cmd)
+
+
+def extract_crate_tarball_contents(tarball_path: Path, crate_out_dir: Path) -> None:
+    """Create `crate_out_dir` and unpack `tarball_path` into it with `tar`, stripping the leading path component."""
+    eprint(f"Unpacking to {crate_out_dir}")
+    crate_out_dir.mkdir()
+    cmd = ["tar", "xf", str(tarball_path), "-C", str(crate_out_dir), "--strip-components=1"]
+    subprocess.check_output(cmd)
+
+
+def make_git_source_selector(source_info: GitSourceInfo) -> dict[str, str]:
+    """Build the source selector table for a git source: the `git` url plus the rev/tag/branch entry if one was given."""
+    selector = {}
+    selector["git"] = source_info["url"]
+    if source_info["type"] is not None:
+        selector[source_info["type"]] = source_info["value"]
+    return selector
+
+
+def make_registry_source_selector(source: str) -> dict[str, str]:
+    """Build the source selector table for a registry source, dropping a leading `registry+` from `source`."""
+    registry = source[9:] if source.startswith("registry+") else source
+    selector = {}
+    selector["registry"] = registry
+    return selector
+
+
+def create_vendor(vendor_staging_dir: Path, out_dir: Path) -> None:
+    """Turn a vendor staging directory into a vendor directory at `out_dir`.
+
+    Writes `.cargo/config.toml` with the source replacement table and places each
+    non-local package in `out_dir/source-<ind>/<name>-<version>` together with
+    a `.cargo-checksum.json`.
+    """
+    lockfile_path = vendor_staging_dir / "Cargo.lock"
+    out_dir.mkdir(exist_ok=True)
+    shutil.copy(lockfile_path, out_dir / "Cargo.lock")
+
+    cargo_lock_toml = load_toml(lockfile_path)
+    lockfile_version = get_lockfile_version(cargo_lock_toml)
+
+    source_to_ind: dict[str, str] = {}
+    source_config = {}
+    next_registry_ind = 0
+    next_git_ind = 0
+
+    def add_source_replacement(
+        orig_key: str,
+        orig_selector: dict[str, str],
+        vendored_key: str,
+        vendored_dir: str
+    ) -> None:
+        source_config[vendored_key] = {}
+        source_config[vendored_key]["directory"] = vendored_dir
+        source_config[orig_key] = orig_selector
+        source_config[orig_key]["replace-with"] = vendored_key
+
+    # we reserve registry index 0 for crates-io
+    source_to_ind["registry+https://github.com/rust-lang/crates.io-index"] = "registry-0"
+    source_to_ind["sparse+https://index.crates.io/"] = "registry-0"
+    add_source_replacement(
+        orig_key="crates-io",
+        orig_selector={},  # there is an internal selector defined for the `crates-io` source
+        vendored_key="vendored-source-registry-0",
+        vendored_dir="@vendor@/source-registry-0"
+    )
+    next_registry_ind += 1
+
+    for pkg in cargo_lock_toml["package"]:
+        # ignore local dependencies
+        if "source" not in pkg.keys():
+            continue
+        source: str = pkg["source"]
+        if source in source_to_ind:
+            continue
+
+        if source.startswith("git+"):
+            ind = f"git-{next_git_ind}"
+            next_git_ind += 1
+            source_info = parse_git_source(source, lockfile_version)
+            selector = make_git_source_selector(source_info)
+        elif source.startswith("registry+") or source.startswith("sparse+"):
+            ind = f"registry-{next_registry_ind}"
+            next_registry_ind += 1
+            selector = make_registry_source_selector(source)
+        else:
+            raise Exception(f"Can't process source: {source}.")
+
+        source_to_ind[source] = ind
+        add_source_replacement(
+            orig_key=f"original-source-{ind}",
+            orig_selector=selector,
+            vendored_key=f"vendored-source-{ind}",
+            vendored_dir=f"@vendor@/source-{ind}"
+        )
+
+    config_path = out_dir / ".cargo" / "config.toml"
+    config_path.parent.mkdir()
+
+    with open(config_path, "wb") as config_file:
+        tomli_w.dump({"source": source_config}, config_file)
+
+    for pkg in cargo_lock_toml["package"]:
+
+        # ignore local dependencies
+        if "source" not in pkg.keys():
+            continue
+
+        source: str = pkg["source"]
+        source_ind = source_to_ind[source]
+        crate_dir_name = f"{pkg["name"]}-{pkg["version"]}"
+        source_dir_name = f"source-{source_ind}"
+        crate_out_dir = out_dir / source_dir_name / crate_dir_name
+        crate_out_dir.parent.mkdir(exist_ok=True)
+
+        if source.startswith("git+"):
+
+            source_info = parse_git_source(source, lockfile_version)
+
+            git_sha_rev = source_info["git_sha_rev"]
+            git_tree = vendor_staging_dir / "git" / git_sha_rev
+
+            copy_and_patch_git_crate_subtree(git_tree, pkg["name"], crate_out_dir)
+
+            # git based crates allow having no checksum information
+            with open(crate_out_dir / ".cargo-checksum.json", "w") as f:
+                json.dump({"files": {}}, f)
+
+        elif source.startswith("registry+") or source.startswith("sparse+"):
+            filename = f"{pkg["name"]}-{pkg["version"]}.tar.gz"
+
+            # TODO: change this when non-crates-io registries are supported
+            dir_name = "tarballs"
+
+            tarball_path = vendor_staging_dir / dir_name / filename
+
+            extract_crate_tarball_contents(tarball_path, crate_out_dir)
+
+            # non-git based crates need the package checksum at minimum
+            with open(crate_out_dir / ".cargo-checksum.json", "w") as f:
+                json.dump({"files": {}, "package": pkg["checksum"]}, f)
+
+        else:
+            raise Exception(f"Can't process source: {source}.")
+
+
+def main() -> None:
+    """Run the subcommand named by `sys.argv[1]`, passing `sys.argv[2]` and `sys.argv[3]` to it as paths."""
+    subcommand = sys.argv[1]
+
+    subcommand_func_dict = {
+        "create-vendor-staging": lambda: create_vendor_staging(lockfile_path=Path(sys.argv[2]), out_dir=Path(sys.argv[3])),
+        "create-vendor": lambda: create_vendor(vendor_staging_dir=Path(sys.argv[2]), out_dir=Path(sys.argv[3]))
+    }
+
+    subcommand_func = subcommand_func_dict.get(subcommand)
+
+    if subcommand_func is None:
+        raise Exception(f"Unknown subcommand: '{subcommand}'. Must be one of {list(subcommand_func_dict.keys())}")
+
+    subcommand_func()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pkgs/build-support/rust/fetch-cargo-vendor.nix b/pkgs/build-support/rust/fetch-cargo-vendor.nix
index 5362491b260c3..a02bbd89c7fce 100644
--- a/pkgs/build-support/rust/fetch-cargo-vendor.nix
+++ b/pkgs/build-support/rust/fetch-cargo-vendor.nix
@@ -22,18 +22,29 @@ let
     ];
   } (builtins.readFile ./replace-workspace-values.py);
 
-  fetchCargoVendorUtil = writers.writePython3Bin "fetch-cargo-vendor-util" {
-    libraries =
-      with python3Packages;
-      [
-        requests
-        tomli-w
-      ]
-      ++ requests.optional-dependencies.socks; # to support socks proxy envs like ALL_PROXY in requests
-    flakeIgnore = [
-      "E501"
-    ];
-  } (builtins.readFile ./fetch-cargo-vendor-util.py);
+  mkFetchCargoVendorUtil =
+    name: src:
+    writers.writePython3Bin name {
+      libraries =
+        with python3Packages;
+        [
+          requests
+          tomli-w
+        ]
+        ++ requests.optional-dependencies.socks; # to support socks proxy envs like ALL_PROXY in requests
+      flakeIgnore = [
+        "E501"
+      ];
+    } (builtins.readFile src);
+
+  # Separate util used only by the FOD `vendorStaging` stage below. Kept
+  # distinct from fetchCargoVendorUtil so that changes to the network-facing
+  # bits (User-Agent, download URL) don't invalidate the input-addressed
+  # `-vendor` stage and force a mass rebuild of every Rust package in nixpkgs.
+  # vendorStaging is an FOD, so swapping its util is free for consumers.
+  # TODO: unify with fetchCargoVendorUtil on the next `staging` cycle.
+  fetchCargoVendorUtilV2 = mkFetchCargoVendorUtil "fetch-cargo-vendor-util-v2" ./fetch-cargo-vendor-util-v2.py;
+  fetchCargoVendorUtil = mkFetchCargoVendorUtil "fetch-cargo-vendor-util" ./fetch-cargo-vendor-util.py;
 in
 
 {
@@ -61,7 +72,7 @@ let
       impureEnvVars = lib.fetchers.proxyImpureEnvVars;
 
       nativeBuildInputs = [
-        fetchCargoVendorUtil
+        fetchCargoVendorUtilV2
         cacert
         (nix-prefetch-git.override {
           git = gitMinimal;
@@ -79,7 +90,7 @@ let
           cd "$cargoRoot"
         fi
 
-        fetch-cargo-vendor-util create-vendor-staging ./Cargo.lock "$out"
+        fetch-cargo-vendor-util-v2 create-vendor-staging ./Cargo.lock "$out"
 
         runHook postBuild
       '';