argumentcomputer · samuelburnham · Jun 19, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml
@@ -0,0 +1,82 @@
+name: Track benchmarks on bencher
+description: >-
+  Run `bencher run` for the `ix` project, windowing deterministic "stepping"
+  measures to commits since the last lean-toolchain bump (and dropping them on
+  the bump commit via --thresholds-reset, so a toolchain step lands without a
+  spurious alert). The calling job must check out with fetch-depth: 0.
+
+inputs:
+  testbed:
+    description: Bencher testbed name.
+    required: true
+  file:
+    description: Bencher Metric Format JSON file to upload.
+    required: true
+  key:
+    description: BENCHER_API_KEY (project-scoped `bencher_run_*` or user-scoped `bencher_user_*`).
+    required: true
+  github-token:
+    description: GITHUB_TOKEN, for the --github-actions comment.
+    required: true
+  always-thresholds:
+    description: >-
+      Raw `--threshold-measure …` flags for measures tracked on every commit
+      (non-stepping). Word-split, so keep each flag a single token.
+    required: false
+    default: ""
+  stepping-measures:
+    description: >-
+      Space-separated measure names that step on a toolchain bump (e.g.
+      "file-size constants" or "fft-cost"). Windowed to commits-since-bump and
+      omitted on the bump commit.
+    required: false
+    default: ""
+
+runs:
+  using: composite
+  steps:
+    - uses: bencherdev/bencher@v0.6.7
+    - shell: bash
+      env:
+        BENCHER_KEY: ${{ inputs.key }}
+        GH_TOKEN: ${{ inputs.github-token }}
+        TESTBED: ${{ inputs.testbed }}
+        FILE: ${{ inputs.file }}
+        ALWAYS_THRESHOLDS: ${{ inputs.always-thresholds }}
+        STEPPING_MEASURES: ${{ inputs.stepping-measures }}
+      run: |
+        # Sample only commits since lean-toolchain last changed, so a stepping
+        # measure's baseline window never straddles a toolchain bump.
+        last_bump=$(git log -1 --format=%H -- lean-toolchain)
+        sample=$(git rev-list --count "${last_bump}..HEAD")
+        [ "$sample" -gt 64 ] && sample=64
+        echo "commits since last lean-toolchain bump (capped 64): $sample"
+        # On the bump commit (sample=0) omit the stepping measures entirely;
+        # --thresholds-reset then clears their models so the step doesn't alert.
+        # Between bumps these measures are deterministic, so pin them exactly
+        # (0% on both sides): any change in either direction is a real circuit
+        # or closure change, not noise, and should alert.
+        step_thresholds=()
+        if [ "$sample" -gt 0 ]; then
+          for m in $STEPPING_MEASURES; do
+            step_thresholds+=(
+              --threshold-measure "$m"
+              --threshold-test percentage
+              --threshold-max-sample-size "$sample"
+              --threshold-upper-boundary 0
+              --threshold-lower-boundary 0
+            )
+          done
+        fi
+        bencher run \
+          --project ix \
+          --key "$BENCHER_KEY" \
+          --branch "$GITHUB_REF_NAME" \
+          --hash "$GITHUB_SHA" \
+          --testbed "$TESTBED" \
+          --adapter json \
+          --github-actions "$GH_TOKEN" \
+          $ALWAYS_THRESHOLDS \
+          "${step_thresholds[@]}" \
+          --thresholds-reset \
+          --file "$FILE"
diff --git a/.github/workflows/aiur-bench.yml b/.github/workflows/aiur-bench.yml
@@ -0,0 +1,259 @@
+name: Aiur benchmarks
+
+# One workflow, two benchmarks per library env, on every push to main:
+#   1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput
+#                    metrics) and cache the `.ixe`.
+#   2. prove job   — restore that `.ixe` from the cache (no recompile) and
+#                    STARK-check selected constants over it via bench-typecheck
+#                    (Aiur execute + prove metrics).
+# The prove job reuses the exact `.ixe` the compile job built, so the compiler
+# runs once. Compile and prove report to separate bencher testbeds so each one's
+# `--thresholds-reset` only touches its own measures.
+
+on:
+  push:
+    branches: main
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  checks: write
+
+env:
+  COMPILE_DIR: Benchmarks/Compile
+
+jobs:
+  # Build + stage the `ix` and `bench-typecheck` binaries once, then restore
+  # them on the (more expensive) matrix runners.
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      # Pinned Rust toolchain + cargo cache (~/.cargo + target/, via the action's
+      # built-in rust-cache), so `lake build`'s cargo step doesn't recompile the
+      # Plonky3/multi-stark deps from scratch on every run.
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+      # `.cargo/config.toml` sets `-Ctarget-cpu=native`, and the binary is built
+      # once here (then only restored on the warp runners), so this build host's
+      # CPU fixes the instruction set baked in — incl. AVX-512, a big Plonky3
+      # speedup. Log it so a benchmark shift can be traced to a build-CPU change.
+      - name: Log build CPU
+        run: |
+          lscpu
+          grep -qw avx512f /proc/cpuinfo \
+            && echo "AVX-512F: present (compiled into the binary)" \
+            || echo "AVX-512F: absent"
+      - uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: true
+          build-args: "ix --wfail -v"
+      - run: |
+          mkdir -p ~/.local/bin
+          echo | lake run install            # copies ix -> ~/.local/bin/ix
+          lake build bench-typecheck
+          cp .lake/build/bin/bench-typecheck ~/.local/bin/bench-typecheck
+          chmod +x ~/.local/bin/bench-typecheck
+      - uses: actions/cache/save@v5
+        with:
+          path: ~/.local/bin
+          key: aiur-bench-bins-${{ github.sha }}
+
+  # Compile each library env to a `.ixe` and track compile throughput. Caches
+  # the `.ixe` (keyed by sha + matrix job) for the prove job to consume.
+  compile:
+    needs: build
+    runs-on: warp-ubuntu-latest-x64-32x
+    strategy:
+      fail-fast: false
+      matrix:
+        bench: [InitStd, Lean, Mathlib, FLT] # Add FC if updated to latest Lean
+        include:
+          - bench: Mathlib
+            mathlib: true
+          - bench: FLT
+            cache_pkg: flt
+            mathlib: true
+          # - bench: FC
+          #   cache_pkg: formal_conjectures
+          #   mathlib: true
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: aiur-bench-bins-${{ github.sha }}
+      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+      # FC's library env lives in a sibling `${COMPILE_DIR}FC` package dir, so
+      # point COMPILE_DIR there for the FC matrix job.
+      # - if: matrix.bench == 'FC'
+      #   run: echo "COMPILE_DIR=${{ env.COMPILE_DIR }}FC" | tee -a $GITHUB_ENV
+      # Download elan; fetch the mathlib cache only for benches that import
+      # Mathlib (Mathlib, FLT) — InitStd/Lean would otherwise pull it for
+      # nothing, since the shared Benchmarks/Compile package depends on mathlib.
+      - uses: leanprover/lean-action@v1
+        with:
+          lake-package-directory: ${{ env.COMPILE_DIR }}
+          auto-config: false
+          use-github-cache: false
+          use-mathlib-cache: ${{ matrix.mathlib && 'true' || 'false' }}
+      # FLT and FC take a few minutes to rebuild, so cache their build artifacts.
+      - if: matrix.cache_pkg
+        uses: actions/cache@v5
+        with:
+          path: ${{ env.COMPILE_DIR }}/.lake/packages/${{ matrix.cache_pkg }}/.lake/build
+          key: ${{ matrix.cache_pkg }}-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles(format('{0}/lean-toolchain', env.COMPILE_DIR)) }}-${{ hashFiles(format('{0}/lake-manifest.json', env.COMPILE_DIR)) }}
+      # No `--wfail` here: formal-conjectures (FC) emits a copyright-notice
+      # warning that must not fail the build.
+      - run: lake build Compile${{ matrix.bench }}
+        working-directory: ${{ env.COMPILE_DIR }}
+      # Serialize the env to a `.ixe` and emit the `##benchmark##` line.
+      - name: Run ix compile
+        run: |
+          ix compile ${{ env.COMPILE_DIR }}/Compile${{ matrix.bench }}.lean \
+            --out ${{ matrix.bench }}.ixe 2>&1 | tee output.txt
+      # Cache the `.ixe` for the prove job (reused, never recompiled there).
+      # Only the matrix jobs the prove job consumes, to stay under the repo cache limit.
+      - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
+        uses: actions/cache/save@v5
+        with:
+          path: ${{ matrix.bench }}.ixe
+          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+      - name: Generate compile benchmark JSON
+        run: |
+          line=$(grep '^##benchmark##' output.txt)
+          elapsed_s=$(echo "$line" | awk '{printf "%.3f", $2 / 1000}')
+          bytes=$(echo "$line" | awk '{print $3}')
+          constants=$(echo "$line" | awk '{print $4}')
+          throughput=$(echo "$line" | awk '{if ($2 > 0) printf "%.2f", $4 * 1000 / $2; else print 0}')
+          cat > benchmark.json <<EOF
+          {
+            "${{ matrix.bench }}": {
+              "compile-time": {"value": ${elapsed_s}},
+              "file-size": {"value": ${bytes}},
+              "throughput": {"value": ${throughput}},
+              "constants": {"value": ${constants}}
+            }
+          }
+          EOF
+      # Upload compile metrics. file-size/constants step on a toolchain bump, so
+      # they're the bump-windowed stepping measures; compile-time/throughput ride
+      # their normal 64-window (throughput's regression is a drop, hence its
+      # lower boundary).
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: warp-ubuntu-x64-32x
+          file: benchmark.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          stepping-measures: file-size constants
+          always-thresholds: |
+            --threshold-measure compile-time --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary 0.05
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary _
+            --threshold-lower-boundary 0.05
+
+  # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove
+  # benchmark over selected constants. No compiler run here.
+  prove:
+    needs: compile
+    runs-on: warp-ubuntu-latest-x64-32x
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - bench: InitStd
+            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
+          - bench: Mathlib
+            consts: Nat.factorial Nat.Coprime Nat.Prime.two_le
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0   # full history for the toolchain-bump sample-size query
+      - uses: actions/cache/restore@v5
+        with:
+          path: ~/.local/bin
+          key: aiur-bench-bins-${{ github.sha }}
+      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
+      # Provision the toolchain so the bench-typecheck binary finds libleanshared
+      # (no package build). use-github-cache off: nothing to cache here, and
+      # parallel matrix jobs share a key and would race the cache save.
+      - uses: leanprover/lean-action@v1
+        with:
+          auto-config: false
+          build: false
+          use-github-cache: false
+      # Pull the `.ixe` the compile job built — do NOT recompile it here.
+      - uses: actions/cache/restore@v5
+        with:
+          path: ${{ matrix.bench }}.ixe
+          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
+          fail-on-cache-miss: true
+      # Run each constant in its own process so a clean failure or timeout drops
+      # only that constant from the report. NB: a constant heavy enough to OOM
+      # the runner host still cancels the whole job (an OOM SIGKILL of the host
+      # is uncatchable here), so every listed constant must fit in runner RAM.
+      # RAM is tracked via tracing-texray's machine-readable streaming lines
+      # (`[texray] <span> peak-rss-bytes=<N> (<X.YZ MiB>)`, emitted as each
+      # span closes): we parse the raw byte integer (awk's `$2+0` stops at
+      # the first non-digit, ignoring the parenthesized human suffix) and
+      # fold the max over spans in as `peak-rss` (bytes), the proving RSS
+      # high-water mark.
+      - name: Run Aiur typecheck benchmark
+        run: |
+          measure() {
+            local c="$1" rss
+            timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \
+              --json "res-$c.json" --texray 2>"tx-$c.log" \
+              || echo "warning: $c failed (OOM/timeout); dropping it from this report"
+            rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log")
+            if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then
+              jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \
+                "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true
+            fi
+          }
+          for c in ${{ matrix.consts }}; do measure "$c"; done
+          # Merge the per-constant results; if none produced anything, emit `{}`.
+          jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \
+            || echo '{}' > results.json
+          [ -s results.json ] || echo '{}' > results.json
+          # Wrap each metric value as { "value": v } for Bencher Metric Format.
+          # bench-typecheck already emits slug keys (constants, fft-cost,
+          # execute-time, prove-time, throughput = constants/prove-time); peak-rss
+          # is injected above.
+          jq '
+            map_values(to_entries | map({(.key): {value: .value}}) | add)
+          ' results.json > aiur.json
+          cat aiur.json
+      # Upload Aiur metrics. fft-cost and constants are deterministic and step on a
+      # toolchain bump (a different stdlib changes the constants' circuits and
+      # closure sizes), so they're the bump-windowed stepping measures — same
+      # treatment compile gives file-size/constants. prove-time/execute-time,
+      # peak-rss (texray's proving RSS high-water mark), and throughput
+      # (constants/prove-time, like compile's, where a drop is the regression)
+      # are noisy wall-clock and ride their normal 64-window.
+      - uses: ./.github/actions/bencher-track
+        with:
+          testbed: aiur-typecheck-x64-32x
+          file: aiur.json
+          key: ${{ secrets.BENCHER_API_KEY }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          stepping-measures: fft-cost constants
+          always-thresholds: |
+            --threshold-measure prove-time --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure execute-time --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure peak-rss --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-lower-boundary _
+            --threshold-measure throughput --threshold-test percentage
+            --threshold-max-sample-size 64 --threshold-upper-boundary _
+            --threshold-lower-boundary 0.10