diff --git a/.github/workflows/comment.yml b/.github/workflows/comment.yml
index 5ef2da52..c06d9726 100644
--- a/.github/workflows/comment.yml
+++ b/.github/workflows/comment.yml
@@ -9,7 +9,7 @@ permissions: {}
 jobs:
   upload-pr-comment:
     if: ${{ github.event.workflow_run.event == 'pull_request' }}
-    
+
     name: Upload PR comment
     runs-on: ubuntu-latest
     permissions:
@@ -17,33 +17,40 @@ jobs:
       pull-requests: write
 
     steps:
-      - name: List Annotations
+      - name: Download comparison artifacts
         uses: actions/github-script@v9
         with:
           script: |
+            let fs = require('fs');
             let artifacts = await github.rest.actions.listWorkflowRunArtifacts({
               owner: context.repo.owner,
               repo: context.repo.repo,
               run_id: ${{ github.event.workflow_run.id }},
             });
 
-            // List all artifacts
-            let matchArtifact = artifacts.data.artifacts.filter((artifact) => {
-              return artifact.name == "comment"
-            })[0];
-
-            // Download the artifact to github.workspace
-            let download = await github.rest.actions.downloadArtifact({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              artifact_id: matchArtifact.id,
-              archive_format: 'zip',
-            });
-
-            let fs = require('fs');
-            fs.writeFileSync('${{ github.workspace }}/comment.zip', Buffer.from(download.data));
+            for (let wanted of ["comment-gnu", "comment-bfs"]) {
+              let match = artifacts.data.artifacts.find((a) => a.name === wanted);
+              if (!match) {
+                console.log(`Artifact ${wanted} not found`);
+                continue;
+              }
+              let download = await github.rest.actions.downloadArtifact({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                artifact_id: match.id,
+                archive_format: 'zip',
+              });
+              fs.writeFileSync(`${{ github.workspace }}/${wanted}.zip`, Buffer.from(download.data));
+            }
 
-      - run: unzip comment.zip
+      - name: Extract artifacts
+        run: |
+          for a in comment-gnu comment-bfs; do
+            if test -f "$a.zip"; then
+              mkdir -p "$a"
+              unzip -o "$a.zip" -d "$a" || echo "Failed to unzip $a.zip"
+            fi
+          done
 
       - name: Comment on PR
         uses: actions/github-script@v9
@@ -51,26 +58,36 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           script: |
             let fs = require('fs');
-            let annotations = JSON.parse(fs.readFileSync('./annotations.json', 'utf8'));
 
-            let annotationContent = annotations
-              .data
-              .map(annotation => `${annotation.run}: ${annotation.annotation.message}`)
-              .join('\n');
+            function read(path) {
+              try { return fs.readFileSync(path, 'utf8'); } catch (e) { return ''; }
+            }
+
+            // NR is written by both jobs, but artifacts are PR-controlled: accept digits only.
+            let nr = read('comment-gnu/NR').trim() || read('comment-bfs/NR').trim();
+            if (!/^[1-9][0-9]*$/.test(nr)) {
+              console.log('No valid PR number found; skipping comment');
+              return;
+            }
+
+            let gnu = read('comment-gnu/result-gnu.txt').trim();
+            let bfs = read('comment-bfs/result-bfs.txt').trim();
 
-            // check if no changes
-            let gnuTestReport = annotationContent.includes('Run GNU findutils tests: Gnu tests No changes');
-            let bfsTestReport = annotationContent.includes('Run BFS tests: BFS tests No changes');
+            let sections = [];
+            if (gnu) sections.push('GNU findutils testsuite:\n```\n' + gnu + '\n```');
+            if (bfs) sections.push('bfs testsuite:\n```\n' + bfs + '\n```');
 
-            if (gnuTestReport && bfsTestReport) {
-              console.log('No changes');
+            if (sections.length === 0) {
+              console.log('No test result changes; skipping comment');
               return;
             }
 
-            // Comment on the PR
-            github.rest.issues.createComment({
+            let body = 'Commit ${{ github.event.workflow_run.head_sha }} has test result changes:\n\n'
+              + sections.join('\n\n');
+
+            await github.rest.issues.createComment({
               owner: context.repo.owner,
               repo: context.repo.repo,
-              issue_number: annotations.pull_request_number,
-              body: 'Commit ${{ github.event.workflow_run.head_sha }} has GNU testsuite comparison:\n```\n' + annotationContent + '\n```\n'
-            });
\ No newline at end of file
+              issue_number: Number(nr),
+              body: body,
+            });
diff --git a/.github/workflows/compat.yml b/.github/workflows/compat.yml
index c6b0e3e5..98485c85 100644
--- a/.github/workflows/compat.yml
+++ b/.github/workflows/compat.yml
@@ -3,6 +3,8 @@ on: [push, pull_request]
 name: External-testsuites
 env:
   CARGO_INCREMENTAL: "0"
+  # Default branch, used to fetch the reference (baseline) test results.
+  DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
 jobs:
   gnu-tests:
     permissions:
@@ -58,11 +60,9 @@ jobs:
             findutils-x86_64-unknown-linux-gnu.tar.zst
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Extract testing info
-        shell: bash
-        run: |
 
       - name: Upload gnu-test-report
+        if: success() || failure()
         uses: actions/upload-artifact@v7
         with:
           name: gnu-test-report
@@ -70,62 +70,60 @@ jobs:
             findutils.gnu/find/testsuite/*.log
             findutils.gnu/xargs/testsuite/*.log
             findutils.gnu/tests/**/*.log
-      - name: Upload gnu-result
+      # The per-test JSON summary doubles as the baseline: future runs on the
+      # default branch download this artifact to diff against.
+      - name: Upload gnu-full-result
+        if: success() || failure()
         uses: actions/upload-artifact@v7
         with:
-          name: gnu-result
-          path: gnu-result.json
-      - name: Download artifacts (gnu-result and gnu-test-report)
-        uses: actions/github-script@v9
-        with:
-          script: |
-            let fs = require('fs');
-            fs.mkdirSync('${{ github.workspace }}/dl', { recursive: true });
-
-            async function downloadArtifact(artifactName) {
-              // List all artifacts from the workflow run
-              let artifacts = await github.rest.actions.listWorkflowRunArtifacts({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                run_id: ${{ github.run_id }},
-              });
-
-              // Find the specified artifact
-              let matchArtifact = artifacts.data.artifacts.find((artifact) => artifact.name === artifactName);
-              if (!matchArtifact) {
-                throw new Error(`Artifact "${artifactName}" not found.`);
-              }
+          name: gnu-full-result
+          path: findutils-gnu-full-result.json
+          if-no-files-found: warn
 
-              // Download the artifact
-              let download = await github.rest.actions.downloadArtifact({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                artifact_id: matchArtifact.id,
-                archive_format: 'zip',
-              });
-
-              // Save the artifact to a file
-              fs.writeFileSync(`${{ github.workspace }}/dl/${artifactName}.zip`, Buffer.from(download.data));
-            }
-
-            // Download the required artifacts
-            await downloadArtifact("gnu-result");
-            await downloadArtifact("gnu-test-report");
-
-      - name: Compare failing tests against master
-        shell: bash
-        run: |
-          ./findutils/util/diff-gnu.sh ./dl ./findutils.gnu
-      - name: Compare against main results
+      - name: Retrieve reference results
+        uses: dawidd6/action-download-artifact@v21
+        continue-on-error: true
+        with:
+          workflow: compat.yml
+          branch: ${{ env.DEFAULT_BRANCH }}
+          workflow_conclusion: completed
+          name: gnu-full-result
+          path: reference-gnu
+          if_no_artifact_found: warn
+
+      - name: Compare against reference results
         shell: bash
         run: |
-          unzip dl/gnu-result.zip -d dl/
-          unzip dl/gnu-test-report.zip -d dl/
-          mv dl/gnu-result.json latest-gnu-result.json
-          python findutils/util/compare_gnu_result.py
+          mkdir -p comment
+          echo "${{ github.event.number }}" > comment/NR
+          REF="reference-gnu/findutils-gnu-full-result.json"
+          CUR="findutils-gnu-full-result.json"
+          : > comment/result-gnu.txt
+          if test ! -f "${CUR}"; then
+            echo "::error ::Missing current GNU results (${CUR}); failing early"
+            exit 1
+          fi
+          if test -f "${REF}"; then
+            python3 findutils/util/compare_test_results.py \
+              --ignore-file findutils/.github/workflows/ignore-intermittent.txt \
+              --output comment/result-gnu.txt \
+              "${CUR}" "${REF}"
+          else
+            echo "::warning ::No GNU reference results available yet; skipping comparison."
+          fi
+
+      - name: Upload GNU comparison comment
+        if: ${{ github.event_name == 'pull_request' && (success() || failure()) }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: comment-gnu
+          path: comment/
 
   bfs-tests:
     name: Run BFS tests
+    permissions:
+      actions: read
+      contents: read
     runs-on: ubuntu-latest
     steps:
       - name: Checkout findutils
@@ -152,110 +150,54 @@ jobs:
           export CARGO_INCREMENTAL=0
           bash util/build-bfs.sh ||:
       - name: Upload bfs-test-report
+        if: success() || failure()
         uses: actions/upload-artifact@v7
         with:
           name: bfs-test-report
           path: bfs/tests.log
-      - name: Upload bfs-result
+      - name: Upload bfs-full-result
+        if: success() || failure()
         uses: actions/upload-artifact@v7
         with:
-          name: bfs-result
-          path: bfs-result.json
-      - name: Download artifacts (gnu-result and bfs-test-report)
-        uses: actions/github-script@v9
-        with:
-          script: |
-            let fs = require('fs');
-            fs.mkdirSync('${{ github.workspace }}/dl', { recursive: true });
-
-            async function downloadArtifact(artifactName) {
-              // List all artifacts from the workflow run
-              let artifacts = await github.rest.actions.listWorkflowRunArtifacts({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                run_id: ${{ github.run_id }},
-              });
-
-              // Find the specified artifact
-              let matchArtifact = artifacts.data.artifacts.find((artifact) => artifact.name === artifactName);
-              if (!matchArtifact) {
-                throw new Error(`Artifact "${artifactName}" not found.`);
-              }
+          name: bfs-full-result
+          path: bfs-full-result.json
+          if-no-files-found: warn
 
-              // Download the artifact
-              let download = await github.rest.actions.downloadArtifact({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                artifact_id: matchArtifact.id,
-                archive_format: 'zip',
-              });
-
-              // Save the artifact to a file
-              fs.writeFileSync(`${{ github.workspace }}/dl/${artifactName}.zip`, Buffer.from(download.data));
-            }
-
-            // Download the required artifacts
-            await downloadArtifact("bfs-result");
-            await downloadArtifact("bfs-test-report");
-      - name: Compare failing tests against main
-        shell: bash
-        run: |
-          ./findutils/util/diff-bfs.sh dl/tests.log bfs/tests.log
-      - name: Compare against main results
+      - name: Retrieve reference results
+        uses: dawidd6/action-download-artifact@v21
+        continue-on-error: true
+        with:
+          workflow: compat.yml
+          branch: ${{ env.DEFAULT_BRANCH }}
+          workflow_conclusion: completed
+          name: bfs-full-result
+          path: reference-bfs
+          if_no_artifact_found: warn
+
+      - name: Compare against reference results
         shell: bash
         run: |
-          unzip dl/bfs-result.zip -d dl/
-          unzip dl/bfs-test-report.zip -d dl/
-          mv dl/bfs-result.json latest-bfs-result.json
-          python findutils/util/compare_bfs_result.py
-
-  upload-annotations:
-    name: Upload annotations
-    runs-on: ubuntu-latest
-    needs: [gnu-tests, bfs-tests]
-    if: ${{ github.event_name == 'pull_request' }}
-
-    steps:
-      - name: List Annotations
-        uses: actions/github-script@v9
-
-        with:
-          script: |
-            let runs = await github.rest.checks.listForRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: '${{ github.event.pull_request.head.sha }}'
-            });
-
-            let names = ['Run GNU findutils tests', 'Run BFS tests'];
-            let results = [];
-            runs.data.check_runs.filter(check => names.includes(check.name)).forEach(run => results.push(run));
-
-            let annotations = { data: [], pull_request_number: '${{ github.event.number }}' };
-            for (let result of results) {
-              let run = await github.rest.checks.listAnnotations({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                check_run_id: result.id
-              });
-
-              run.data.forEach(data => {
-                annotations.data.push({
-                  run: result.name,
-                  annotation: data
-                });
-              });
-            }
-
-            // Remove duplicate items.
-            annotations.data = annotations.data.filter((value, index, self) =>
-              self.findIndex(v => v.annotation.message === value.annotation.message) === index);
-
-            let fs = require('fs');
-            fs.writeFileSync('${{ github.workspace }}/annotations.json', JSON.stringify(annotations));
-
-      - name: Upload annotations
+          mkdir -p comment
+          echo "${{ github.event.number }}" > comment/NR
+          REF="reference-bfs/bfs-full-result.json"
+          CUR="bfs-full-result.json"
+          : > comment/result-bfs.txt
+          if test ! -f "${CUR}"; then
+            echo "::error ::Missing current bfs results (${CUR}); failing early"
+            exit 1
+          fi
+          if test -f "${REF}"; then
+            python3 findutils/util/compare_test_results.py \
+              --ignore-file findutils/.github/workflows/ignore-intermittent.txt \
+              --output comment/result-bfs.txt \
+              "${CUR}" "${REF}"
+          else
+            echo "::warning ::No bfs reference results available yet; skipping comparison."
+          fi
+
+      - name: Upload BFS comparison comment
+        if: ${{ github.event_name == 'pull_request' && (success() || failure()) }}
         uses: actions/upload-artifact@v7
         with:
-          name: comment
-          path: annotations.json
+          name: comment-bfs
+          path: comment/
diff --git a/.github/workflows/ignore-intermittent.txt b/.github/workflows/ignore-intermittent.txt
new file mode 100644
index 00000000..ea74f603
--- /dev/null
+++ b/.github/workflows/ignore-intermittent.txt
@@ -0,0 +1,16 @@
+# List of intermittent test names to ignore when comparing GNU/bfs results.
+# Format: one test name per line; blank lines and lines starting with # are ignored.
+#
+# A test listed here is still reported in the PR comment (marked
+# "intermittent"), but a *new* failure of it will not fail the CI job. Use this
+# for tests that are known to be flaky or environment-dependent, so genuine
+# regressions in other tests stay actionable.
+#
+# Names must match those in the *-full-result.json files:
+#   * GNU dejagnu find tests:  <name>.new-O[0-3]   (e.g. printf.new-O0)
+#   * GNU dejagnu xargs tests: <name>              (e.g. IARG, space)
+#   * GNU automake tests:      tests/<dir>/<name>  (e.g. tests/find/used)
+#   * bfs tests:               <dir>/<name>        (e.g. posix/HL)
+#
+# Example:
+# tests/find/some-flaky-test
diff --git a/util/bfs_json_result.py b/util/bfs_json_result.py
new file mode 100755
index 00000000..45735a3c
--- /dev/null
+++ b/util/bfs_json_result.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+"""
+Build a per-test JSON summary of a bfs testsuite run.
+
+bfs writes one line per test to tests.log:
+
+    [PASS] posix/H
+    [FAIL] common/newermt
+    [SKIP] gnu/...
+
+Output format matches util/gnu_json_result.py and is consumed by
+compare_test_results.py:
+
+    {
+      "summary": {"total": N, "passed": P, "failed": F, "skipped": S},
+      "tests": [{"name": "...", "status": "PASS|FAIL|SKIP"}, ...]
+    }
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+RESULT_RE = re.compile(r"^\[(PASS|FAIL|SKIP)\] (\S+)\s*$")
+
+
+def collect(log_file):
+    """Map each test name in the bfs log to its last reported status."""
+    path = Path(log_file)
+    if not path.is_file():
+        return {}
+    text = path.read_text(encoding="utf-8", errors="replace")
+    hits = filter(None, map(RESULT_RE.match, text.splitlines()))
+    # Later lines win, so a test logged twice keeps its final status.
+    return {hit.group(2): hit.group(1) for hit in hits}
+
+
+def build(log_file):
+    """Return the summary counts and the name-sorted per-test list for a bfs log."""
+    statuses = collect(log_file)
+    outcomes = list(statuses.values())
+
+    summary = {
+        "total": len(statuses),
+        "passed": outcomes.count("PASS"),
+        "failed": outcomes.count("FAIL"),
+        "skipped": outcomes.count("SKIP"),
+    }
+    # Emit the tests in name order.
+    entries = []
+    for name in sorted(statuses):
+        entries.append({"name": name, "status": statuses[name]})
+
+    return {"summary": summary, "tests": entries}
+
+
+def main():
+    """Write the JSON summary for argv[1] to argv[2]; return the exit status."""
+    if len(sys.argv) != 3:
+        print(f"usage: {sys.argv[0]} <bfs-tests.log> <output.json>", file=sys.stderr)
+        return 2
+    result = build(sys.argv[1])
+    payload = json.dumps(result, indent=2, sort_keys=True) + "\n"
+    Path(sys.argv[2]).write_text(payload, encoding="utf-8")
+    counts = result["summary"]
+    print(
+        f"bfs tests summary = TOTAL: {counts['total']} / "
+        f"PASS: {counts['passed']} / FAIL: {counts['failed']} / SKIP: {counts['skipped']}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/util/build-bfs.sh b/util/build-bfs.sh
index c394112e..8f38a928 100755
--- a/util/build-bfs.sh
+++ b/util/build-bfs.sh
@@ -2,6 +2,9 @@
 
 set -eo pipefail
 
+# Repository root (where util/ lives), captured before we cd into the bfs tree.
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
 if ! test -d ../bfs; then
     echo "Could not find ../bfs"
     echo "git clone https://github.com/tavianator/bfs.git"
@@ -24,30 +27,18 @@ fi
 LOG_FILE=tests.log
 ./tests/tests.sh --bfs="$FIND" "$@" 2>&1 | tee "$LOG_FILE" || :
 
-PASS=$(sed -En 's|^\[PASS] *([0-9]+) / .*|\1|p' "$LOG_FILE")
-SKIP=$(sed -En 's|^\[SKIP] *([0-9]+) / .*|\1|p' "$LOG_FILE")
-FAIL=$(sed -En 's|^\[FAIL] *([0-9]+) / .*|\1|p' "$LOG_FILE")
+# Build a per-test JSON summary (name + status for every test) used by
+# compare_test_results.py to detect per-test improvements/regressions.
+RESULT_JSON="${RESULT_JSON:-../bfs-full-result.json}"
+output="$(python3 "${REPO_DIR}/util/bfs_json_result.py" "$LOG_FILE" "${RESULT_JSON}")"
+echo "${output}"
 
-# Default any missing numbers to zero (e.g. no tests skipped)
-: ${PASS:=0}
-: ${SKIP:=0}
-: ${FAIL:=0}
+TOTAL=$(python3 -c "import json,sys;print(json.load(open(sys.argv[1]))['summary']['total'])" "${RESULT_JSON}")
+FAIL=$(python3 -c "import json,sys;print(json.load(open(sys.argv[1]))['summary']['failed'])" "${RESULT_JSON}")
 
-TOTAL=$((PASS + SKIP + FAIL))
 if (( TOTAL <= 1 )); then
     echo "Error in the execution, failing early"
     exit 1
 fi
 
-output="BFS tests summary = TOTAL: $TOTAL / PASS: $PASS / SKIP: $SKIP / FAIL: $FAIL"
-echo "${output}"
 if (( FAIL > 0 )); then echo "::warning ::${output}"; fi
-
-jq -n \
-   --arg date "$(date --rfc-email)" \
-   --arg sha "$GITHUB_SHA" \
-   --arg total "$TOTAL" \
-   --arg pass "$PASS" \
-   --arg skip "$SKIP" \
-   --arg fail "$FAIL" \
-   '{($date): { sha: $sha, total: $total, pass: $pass, skip: $skip, fail: $fail, }}' > ../bfs-result.json
diff --git a/util/build-gnu.sh b/util/build-gnu.sh
index 960f6907..945ff9b5 100755
--- a/util/build-gnu.sh
+++ b/util/build-gnu.sh
@@ -2,6 +2,9 @@
 
 set -e
 
+# Repository root (where util/ lives), captured before we cd into the GNU tree.
+REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
 if test ! -d ../findutils.gnu; then
     echo "Could not find ../findutils.gnu"
     echo "git clone https://git.savannah.gnu.org/git/findutils.git findutils.gnu"
@@ -36,51 +39,18 @@ make check-TESTS $RUN_TEST || :
 make -C find/testsuite check || :
 make -C xargs/testsuite check || :
 
-PASS=0
-SKIP=0
-FAIL=0
-XPASS=0
-ERROR=0
-
-LOG_FILE=./find/testsuite/find.log
-if test -f "$LOG_FILE"; then
-    ((PASS += $(sed -En 's/# of expected passes\s*//p' "$LOG_FILE"))) || :
-    ((FAIL += $(sed -En 's/# of unexpected failures\s*//p' "$LOG_FILE"))) || :
-fi
-
-LOG_FILE=./xargs/testsuite/xargs.log
-if test -f "$LOG_FILE"; then
-    ((PASS += $(sed -En 's/# of expected passes\s*//p' "$LOG_FILE"))) || :
-    ((FAIL += $(sed -En 's/# of unexpected failures\s*//p' "$LOG_FILE"))) || :
-fi
-
-((TOTAL = PASS + FAIL)) || :
+# Build a per-test JSON summary (name + status for every test) used by
+# compare_test_results.py to detect per-test improvements/regressions.
+RESULT_JSON="${RESULT_JSON:-../findutils-gnu-full-result.json}"
+output="$(python3 "${REPO_DIR}/util/gnu_json_result.py" . "${RESULT_JSON}")"
+echo "${output}"
 
-LOG_FILE=./tests/test-suite.log
-if test -f "$LOG_FILE"; then
-    ((TOTAL += $(sed -n "s/.*# TOTAL: \(.*\)/\1/p"  "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-    ((PASS += $(sed -n "s/.*# PASS: \(.*\)/\1/p" "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-    ((SKIP += $(sed -n "s/.*# SKIP: \(.*\)/\1/p" "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-    ((FAIL += $(sed -n "s/.*# FAIL: \(.*\)/\1/p" "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-    ((XPASS += $(sed -n "s/.*# XPASS: \(.*\)/\1/p" "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-    ((ERROR += $(sed -n "s/.*# ERROR: \(.*\)/\1/p" "$LOG_FILE" | tr -d '\r' | head -n1))) || :
-fi
+TOTAL=$(python3 -c "import json,sys;print(json.load(open(sys.argv[1]))['summary']['total'])" "${RESULT_JSON}")
+FAIL=$(python3 -c "import json,sys;print(json.load(open(sys.argv[1]))['summary']['failed'])" "${RESULT_JSON}")
 
 if ((TOTAL <= 1)); then
     echo "Error in the execution, failing early"
     exit 1
 fi
 
-output="GNU tests summary = TOTAL: $TOTAL / PASS: $PASS / FAIL: $FAIL / ERROR: $ERROR"
-echo "${output}"
-if [[ "$FAIL" -gt 0 || "$ERROR" -gt 0 ]]; then echo "::warning ::${output}" ; fi
-jq -n \
-   --arg date "$(date --rfc-email)" \
-   --arg sha "$GITHUB_SHA" \
-   --arg total "$TOTAL" \
-   --arg pass "$PASS" \
-   --arg skip "$SKIP" \
-   --arg fail "$FAIL" \
-   --arg xpass "$XPASS" \
-   --arg error "$ERROR" \
-   '{($date): { sha: $sha, total: $total, pass: $pass, skip: $skip, fail: $fail, xpass: $xpass, error: $error, }}' > ../gnu-result.json
+if [[ "$FAIL" -gt 0 ]]; then echo "::warning ::${output}"; fi
diff --git a/util/compare_bfs_result.py b/util/compare_bfs_result.py
deleted file mode 100644
index b96026bd..00000000
--- a/util/compare_bfs_result.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/python
-"""
-Compare the current results to the last results gathered from the main branch to highlight
-if a PR is making the results better/worse
-"""
-
-import json
-import sys
-
-NEW = json.load(open("bfs-result.json"))
-OLD = json.load(open("latest-bfs-result.json"))
-
-# Extract the specific results from the dicts
-[last] = OLD.values()
-[current] = NEW.values()
-
-pass_d = int(current["pass"]) - int(last["pass"])
-skip_d = int(current["skip"]) - int(last.get("skip", 0))
-fail_d = int(current["fail"]) - int(last["fail"])
-
-# Get an annotation to highlight changes
-print(f"::warning ::Changes from main: PASS {pass_d:+d} / SKIP {skip_d:+d} / FAIL {fail_d:+d}")
-
-# Check if there are no changes.
-if pass_d == 0:
-    print("::warning ::BFS tests No changes")
-
-# If results are worse fail the job to draw attention
-if pass_d < 0:
-    sys.exit(1)
diff --git a/util/compare_gnu_result.py b/util/compare_gnu_result.py
deleted file mode 100644
index 7e5e251e..00000000
--- a/util/compare_gnu_result.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/python
-"""
-Compare the current results to the last results gathered from the main branch to highlight
-if a PR is making the results better/worse
-"""
-
-import json
-import sys
-
-NEW = json.load(open("gnu-result.json"))
-OLD = json.load(open("latest-gnu-result.json"))
-
-# Extract the specific results from the dicts
-last = OLD[list(OLD.keys())[0]]
-current = NEW[list(NEW.keys())[0]]
-
-pass_d = int(current["pass"]) - int(last["pass"])
-fail_d = int(current["fail"]) - int(last["fail"])
-error_d = int(current["error"]) - int(last["error"])
-skip_d = int(current["skip"]) - int(last["skip"])
-
-# Get an annotation to highlight changes
-print(
-    f"::warning ::Changes from main: PASS {pass_d:+d} / FAIL {fail_d:+d} / ERROR {error_d:+d} / SKIP {skip_d:+d} "
-)
-
-# Check if there are no changes.
-if pass_d == 0:
-    print("::warning ::Gnu tests No changes")
-
-# If results are worse fail the job to draw attention
-if pass_d < 0:
-    sys.exit(1)
diff --git a/util/compare_test_results.py b/util/compare_test_results.py
new file mode 100755
index 00000000..bb4be960
--- /dev/null
+++ b/util/compare_test_results.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+"""
+Compare the current GNU or bfs test results to the reference results gathered
+from the main branch, to highlight whether a PR makes the results better or worse.
+
+Writes a human-readable comparison to --output (empty when nothing changed, so
+the comment workflow can decide to stay silent). Exits 1 when there are new,
+non-intermittent failures; intermittent (flaky) tests listed in --ignore-file
+are reported but never fail the job.
+
+Adapted from the uutils sed/grep workflow.
+"""
+
+import json
+import sys
+import argparse
+from pathlib import Path
+
+
+def load_ignore_list(ignore_file):
+    """Return the set of test names listed in ignore_file (read as UTF-8).
+
+    Blank lines and "#" comments are skipped; a missing file gives an empty set.
+    """
+    if not ignore_file or not Path(ignore_file).exists():
+        return set()
+    with open(ignore_file, "r", encoding="utf-8") as f:
+        names = (line.strip() for line in f)
+        return {name for name in names if name and not name.startswith("#")}
+
+
+def extract_test_results(json_data):
+    """Split parsed result JSON into (summary, names of tests with status FAIL)."""
+    if not json_data or "summary" not in json_data:
+        return {"total": 0, "passed": 0, "failed": 0, "skipped": 0}, []
+
+    summary = json_data["summary"]
+    failing = []
+    for entry in json_data.get("tests", []):
+        # A failing entry without a name is recorded as "unknown".
+        if entry.get("status") == "FAIL":
+            failing.append(entry.get("name", "unknown"))
+    return summary, failing
+
+
+def compare_results(current_file, reference_file, ignore_file=None, output_file=None):
+    """Compare current results with reference results.
+
+    The report goes to output_file when given (left empty if nothing changed),
+    otherwise to stdout. Returns 1 when a results file cannot be loaded or when
+    there are new failures that are not listed in ignore_file, and 0 otherwise.
+    """
+    ignore_set = load_ignore_list(ignore_file)
+
+    try:
+        with open(current_file, "r", encoding="utf-8") as f:
+            current_data = json.load(f)
+        current_summary, current_failed = extract_test_results(current_data)
+        tests = current_data.get("tests", []) if isinstance(current_data, dict) else []
+        current_passed_set = {t.get("name") for t in tests if t.get("status") == "PASS"}
+    except Exception as e:
+        print(f"Error loading current results: {e}")
+        return 1
+
+    try:
+        with open(reference_file, "r", encoding="utf-8") as f:
+            reference_summary, reference_failed = extract_test_results(json.load(f))
+    except Exception as e:
+        print(f"Error loading reference results: {e}")
+        return 1
+
+    def delta(key):
+        return int(current_summary.get(key, 0)) - int(reference_summary.get(key, 0))
+
+    pass_diff, fail_diff, total_diff = delta("passed"), delta("failed"), delta("total")
+
+    new_failures = set(current_failed) - set(reference_failed)
+    # Only tests that actually pass now are improvements: a formerly failing test
+    # that is skipped or missing from the current run has not been fixed.
+    improvements = set(reference_failed) & current_passed_set
+
+    non_intermittent_new_failures = new_failures - ignore_set
+
+    no_changes = (
+        pass_diff == 0
+        and fail_diff == 0
+        and total_diff == 0
+        and not new_failures
+        and not improvements
+    )
+
+    # Empty output tells the comment workflow there is nothing to post.
+    if no_changes:
+        if output_file:
+            with open(output_file, "w", encoding="utf-8") as f:
+                f.write("")
+        return 0
+
+    output_lines = []
+    output_lines.append("Test results comparison:")
+    output_lines.append(
+        f"  Current:   TOTAL: {current_summary.get('total', 0)} / PASSED: {current_summary.get('passed', 0)} / FAILED: {current_summary.get('failed', 0)} / SKIPPED: {current_summary.get('skipped', 0)}"
+    )
+    output_lines.append(
+        f"  Reference: TOTAL: {reference_summary.get('total', 0)} / PASSED: {reference_summary.get('passed', 0)} / FAILED: {reference_summary.get('failed', 0)} / SKIPPED: {reference_summary.get('skipped', 0)}"
+    )
+    output_lines.append("")
+
+    if pass_diff != 0 or fail_diff != 0 or total_diff != 0:
+        output_lines.append("Changes from main branch:")
+        output_lines.append(f"  TOTAL: {total_diff:+d}")
+        output_lines.append(f"  PASSED: {pass_diff:+d}")
+        output_lines.append(f"  FAILED: {fail_diff:+d}")
+        output_lines.append("")
+
+    if new_failures:
+        # Only non-intermittent failures fail the job, but list them all.
+        flaky = sorted(new_failures & ignore_set)
+        output_lines.append(f"New test failures ({len(new_failures)}):")
+        for test in sorted(non_intermittent_new_failures):
+            output_lines.append(f"  - {test}")
+        for test in flaky:
+            output_lines.append(f"  - {test} (intermittent)")
+        output_lines.append("")
+
+    if improvements:
+        output_lines.append(f"Test improvements ({len(improvements)}):")
+        output_lines.extend(f"  + {test}" for test in sorted(improvements))
+        output_lines.append("")
+
+    output_text = "\n".join(output_lines)
+    if output_file:
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(output_text)
+    else:
+        print(output_text)
+
+    if non_intermittent_new_failures:
+        print(
+            f"ERROR: Found {len(non_intermittent_new_failures)} new non-intermittent test failures"
+        )
+        return 1
+
+    return 0
+
+
+def main():
+    """Parse the command line and return compare_results()'s exit status."""
+    cli = argparse.ArgumentParser(description="Compare GNU test results")
+    cli.add_argument("current", help="Current test results JSON file")
+    cli.add_argument("reference", help="Reference test results JSON file")
+    ignore_help = "File containing intermittent test names to ignore"
+    cli.add_argument("--ignore-file", help=ignore_help)
+    cli.add_argument("--output", help="Output file for comparison results")
+
+    opts = cli.parse_args()
+    return compare_results(opts.current, opts.reference, opts.ignore_file, opts.output)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/util/diff-bfs.sh b/util/diff-bfs.sh
deleted file mode 100755
index 4d821b91..00000000
--- a/util/diff-bfs.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-export LC_COLLATE=C
-
-# Extract the failing test lines from log files
-failing_tests() {
-    sed -En 's/^\[FAIL\] (.*[a-z].*)/\1/p' "$1" | sort
-}
-
-comm -3 <(failing_tests "$1") <(failing_tests "$2") | tr '\t' ',' | while IFS=, read old new; do
-    if [ -n "$old" ]; then
-        echo "::warning ::Congrats! The bfs test $old is now passing!"
-    fi
-    if [ -n "$new" ]; then
-        echo "::error ::bfs test failed: $new. $new is passing on 'main'. Maybe you have to rebase?"
-    fi
-done
diff --git a/util/diff-gnu.sh b/util/diff-gnu.sh
deleted file mode 100755
index f29f358d..00000000
--- a/util/diff-gnu.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-export LC_COLLATE=C
-
-# Extract the failing test lines from log files
-failing_tests() {
-    sed -En 's/FAIL: ([^,:]*)[,:].*/\1/p' "$1"/{tests,{find,xargs}/testsuite}/*.log | sort
-}
-
-comm -3 <(failing_tests "$1") <(failing_tests "$2") | tr '\t' ',' | while IFS=, read old new foo; do
-    if [ -n "$old" ]; then
-        echo "::warning ::Congrats! The GNU test $old is now passing!"
-    fi
-    if [ -n "$new" ]; then
-        echo "::error ::GNU test failed: $new. $new is passing on 'main'. Maybe you have to rebase?"
-    fi
-done
diff --git a/util/gnu_json_result.py b/util/gnu_json_result.py
new file mode 100755
index 00000000..4029caeb
--- /dev/null
+++ b/util/gnu_json_result.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+"""
+Build a per-test JSON summary of a GNU findutils `make check` run.
+
+GNU findutils runs tests through two harnesses that each log results
+differently:
+
+  * dejagnu  -> find/testsuite/find.log and xargs/testsuite/xargs.log
+                ("PASS: <name>" / "FAIL: <name>, <reason>" lines)
+  * automake -> tests/**/*.log, one log per test script, each ending with a
+                "<STATUS> tests/<path>.sh (exit status: N)" line
+
+The names are naturally disjoint (dejagnu find names end in ".new-O[0-3]",
+automake names start with "tests/"), so they can share one flat namespace.
+
+Output format (consumed by compare_test_results.py):
+
+    {
+      "summary": {"total": N, "passed": P, "failed": F, "skipped": S},
+      "tests": [{"name": "...", "status": "PASS|FAIL|SKIP"}, ...]
+    }
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+# dejagnu status -> normalized status; XPASS, ERROR and UNRESOLVED count as FAIL.
+DEJAGNU = dict(
+    PASS="PASS",
+    XFAIL="PASS",  # expected failure: not a regression
+    FAIL="FAIL",
+    XPASS="FAIL",  # unexpected pass: worth surfacing
+    ERROR="FAIL",
+    UNRESOLVED="FAIL",
+    UNSUPPORTED="SKIP",
+    UNTESTED="SKIP",
+)
+
+# dejagnu result line: "<STATUS>: <name>", optionally followed by ", <reason>"
+DEJAGNU_RE = re.compile(r"^(PASS|XFAIL|FAIL|XPASS|ERROR|UNRESOLVED|UNSUPPORTED|UNTESTED): (.+)$")
+# automake per-test trailer, e.g. "FAIL tests/find/used.sh (exit status: 1)"
+AUTOMAKE_RE = re.compile(r"^(PASS|FAIL|SKIP|XPASS|XFAIL|ERROR) (tests/\S+?)(?:\.sh)? \(exit status: \d+\)$")
+AUTOMAKE = dict(
+    PASS="PASS",
+    XFAIL="PASS",
+    FAIL="FAIL",
+    XPASS="FAIL",
+    ERROR="FAIL",
+    SKIP="SKIP",
+)
+
+
+def _read(path):  # -> list of text lines; undecodable bytes become U+FFFD
+    return path.read_text("utf-8", "replace").splitlines()
+
+
+def _record(tests, name, status):
+    """Fold one more observed `status` for `name` into the `tests` mapping.
+
+    A name may be reported several times (DejaGnu logs one line per
+    assertion). The stored value is the worst outcome seen so far, ranked
+    FAIL > PASS > SKIP; anything that is neither FAIL nor PASS is kept as SKIP.
+    """
+    seen = (tests.get(name), status)
+    # Walk the ranks from worst to best; the first one present wins.
+    for outcome in ("FAIL", "PASS"):
+        if outcome in seen:
+            tests[name] = outcome
+            return
+    tests[name] = "SKIP"
+
+
+def collect(root):
+    """Return {name: "PASS"|"FAIL"|"SKIP"} for every test logged under `root`."""
+    root = Path(root)
+    tests = {}
+
+    # dejagnu logs
+    for rel in ("find/testsuite/find.log", "xargs/testsuite/xargs.log"):
+        log = root / rel
+        if not log.is_file():
+            continue
+        for line in _read(log):
+            m = DEJAGNU_RE.match(line)
+            if m:
+                # FAIL lines carry a trailing ", <reason>"; the name is the head.
+                name = m.group(2).split(",", 1)[0].strip()
+                _record(tests, name, DEJAGNU[m.group(1)])
+
+    # automake per-test logs (skip the aggregate test-suite.log). Each log ends
+    # with its trailer, so scan backwards: a trailer-shaped line printed by the
+    # test itself must not shadow the real result.
+    for log in (root / "tests").rglob("*.log"):
+        if log.name == "test-suite.log":
+            continue
+        for line in reversed(_read(log)):
+            m = AUTOMAKE_RE.match(line)
+            if m:
+                _record(tests, m.group(2), AUTOMAKE[m.group(1)])
+                break
+
+    return tests
+
+
+def build(root):
+    """Collect results under `root` and shape them into the summary/tests report."""
+    results = collect(root)
+    tally = {"PASS": 0, "FAIL": 0, "SKIP": 0}
+    # Count each normalized status in one pass.
+    for status in results.values():
+        if status in tally:
+            tally[status] += 1
+    summary = {
+        "total": len(results),
+        "passed": tally["PASS"],
+        "failed": tally["FAIL"],
+        "skipped": tally["SKIP"],
+    }
+    # Listed in name order so the report is stable between runs.
+    listing = [{"name": name, "status": results[name]} for name in sorted(results)]
+    return {"summary": summary, "tests": listing}
+
+
+def main():
+    """Write the JSON report for argv[1] to argv[2] and print a one-line recap."""
+    if len(sys.argv) != 3:
+        print(f"usage: {sys.argv[0]} <gnu-source-dir> <output.json>", file=sys.stderr)
+        return 2
+    _, source_dir, json_path = sys.argv
+    report = build(source_dir)
+    with open(json_path, "w", encoding="utf-8") as out:
+        json.dump(report, out, indent=2, sort_keys=True)
+        out.write("\n")
+    # Echo the counts on stdout; the template fields are the summary's keys.
+    recap = "GNU findutils tests summary = TOTAL: {total} / PASS: {passed} / FAIL: {failed} / SKIP: {skipped}"
+    print(recap.format(**report["summary"]))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # 0 on success, 2 on a usage error (see main)