Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
c2a4603
docs: index-root fusion design note (feat/fuse-index-roots)
whilo May 30, 2026
3804164
feat(writing): index-root fusion — inline index roots into the db-record
whilo May 30, 2026
c92ddb2
Integrate PSS OP_BUF_V5 write-buffering (JVM, opt-in via pss.opBufSize)
whilo May 31, 2026
7597925
Fix with-comparator to pass through non-CachedStorage (mem backend)
whilo May 31, 2026
ce362fa
Make op-buf-size and branching-factor configurable via :index-config
whilo May 31, 2026
5aee7b8
Connect-reconcile for create-time-fixed index settings; keep fusion o…
whilo May 31, 2026
d15f1ce
Make merkle-audit and online GC fusion-aware; allow fusion under cryp…
whilo May 31, 2026
b4521e8
Make crypto-hash sound under op-buf (fold slots into the branch address)
whilo May 31, 2026
b01bf9b
cljs op-buf-v5 parity: reconstruct/emit :slots, thread comparator, ex…
whilo May 31, 2026
f5e234f
test: cljs op-buf $remove roundtrip (retract evens, cold-reopen odds …
whilo May 31, 2026
1556c6d
test: cljs op-buf $replace roundtrip (cardinality-one update → upsert…
whilo May 31, 2026
5118afe
test: cljs op-buf generative soundness (random churn + cold reopens v…
whilo May 31, 2026
8c05ef1
cljs merkle audit: cross-platform branch-crypto-uuid/canon/walk + -re…
whilo May 31, 2026
7d99c98
cross-host JVM->cljs: konserve dev local-root + exchange/fress-probe …
whilo May 31, 2026
fba56ac
fix(cljs): datahike.audit/verify-chain now cljs-compiles (require cor…
whilo May 31, 2026
9c0f2b1
diff-buf: rename op-buf → diff-buf (config key :diff-buf-size, fn dif…
whilo May 31, 2026
5a5306b
diff-buf: default ON (256) for new datahike stores
whilo May 31, 2026
0d26847
Rename DIFF_BUF_V5 comment tags to diff-buf
whilo May 31, 2026
997d4cf
diff-buf: drop storage-carried comparator; pin deps to release/git
whilo Jun 1, 2026
5aaac83
diff-buf: default OFF for in-memory backend (no PUTs to fold)
whilo Jun 2, 2026
a108963
test: seeded end-to-end generative model test for diff-buf (#2)
whilo Jun 2, 2026
552fea1
deps: bump PSS to 2063823 (anchorless-skip + count-drift fix + stress…
whilo Jun 2, 2026
5995e08
build: fix javadoc (b/javadoc doesn't exist) — unblocks git-dep prep
whilo Jun 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 29 additions & 12 deletions build.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
(ns build
(:refer-clojure :exclude [compile])
(:require [clojure.edn :as edn]
[clojure.java.io :as io]
[clojure.string :as str]
[clojure.tools.build.api :as b]))

(def class-dir "target/classes")
Expand All @@ -15,16 +17,31 @@
"-Xlint:deprecation"]}))

(defn javadoc
  "Generate Javadoc for the Java API into target/javadoc.

  tools.build has no javadoc wrapper (there is no `b/javadoc`), so this shells out to the
  JDK `javadoc` tool via b/process, passing the project classpath so the Java API's imports
  (clojure.lang.*, generated classes) resolve. Output is included in the jar at release.

  Any previous target/javadoc directory is deleted first, so the success check (presence
  of index.html) reflects this run rather than stale output from an earlier one.

  Throws ex-info (with the javadoc exit code in its data) when no index.html was produced.
  A non-zero exit with output present is only reported, not treated as a failure."
  [_]
  (let [out "target/javadoc"
        cp (str/join java.io.File/pathSeparator (:classpath-roots basis))
        ;; Every regular *.java file below java/src, as path strings for the command line.
        srcs (->> (io/file "java/src")
                  file-seq
                  (filter #(and (.isFile ^java.io.File %)
                                (str/ends-with? (.getName ^java.io.File %) ".java")))
                  (mapv #(.getPath ^java.io.File %)))
        args (into ["javadoc" "-d" out "-classpath" cp
                    "-public" "-Xdoclint:none"
                    "-windowtitle" "Datahike Java API"
                    "-doctitle" "Datahike Java API Documentation"
                    "-link" "https://docs.oracle.com/javase/8/docs/api/"
                    "-link" "https://clojure.github.io/clojure/"]
                   srcs)
        ;; Start from a clean output directory: without this, an index.html left behind by
        ;; a previous run would make a failed run look successful in the check below.
        _ (b/delete {:path out})
        {:keys [exit]} (b/process {:command-args args})]
    ;; A non-zero javadoc exit is tolerated as long as the HTML was produced (the Java API
    ;; has undocumented elements). Only treat it as a failure if no output appeared.
    (when-not (.exists (io/file out "index.html"))
      (throw (ex-info "javadoc produced no output" {:exit exit})))
    (when-not (zero? exit)
      (println "Note: javadoc exited" exit "(warnings above); docs generated regardless."))
    (println "Javadoc generated in" out)
    (println "Javadoc will be automatically published to javadoc.io when released to Clojars")))
5 changes: 3 additions & 2 deletions deps.edn
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
{:deps {org.clojure/clojure {:mvn/version "1.12.4"}
org.replikativ/hasch {:mvn/version "0.4.98"
:exclusions [org.clojure/clojurescript]}
org.replikativ/konserve {:mvn/version "0.9.346"
org.replikativ/konserve {:mvn/version "0.9.349" ;; includes cljs header meta-size cross-host fix (#143)
:exclusions [org.clojure/clojurescript
org.clojars.mmb90/cljs-cache]}

org.replikativ/superv.async {:mvn/version "0.3.50"
:exclusions [org.clojure/clojurescript]}
org.replikativ/datalog-parser {:mvn/version "0.2.37"}
org.replikativ/persistent-sorted-set {:mvn/version "0.4.122"}
org.replikativ/persistent-sorted-set {:git/url "https://github.com/replikativ/persistent-sorted-set.git"
:git/sha "2063823a6fa78dcda5570906d9e7509b0394ba68"} ;; diff-buf (feature/op-buf-v5); run `clojure -X:deps prep` to compile its Java
environ/environ {:mvn/version "1.2.0"}
nrepl/bencode {:mvn/version "1.2.0"}
org.replikativ/logging {:mvn/version "0.1.3"}
Expand Down
95 changes: 95 additions & 0 deletions doc/index-root-fusion.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Index-root fusion (reduce write amplification)

*Branch: `feat/fuse-index-roots`. Status: design, pre-implementation.*

## Problem

A datahike commit writes `(count pending-writes)` index-node objects + 2
db-records (under the commit-id and under the branch). Measured ~7 PUTs/commit
for small commits. The index-node objects include each index's **root**, which
changes essentially every commit. On per-request object storage this
amplification is the dominant cost (see saas `doc/cost-model.md`).

## How the write path works today

- `db->stored` (`writing.cljc`) calls `di/-flush` on each index → `psset/store`
walks dirty nodes and calls `CachedStorage.store` per node, which **appends
`[address node]` to `pending-writes`** and returns the (content- or squuid-)
address. The root's address becomes `pset._address`.
- The stored-db map references each index as a small record. The PSS konserve
write-handler serializes a `PersistentSortedSet` to `{:meta, :address,
:count}`; the **root node lives separately** at `:address`. Read-handler:
`(PersistentSortedSet. meta cmp address @storage nil count settings 0)` — the
5th arg (currently `nil`) is the in-memory `_root`.
- `commit!` drains `pending-writes` (`k/assoc store address node`, one PUT
each), then writes the db-record under `cid` and under `branch`.

## The fusion seam

Inline each index's **root node** into its db-record reference
(`{:meta, :address, :count, :root <root-node>}`) and **drop the root from
`pending-writes`** so it isn't PUT separately. Restore passes the inlined node
as the constructor's 5th arg instead of `nil` — deeper children stay lazy.

Win profile (sharper than "−3 PUTs"):
- **Index = single leaf root (tiny tenant):** the *whole* index inlines → zero
separate node PUTs for it. A few-datom tenant's commit collapses to ~2
record PUTs.
- **Deeper tree:** saves exactly **1 PUT per index** (the root); the dirty
leaf/intermediate path is still separate — that part is op-buf's job (op-buf
has since been renamed diff-buf), later.
Also **−1 GET per index on cold open** (root arrives with the record).

## Options

- **A — explicit fused index-ref in `db->stored`/`stored->db`** *(recommended)*.
Build `{:meta :address :count :root <node>}`, remove the root from
`pending-writes`, reconstruct via the root-seeding constructor. Contained to
`writing.cljc` + a small helper. Opt-in via config `:fuse-index-roots?` so
it's measurable against baseline.
- **B — embed root in the PSS konserve write/read handler.** More automatic but
the handler would need storage access at serialize time + a way to skip the
separate write. Couples handler to pending state. Messier.
- **C — fusion + branch-as-pointer.** On top of A: write the fused object once
under `cid`, a tiny `{:head cid}` under `branch`. Halves per-commit record
bytes; costs a 2nd GET on branch-open. Optional follow-on.
- **D — inline the whole dirty path (op-buf / mini-WAL in the record).** The
deeper convergence; this is the PSS op-buf work (since renamed diff-buf),
explicitly *after* A.

## Implementation plan (Option A)

Touchpoints, all in datahike (PSS untouched):

1. **Config:** add `:fuse-index-roots?` (default false).
2. **`db->stored`:** when enabled, for each flushed index pull its root node
(from `CachedStorage` cache at `pset._address`) and emit a fused ref; record
the root address so it can be excluded from the drain.
3. **`commit!` drain:** filter the fused root addresses out of `pending-writes`
before `k/assoc`-ing the rest. (We have `pset._address` per index.)
4. **`stored->db`:** detect the fused ref and reconstruct the index with the
inlined root node seeded into `_root` (constructor 5th arg) + `_address` +
storage for lazy children.
5. **Serialization:** the inlined root is a `Leaf`/`Branch` — already has
konserve read/write handlers, so it nests in the record map for free.

## Caveats to resolve

1. **crypto-hash audit** (`index/persistent_set.cljc` `walk-pss-address!`)
starts at the root *address* via `k/get` — with the root inlined there's no
konserve object there. v1: gate fusion on `:crypto-hash? false`, or teach the
walk to take the root from the record. (The merkle `:address` is still
computable from the inlined node, so audit *can* be made to work.)
2. **GC / `mark`:** the fused root has no konserve object; the reachability/free
path must not expect one at that address (don't add it to the konserve-key
reachable set; its children's addresses still are).
3. **`pending-writes` skip must be exact:** only the per-index *root* address is
removed; every deeper dirty node stays. Identify by `pset._address`.
4. **Backwards compat:** a fused db-record must be distinguishable from a legacy
one on read (presence of `:root`), so old stores still restore.

## Validation

- Roundtrip: write → restore → `(= (vec before) (vec after))`, counts, slices,
history (`as-of`) — at `:fuse-index-roots? true` and `false`.
- Measure with the saas `commit-cost` probe: PUTs/commit and cold-open GETs,
baseline vs fused, across tiny (single-leaf) and deeper trees.
- Full datahike test suite green with the flag off (byte-identical) and on.
7 changes: 6 additions & 1 deletion src/datahike/audit.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
[konserve.core :as k]
[superv.async #?(:clj :refer :cljs :refer-macros) [go-try- <?-]]
[konserve.utils :refer [#?(:clj async+sync) *default-sync-translation*]
#?@(:cljs [:refer-macros [async+sync]])]))
#?@(:cljs [:refer-macros [async+sync]])]
;; cljs: superv.async/go-try- expands to clojure.core.async/go, so the `go`
;; MACRO must be required here or it falls back to the JVM macro and fails to
;; compile (vary-meta on keyword in go-impl). Mirrors datahike.versioning.
#?(:cljs [clojure.core.async :refer [<!]]))
#?(:cljs (:require-macros [clojure.core.async :refer [go]])))

(defn- audit-grade-stored?
"Mirror of writing/audit-grade? but for already-stored commits we're
Expand Down
36 changes: 33 additions & 3 deletions src/datahike/config.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@
(def ^:dynamic *default-search-cache-size* 10000)
(def ^:dynamic *default-store-cache-size* 1000)
(def ^:dynamic *default-crypto-hash?* false)
;; Index-root fusion: when true, each index's root node is inlined into the db-record
;; instead of being stored as a separate konserve object — one fewer PUT and one fewer
;; cold GET per index per commit. See doc/index-root-fusion.md.
;; Safe to enable: the merkle-audit walk and online GC are fusion-aware (they verify/seed
;; the inlined root from the db-record instead of fetching it as a separate object).
;; Kept OFF as the global default for now only because flipping it churns count-based
;; tests across the suite; opt in per store (the SaaS template does) — connect adopts the
;; stored value (datahike.connector/adopt-stored-fixed) so fused and non-fused stores
;; both reconnect.
;; TODO: flip to true once the suite's object-count assertions are updated for fusion.
(def ^:dynamic *default-fuse-index-roots?* false)
(def ^:dynamic *default-store* :memory) ;; store-less = in-memory?
(def ^:dynamic *default-db-name* nil) ;; when nil creates random name
(def ^:dynamic *default-db-branch* :db) ;; when nil creates random name
Expand All @@ -34,6 +45,7 @@
(s/def ::search-cache-size nat-int?)
(s/def ::store-cache-size pos-int?)
(s/def ::crypto-hash? boolean?)
(s/def ::fuse-index-roots? boolean?)
(s/def ::writer map?)
(s/def ::branch keyword?)
(s/def ::entity (s/or :map associative? :vec vector?))
Expand All @@ -54,6 +66,7 @@
::search-cache-size
::store-cache-size
::crypto-hash?
::fuse-index-roots?
::initial-tx
::name
::branch
Expand All @@ -66,6 +79,21 @@

(def self-writer {:backend :self})

(defn default-index-config-for-backend
  "Return the default index-config of `index`, tuned for the store `backend`.

  For the in-memory backends (`:memory` / `:mem`) `:diff-buf-size` is forced to 0, but
  only when the index's default config already contains that key; every other
  index/backend combination gets the index default unchanged.

  Rationale: diff-buf write-buffering (PSS `:diff-buf-size`) trades in-memory insert
  throughput for fewer durable object PUTs, which only pays off on a request-priced
  object store. An in-memory store has no PUTs to fold, so buffering there is pure
  overhead. An explicit user `:index-config` still wins, because it is deep-merged over
  this default in load-config."
  [index backend]
  (let [base (di/default-index-config index)
        in-memory? (contains? #{:memory :mem} backend)]
    (if (and in-memory? (contains? base :diff-buf-size))
      (assoc base :diff-buf-size 0)
      base)))

(defn from-deprecated
[{:keys [backend username password path host port id] :as _backend-cfg}
& {:keys [schema-on-read temporal-index index initial-tx]
Expand Down Expand Up @@ -96,7 +124,7 @@
#?(:clj (java.util.UUID/nameUUIDFromBytes (.getBytes path "UTF-8"))
:cljs (uuid path))))}))
:index index
:index-config (di/default-index-config index)
:index-config (default-index-config-for-backend index backend)
:keep-history? temporal-index
:attribute-refs? *default-attribute-refs?*
:initial-tx initial-tx
Expand Down Expand Up @@ -148,7 +176,8 @@
:crypto-hash? *default-crypto-hash?*
:branch *default-db-branch*
:writer self-writer
:index-config (di/default-index-config *default-index*)})
;; storeless ⇒ inherently in-memory ⇒ diff-buf off (no PUTs to fold)
:index-config (default-index-config-for-backend *default-index* :memory)})

(defn remove-nils
"Thanks to https://stackoverflow.com/a/34221816"
Expand Down Expand Up @@ -211,12 +240,13 @@
:index index
:branch *default-db-branch*
:crypto-hash? *default-crypto-hash?*
:fuse-index-roots? *default-fuse-index-roots?*
:writer self-writer
:search-cache-size (int-from-env :datahike-search-cache-size *default-search-cache-size*)
:store-cache-size (int-from-env :datahike-store-cache-size *default-store-cache-size*)
:index-config (if-let [index-config (map-from-env :datahike-index-config nil)]
index-config
(di/default-index-config index))}
(default-index-config-for-backend index (:backend store-config)))}
merged-config ((comp remove-nils dt/deep-merge) config config-as-arg)
{:keys [schema-flexibility initial-tx store attribute-refs?]} merged-config]
;; konserve now handles store config validation at runtime
Expand Down
27 changes: 27 additions & 0 deletions src/datahike/connector.cljc
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,29 @@
:stored-config stored-config
:diff (diff config stored-config)}))))

;; Config keys that are fixed when a database is created. They describe the on-disk
;; format/semantics, so reconnecting cannot change them (doing so would be meaningless
;; or corrupting). The set is spelled out key by key so that any future addition is a
;; deliberate decision.
(def create-time-fixed-keys
  #{:keep-history?
    :attribute-refs?
    :schema-flexibility
    :index
    :crypto-hash?
    :fuse-index-roots?
    ;; relevant (PSS) sub-keys of :index-config are :branching-factor and :diff-buf-size
    :index-config})

;; Of the fixed keys, the ones whose datahike default has changed (:fuse-index-roots?) or
;; that were newly added (:index-config {:branching-factor :diff-buf-size}) are sourced from
;; the STORED config on connect — adopt the stored value, or drop the key when the store
;; predates it. This lets existing stores connect unchanged and new stores reconnect
;; without re-specifying, while the strict consistency check still guards every other key.
;; (:index is already reconciled with a warning in -connect-impl*.)
(defn adopt-stored-fixed
  "Reconcile the connect-time `config` with `stored-config` for the settings that are
  sourced from the store.

  For `:fuse-index-roots?`, and for `:branching-factor` / `:diff-buf-size` inside
  `:index-config`, the stored value replaces whatever `config` carries; when the stored
  config lacks the key, it is removed from `config` as well. Other `:index-config`
  entries and all other top-level keys are left untouched. If `:index-config` ends up
  empty it is dropped from the result entirely."
  [config stored-config]
  (let [stored-ic (or (:index-config stored-config) {})
        ;; Copy key k from source into target, or remove it from target when source
        ;; does not contain it.
        sync-key (fn [source target k]
                   (if (contains? source k)
                     (assoc target k (get source k))
                     (dissoc target k)))
        index-config (reduce (partial sync-key stored-ic)
                             (or (:index-config config) {})
                             [:branching-factor :diff-buf-size])
        reconciled (-> (sync-key stored-config config :fuse-index-roots?)
                       (assoc :index-config index-config))]
    (cond-> reconciled
      (empty? index-config) (dissoc :index-config))))

(defn- normalize-config [cfg]
(-> cfg
(dissoc :writer :store :store-cache-size :search-cache-size)))
Expand Down Expand Up @@ -209,6 +232,10 @@
[config store stored-db]))
[config store stored-db]))
_ (version-check stored-db)
;; Source create-time-fixed settings (fuse / bf / diff-buf-size) from the
;; store so existing stores connect unchanged and new ones reconnect
;; without re-specifying; flows into both the check and the running db.
config (adopt-stored-fixed config (:config stored-db))
_ (when-not (:allow-unsafe-config config)
(ensure-stored-config-consistency config (:config stored-db)))
conn (conn-from-db (dsi/stored->db (assoc stored-db :config config) store))]
Expand Down
14 changes: 13 additions & 1 deletion src/datahike/gc.cljc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(ns datahike.gc
(:require [clojure.set :as set]
[datahike.index.interface :refer [-mark]]
[datahike.index.interface :refer [-mark -seed-root!]]
[datahike.index.secondary :as sec]
[konserve.core :as k]
[konserve.gc :refer [sweep!]]
Expand All @@ -25,11 +25,23 @@
(recur r visited reachable)
(let [{:keys [eavt-key avet-key aevt-key
temporal-eavt-key temporal-avet-key temporal-aevt-key
eavt-root aevt-root avet-root
temporal-eavt-root temporal-aevt-root temporal-avet-root
schema-meta-key secondary-index-keys]
{:keys [datahike/parents
datahike/created-at
datahike/updated-at]} :meta}
(<? S (k/get store to-check))
;; Root fusion: inlined roots aren't separate konserve objects, so
;; -mark on the lazy index would try to restore the root by address
;; and fail. Seed each inlined root into its index (mirrors stored->db)
;; so walk-addresses uses it and only its children are fetched.
_ (do (when eavt-root (-seed-root! eavt-key eavt-root))
(when aevt-root (-seed-root! aevt-key aevt-root))
(when avet-root (-seed-root! avet-key avet-root))
(when temporal-eavt-root (-seed-root! temporal-eavt-key temporal-eavt-root))
(when temporal-aevt-root (-seed-root! temporal-aevt-key temporal-aevt-root))
(when temporal-avet-root (-seed-root! temporal-avet-key temporal-avet-root)))
in-range? (> (get-time (or updated-at created-at))
(get-time after-date))]
(let [sec-reachable (when (seq secondary-index-keys)
Expand Down
Loading