diff --git a/tensorrt_llm/runtime/kv_cache_manager_v2/_block_radix_tree.py b/tensorrt_llm/runtime/kv_cache_manager_v2/_block_radix_tree.py index 6cada3da7cec..1024eca91575 100644 --- a/tensorrt_llm/runtime/kv_cache_manager_v2/_block_radix_tree.py +++ b/tensorrt_llm/runtime/kv_cache_manager_v2/_block_radix_tree.py @@ -14,6 +14,7 @@ # limitations under the License. import hashlib +from array import array from typing import TYPE_CHECKING, Iterable, Iterator, NamedTuple, Sequence, TypeVar, cast from . import rawref @@ -90,11 +91,21 @@ def update(self, data: int | bytes | Sequence[int | bytes]) -> "Hasher": elif type(data) is bytes: self._hasher.update(data) else: - for item in data: # type: ignore - assert ( - NDEBUG or (type(item) is int and (0 <= item < (1 << 64))) or type(item) is bytes - ) - self._hasher.update(item.to_bytes(8, "little") if (type(item) is int) else item) # type: ignore + # Hash the whole token block in one C call instead of one per token. + # array("Q", data).tobytes() packs each int as 8 native-endian bytes; + # all NVIDIA GPU host platforms (x86_64, aarch64/Grace) are little-endian + # so this is byte-identical to the per-token to_bytes(8, "little") loop. + # Falls back to that loop for multimodal blocks (which contain bytes items). + try: + self._hasher.update(array("Q", data).tobytes()) # type: ignore + except (TypeError, OverflowError): + for item in data: # type: ignore + assert ( + NDEBUG + or (type(item) is int and (0 <= item < (1 << 64))) + or type(item) is bytes + ) + self._hasher.update(item.to_bytes(8, "little") if (type(item) is int) else item) # type: ignore return self @property diff --git a/tests/unittest/kv_cache_manager_v2_tests/test_kv_cache_manager_v2.py b/tests/unittest/kv_cache_manager_v2_tests/test_kv_cache_manager_v2.py index 7e84c0ba75c2..583fa5da72c9 100755 --- a/tests/unittest/kv_cache_manager_v2_tests/test_kv_cache_manager_v2.py +++ b/tests/unittest/kv_cache_manager_v2_tests/test_kv_cache_manager_v2.py @@ -15,6 +15,7 @@ import array import functools import gc +import hashlib import itertools import os import random @@ -52,7 +53,7 @@ TokenIdExt, _KVCache, ) - from kv_cache_manager_v2._block_radix_tree import traverse_post_order + from kv_cache_manager_v2._block_radix_tree import Hasher, traverse_post_order from kv_cache_manager_v2._common import ( BAD_PAGE_INDEX, GPU_LEVEL, @@ -105,7 +106,10 @@ TokenIdExt, _KVCache, ) - from tensorrt_llm.runtime.kv_cache_manager_v2._block_radix_tree import traverse_post_order + from tensorrt_llm.runtime.kv_cache_manager_v2._block_radix_tree import ( + Hasher, + traverse_post_order, + ) from tensorrt_llm.runtime.kv_cache_manager_v2._common import ( BAD_PAGE_INDEX, GPU_LEVEL, @@ -2556,5 +2560,33 @@ def test_shrink_touched_pool(self) -> None: allocator.release(s) +class TestBlockKeyHashing(unittest.TestCase): + """Verify Hasher.update produces bit-identical digests to the per-token reference (no GPU needed).""" + + @staticmethod + def _ref_update(seed: bytes, block: "list[int | bytes]") -> bytes: + h = hashlib.sha256() + h.update(seed) + for item in block: + h.update(item.to_bytes(8, "little") if type(item) is int else item) + return h.digest() + + def test_update_int_block_matches_reference(self) -> None: + rng = random.Random(123) + seed = b"\xaa\xbb\xcc" + for n in (0, 1, 7, 32, 33, 257): + block = [rng.randint(0, (1 << 60)) for _ in range(n)] + self.assertEqual( + Hasher(seed).update(block).digest, + self._ref_update(seed, block), + f"int block of length {n}", + ) + + def test_update_mixed_multimodal_block(self) -> None: + block = [randbytes(32), 5, 6, randbytes(32)] + list(range(20)) + seed = b"\x01" + self.assertEqual(Hasher(seed).update(block).digest, self._ref_update(seed, block)) + + if __name__ == "__main__": unittest.main()