# syntax=docker/dockerfile:1
# The directive above pins the Dockerfile frontend; the heredoc (<<EOF) syntax
# used throughout this file needs a frontend that supports it.

# CUDA 12.8.1 development image on Ubuntu 24.04; all later stages derive from it.
FROM nvcr.io/nvidia/cuda:12.8.1-devel-ubuntu24.04 AS base

FROM base AS system-requirements
# A few system dependencies.
# apt-get is used instead of apt because its command-line interface is the one
# meant for scripts. The package lists are removed in the same layer that
# downloaded them so they do not stay in the image.
RUN <<EOF
set -ex
apt-get update
apt-get install --yes --no-install-recommends \
    cmake \
    hwloc \
    wget
rm -rf /var/lib/apt/lists/*
EOF
FROM system-requirements AS nightly
# Download the intel/llvm release tarball selected by NIGHTLY and unpack it
# into a directory under /opt that carries the same name.
ARG NIGHTLY=nightly-2026-06-12
ADD https://github.com/intel/llvm/releases/download/${NIGHTLY}/sycl_linux.tar.gz /tmp/
RUN <<EOF
set -ex
mkdir --parents /opt/sycl-${NIGHTLY}
tar --extract --file=/tmp/sycl_linux.tar.gz --directory=/opt/sycl-${NIGHTLY}
EOF
# Put the unpacked bin/ and lib/ directories in front of the inherited search paths.
ENV PATH=/opt/sycl-${NIGHTLY}/bin:$PATH \
    LD_LIBRARY_PATH=/opt/sycl-${NIGHTLY}/lib:$LD_LIBRARY_PATH
# Configure, build and install Kokkos (SYCL backend) at a pinned commit. The
# installed package is what the reproducer stage locates with find_package and
# is where Kokkos::Experimental::Graph comes from. Kokkos' own tests are not
# built (Kokkos_ENABLE_TESTS=OFF).
FROM nightly AS kokkos-compile
ARG CMAKE_BUILD_TYPE=Release
# Appended to -DKokkos_ARCH_ to select the target GPU architecture option.
ARG KOKKOS_ARCH=AMPERE86
# Commit to download; also names the extracted source directory and the
# install prefix under /opt.
ARG KOKKOS_SHA=c34d0416befd81796a8957845f3f1905b3b0a4da
# GitHub account that hosts the kokkos repository (allows building a fork).
ARG KOKKOS_USERNAME=Kokkos
ADD https://github.com/${KOKKOS_USERNAME}/kokkos/archive/${KOKKOS_SHA}.tar.gz /tmp/
# Number of parallel build jobs. The default keeps the previously hard-coded
# value; override it with --build-arg KOKKOS_BUILD_JOBS=<n> on smaller machines.
ARG KOKKOS_BUILD_JOBS=16
RUN <<EOF
set -ex
# NOTE: cd is kept instead of WORKDIR because a WORKDIR set here would also
# become the working directory of every stage built FROM this one.
cd /tmp
tar -xzf ${KOKKOS_SHA}.tar.gz
cd kokkos-${KOKKOS_SHA}
# Record the compiler version in the build log.
clang++ --version
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
    -DCMAKE_CXX_COMPILER=clang++ \
    -DCMAKE_CXX_FLAGS="-ffp-model=precise" \
    -DKokkos_ARCH_${KOKKOS_ARCH}=ON \
    -DKokkos_ENABLE_ONEDPL=OFF \
    -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
    -DCMAKE_CXX_STANDARD=20 \
    -DKokkos_ENABLE_SYCL=ON \
    -DKokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON \
    -DBUILD_SHARED_LIBS=OFF \
    -DCMAKE_INSTALL_PREFIX=/opt/kokkos-${KOKKOS_SHA}/ \
    -DKokkos_ENABLE_TESTS=OFF
cmake --build build --parallel "${KOKKOS_BUILD_JOBS}" --target install
EOF
FROM kokkos-compile AS reproducer
# No WORKDIR is set in this stage, so the relative destination below resolves
# against the working directory inherited from the parent stages.
COPY <<EOF CMakeLists.txt
cmake_minimum_required(VERSION 3.25)
project(reproducer-ur-cuda LANGUAGES CXX)

find_package(Kokkos CONFIG REQUIRED)

# One source file, two executables. NODE_FIVE_WITH_STRUCT selects how the last
# graph node is defined: 0 builds the lambda variant, 1 the struct variant.
add_executable(reproducer-lambda reproducer.cpp)
add_executable(reproducer-struct reproducer.cpp)

target_compile_definitions(reproducer-lambda PRIVATE NODE_FIVE_WITH_STRUCT=0)
target_compile_definitions(reproducer-struct PRIVATE NODE_FIVE_WITH_STRUCT=1)

target_link_libraries(reproducer-lambda PRIVATE Kokkos::kokkoscore)
target_link_libraries(reproducer-struct PRIVATE Kokkos::kokkoscore)
EOF
COPY <<EOF reproducer.cpp
// Reproducer: builds a six-node Kokkos::Experimental::Graph on the SYCL
// backend, submits it, and checks the value each node wrote.
// NODE_FIVE_WITH_STRUCT (set per target by the build) selects whether node 5
// is a functor struct (1) or a generic lambda (0).
// NOTE: this heredoc is unquoted, so every backslash meant for the C++ file
// is written doubled here.
#include <concepts>   // std::integral
#include <stdexcept>  // std::runtime_error

#include "Kokkos_Core.hpp"
#include "Kokkos_Graph.hpp"

// Work for a node added with then(): data(dst) = data(src) + value.
template <typename ViewType, std::integral IndexType, typename ValueType>
struct LoadAddStore {
    ViewType data;
    IndexType dst, src;
    ValueType value;

    KOKKOS_FUNCTION void operator()() const noexcept {
        data(dst) = data(src) + value;
    }
};

// Same work as LoadAddStore, but callable with an (ignored) integral index so
// that it can be used as a parallel_for functor.
template <typename ViewType, std::integral IndexType, typename ValueType>
struct LoadAddStorePFor {
    ViewType data;
    IndexType dst, src;
    ValueType value;

    template <std::integral T>
    KOKKOS_FUNCTION void operator()(const T) const noexcept {
        data(dst) = data(src) + value;
    }
};

// Prints both operands and sets the variable named success (which must be in
// scope at the expansion site) to false when they differ.
#define EXPECT_EQ(a, b) \\
    if ((a) != (b)) { \\
        Kokkos::printf("%s (%d) != %s (%d) \\n", #a, a, #b, b); \\
        success = false; \\
    }

// Graph shape: node_0 is followed by node_1, node_2 and node_3; node_4 follows
// node_1 and node_5 follows node_4. Throws std::runtime_error on any mismatch.
void test_large() {
    const Kokkos::View<int[6], Kokkos::SYCL::memory_space> data(Kokkos::view_alloc("data"));
    const auto mirror = Kokkos::create_mirror_view(data);

    const Kokkos::Experimental::Graph graph{};

    constexpr int index_0 = 0, index_1 = 1, index_2 = 2, index_3 = 3, index_4 = 4, index_5 = 5;
    constexpr int value_0 = 1, value_1 = 2, value_2 = 3, value_3 = 4, value_4 = 5, value_5 = 6;

    auto node_0 = graph.root_node().then(KOKKOS_LAMBDA() { data(index_0) = value_0; });
    auto node_1 = node_0.then(LoadAddStore{.data = data, .dst = index_1, .src = index_0, .value = value_1});
    auto node_2 = node_0.then(LoadAddStore{.data = data, .dst = index_2, .src = index_0, .value = value_2});
    auto node_3 = node_0.then(LoadAddStore{.data = data, .dst = index_3, .src = index_0, .value = value_3});
    auto node_4 = node_1.then(LoadAddStore{.data = data, .dst = index_4, .src = index_1, .value = value_4});

    // Both variants of node 5 perform data(index_5) = data(index_4) + value_5;
    // only the way the work is expressed differs.
#if NODE_FIVE_WITH_STRUCT == 1
    Kokkos::printf("With a struct.\\n");
    auto node_5 = node_4.then_parallel_for(
        Kokkos::RangePolicy<Kokkos::SYCL>(0, 1),
        LoadAddStorePFor{.data = data, .dst = index_5, .src = index_4, .value = value_5});
#else
    Kokkos::printf("With a lambda.\\n");
    auto node_5 = node_4.then_parallel_for(
        Kokkos::RangePolicy<Kokkos::SYCL>(0, 1), KOKKOS_LAMBDA<std::integral T>(const T) {
            data(index_5) = data(index_4) + value_5; });
#endif

    const Kokkos::SYCL exec{};
    graph.submit(exec);

    // Copy the results to the mirror on the same execution space instance,
    // then fence before reading them.
    Kokkos::deep_copy(exec, mirror, data);
    exec.fence();

    // Expected values: [0] = 1, [1] = 1 + 2, [2] = 1 + 3, [3] = 1 + 4,
    // [4] = 3 + 5, [5] = 8 + 6.
    bool success = true;
    EXPECT_EQ(mirror(index_0), 1);
    EXPECT_EQ(mirror(index_1), 3);
    EXPECT_EQ(mirror(index_2), 4);
    EXPECT_EQ(mirror(index_3), 5);
    EXPECT_EQ(mirror(index_4), 8);
    EXPECT_EQ(mirror(index_5), 14);

    if (!success) throw std::runtime_error("Test failed.");
}

int main() {
    Kokkos::ScopeGuard guard{};
    {
        test_large();
        Kokkos::printf("ALL GOOD\\n");
    }
}
EOF
# Configure and build the reproducer executables with the clang++ found on
# PATH, using 4 parallel jobs and echoing the full compile and link commands.
RUN <<EOF
set -ex
cmake -S . -B build -D CMAKE_CXX_COMPILER=clang++
cmake --build build --parallel 4 --verbose
EOF
Describe the bug
We are seeing many of our `Kokkos::Experimental::Graph<Kokkos::SYCL>` tests failing at runtime when they are compiled for Unified Runtime (UR) for CUDA. These tests are fine if compiled and run for e.g. Intel PVC.
Our reproducer shows that depending on the way we define the node work, it will or will not fail. This is one failure example among others.
We do note that the tests in core/unit_test/TestGraph.hpp are passing on our machines, but copy-pasting the reproducer below into core/unit_test/TestGraph.hpp shows that it will fail there too. So the fact that the `Kokkos` tests are fine is primarily by chance.
We have tested AMPERE86, VOLTA70 and BLACKWELL120 GPUs, so the problem is clearly on the software stack side. We tested nightlies (see reproducer) as well as the DPCPP 2025.3 release combined with UR from the v6.3.0 tag.
To reproduce
Docker-based reproducer.
Build image with:
Run the container with:
Typical output:
Dockerfile
Environment
See Dockerfile.
Additional context
Joint work with @maartenarnst.