Skip to content

UR(CUDA): Kokkos::Experimental::Graph<Kokkos::SYCL> failure under unified runtime for CUDA #22322

@romintomasetti

Description

@romintomasetti

Describe the bug

We are seeing many of our Kokkos::Experimental::Graph<Kokkos::SYCL> tests failing at runtime when they are compiled for Unified Runtime (UR) for CUDA.

These tests are fine if compiled and run for e.g. Intel PVC.

Our reproducer shows that depending on the way we define the node work, it will or will not fail. This is one failure example among others.

We do note that the tests in core/unit_test/TestGraph.hpp pass on our machines, but copy-pasting the reproducer below into core/unit_test/TestGraph.hpp shows that it fails there too. So the fact that the Kokkos tests pass is primarily by chance.

We have tested AMPERE86, VOLTA70 and BLACKWELL120 GPUs and see the failure on all of them, so the problem is clearly on the software stack side rather than tied to one GPU architecture. We tested nightlies (see reproducer) as well as the DPCPP 2025.3 release combined with UR from the v6.3.0 tag.

To reproduce

Docker-based reproducer.

Build image with:

docker buildx build --tag test --progress=plain - < reproducer.ur_cuda.dockerfile

Run the container with:

docker run --privileged -it test bash -c "./build/reproducer-lambda && ./build/reproducer-struct"

Typical output:

With a lambda.
ALL GOOD
With a struct.
mirror(index_1) (0) != 3 (3) 
mirror(index_4) (5) != 8 (8) 
mirror(index_5) (11) != 14 (14) 
terminate called after throwing an instance of 'std::runtime_error'
  what():  Test failed.
Dockerfile
FROM nvcr.io/nvidia/cuda:12.8.1-devel-ubuntu24.04 AS base

FROM base AS system-requirements

# A few system dependencies.
RUN <<EOF
    set -ex

    apt update && apt install --yes --no-install-recommends wget cmake hwloc
EOF

FROM system-requirements AS nightly

# Install a nightly.
ARG NIGHTLY=nightly-2026-06-12

ADD https://github.com/intel/llvm/releases/download/${NIGHTLY}/sycl_linux.tar.gz /tmp/

RUN <<EOF
    set -ex

    mkdir -p /opt/sycl-${NIGHTLY}

    tar xf /tmp/sycl_linux.tar.gz -C /opt/sycl-${NIGHTLY}
EOF

ENV PATH=/opt/sycl-${NIGHTLY}/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/sycl-${NIGHTLY}/lib:$LD_LIBRARY_PATH

# Build the Kokkos smoke test that includes Kokkos::Experimental::Graph.
FROM nightly AS kokkos-compile

ARG CMAKE_BUILD_TYPE=Release
ARG KOKKOS_ARCH=AMPERE86
ARG KOKKOS_SHA=c34d0416befd81796a8957845f3f1905b3b0a4da
ARG KOKKOS_USERNAME=Kokkos

ADD https://github.com/${KOKKOS_USERNAME}/kokkos/archive/${KOKKOS_SHA}.tar.gz /tmp/

RUN <<EOF
    set -ex

    cd /tmp

    tar -xzf ${KOKKOS_SHA}.tar.gz

    cd kokkos-${KOKKOS_SHA}

    clang++ --version

    cmake -S . -B build \
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DCMAKE_CXX_FLAGS="-ffp-model=precise" \
        -DKokkos_ARCH_${KOKKOS_ARCH}=ON \
        -DKokkos_ENABLE_ONEDPL=OFF \
        -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
        -DCMAKE_CXX_STANDARD=20 \
        -DKokkos_ENABLE_SYCL=ON \
        -DKokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON \
        -DBUILD_SHARED_LIBS=OFF \
        -DCMAKE_INSTALL_PREFIX=/opt/kokkos-${KOKKOS_SHA}/ \
        -DKokkos_ENABLE_TESTS=OFF

    cmake --build build -j16 --target install
EOF

FROM kokkos-compile AS reproducer

COPY <<EOF CMakeLists.txt
cmake_minimum_required(VERSION 3.25)

project(reproducer-ur-cuda LANGUAGES CXX)

find_package(Kokkos CONFIG REQUIRED)

add_executable(reproducer-lambda reproducer.cpp)
target_link_libraries(reproducer-lambda PRIVATE Kokkos::kokkoscore)
target_compile_definitions(reproducer-lambda PRIVATE NODE_FIVE_WITH_STRUCT=0)

add_executable(reproducer-struct reproducer.cpp)
target_link_libraries(reproducer-struct PRIVATE Kokkos::kokkoscore)
target_compile_definitions(reproducer-struct PRIVATE NODE_FIVE_WITH_STRUCT=1)
EOF

COPY <<EOF reproducer.cpp
#include "Kokkos_Core.hpp"
#include "Kokkos_Graph.hpp"

// Nullary functor: reads data(src), adds value, and stores the sum in data(dst).
// Members are public and declared in the order data, dst, src, value so that
// the struct stays an aggregate usable with designated initializers.
template <typename ViewType, std::integral IndexType, typename ValueType>
struct LoadAddStore {
    ViewType data;    // indexable storage that is both read and written
    IndexType dst;    // index of the element that receives the result
    IndexType src;    // index of the element that is read
    ValueType value;  // amount added to data(src)

    KOKKOS_FUNCTION void operator()() const noexcept {
        const auto loaded = data(src);
        data(dst)         = loaded + value;
    }
};

// Same load-add-store operation as a unary functor: the call operator accepts
// one integral argument and ignores it, so every invocation performs the
// identical write data(dst) = data(src) + value.
// Members are public and declared in the order data, dst, src, value so that
// the struct stays an aggregate usable with designated initializers.
template <typename ViewType, std::integral IndexType, typename ValueType>
struct LoadAddStorePFor {
    ViewType data;    // indexable storage that is both read and written
    IndexType dst;    // index of the element that receives the result
    IndexType src;    // index of the element that is read
    ValueType value;  // amount added to data(src)

    template <std::integral T>
    KOKKOS_FUNCTION void operator()(const T) const noexcept {
        const auto loaded = data(src);
        data(dst)         = loaded + value;
    }
};

// Minimal equality check: when a differs from b, prints both expressions with
// their values (formatted as int) and clears the bool named success, which
// must be declared in the calling scope.
// The body is wrapped in do/while(false) so that the macro behaves as a single
// statement (no dangling-else capture), and the arguments are parenthesized so
// that operator precedence inside them cannot alter the comparison.
#define EXPECT_EQ(a, b) \\
    do { \\
        if((a) != (b)) { \\
            Kokkos::printf("%s (%d) != %s (%d) \\n", #a, (a), #b, (b)); \\
            success = false; \\
        } \\
    } while(false)

// Builds a six-node Kokkos graph over a six-element view, submits it once and
// verifies the result through a mirror view.
//
// Dependency structure (each node writes one element):
//   node_0: data(0) = 1
//   node_1, node_2, node_3 (after node_0): data(k) = data(0) + value_k
//   node_4 (after node_1): data(4) = data(1) + 5
//   node_5 (after node_4): data(5) = data(4) + 6, added with then_parallel_for
//     using either a struct functor or a lambda, selected at compile time by
//     NODE_FIVE_WITH_STRUCT.
// Expected contents are therefore {1, 3, 4, 5, 8, 14}.
//
// Throws std::runtime_error if any element differs from its expected value.
void test_large() {
    const Kokkos::View<int[6], Kokkos::SYCL::memory_space> data(Kokkos::view_alloc("data"));
    const auto mirror = Kokkos::create_mirror_view(data);

    const Kokkos::Experimental::Graph graph{};

    constexpr int index_0 = 0, index_1 = 1, index_2 = 2, index_3 = 3, index_4 = 4, index_5 = 5;

    constexpr int value_0 = 1, value_1 = 2, value_2 = 3, value_3 = 4, value_4 = 5, value_5 = 6;

    auto node_0 = graph.root_node().then(KOKKOS_LAMBDA() { data(index_0) = value_0; });

    // Three independent successors of node_0.
    auto node_1 = node_0.then(LoadAddStore{.data = data, .dst = index_1, .src = index_0, .value = value_1});
    auto node_2 = node_0.then(LoadAddStore{.data = data, .dst = index_2, .src = index_0, .value = value_2});
    auto node_3 = node_0.then(LoadAddStore{.data = data, .dst = index_3, .src = index_0, .value = value_3});

    auto node_4 = node_1.then(LoadAddStore{.data = data, .dst = index_4, .src = index_1, .value = value_4});
#if NODE_FIVE_WITH_STRUCT == 1
    Kokkos::printf("With a struct.\\n");
    auto node_5 = node_4.then_parallel_for(
        Kokkos::RangePolicy<Kokkos::SYCL>(0, 1), LoadAddStorePFor{.data = data, .dst = index_5, .src = index_4, .value = value_5});
#else
    Kokkos::printf("With a lambda.\\n");
    auto node_5 = node_4.then_parallel_for(
        Kokkos::RangePolicy<Kokkos::SYCL>(0, 1), KOKKOS_LAMBDA<std::integral T>(const T) {
            data(index_5) = data(index_4) + value_5; });
#endif

    const Kokkos::SYCL exec{};

    // Run the graph, then copy the result into the mirror on the same
    // execution space instance and wait for both to complete.
    graph.submit(exec);
    Kokkos::deep_copy(exec, mirror, data);
    exec.fence();

    // Cleared by EXPECT_EQ on the first mismatch; all checks run regardless.
    bool success = true;

    EXPECT_EQ(mirror(index_0), 1);
    EXPECT_EQ(mirror(index_1), 3);
    EXPECT_EQ(mirror(index_2), 4);
    EXPECT_EQ(mirror(index_3), 5);
    EXPECT_EQ(mirror(index_4), 8);
    EXPECT_EQ(mirror(index_5), 14);

    if(!success) throw std::runtime_error("Test failed.");
}

// Entry point: keeps a Kokkos::ScopeGuard alive for the duration of the test,
// runs it, and reports success. A failing test throws out of test_large, so
// the success message is only printed when every check passed.
int main() {
    Kokkos::ScopeGuard guard{};

    test_large();
    Kokkos::printf("ALL GOOD\\n");

    return 0;
}
EOF

RUN <<EOF
    set -ex

    cmake -S . -B build -DCMAKE_CXX_COMPILER=clang++

    cmake --build build -j4 --verbose
EOF

Environment

See Dockerfile.

Additional context

Joint work with @maartenarnst.

Metadata

Metadata

Assignees

Labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions