From a3ca643387ae132c350c66f9d903013f6f25ccf6 Mon Sep 17 00:00:00 2001 From: Brad Chamberlain Date: Wed, 29 Oct 2025 14:31:26 -0700 Subject: [PATCH] Skew SUMMA communication to avoid bottlenecks The way I learned the SUMMA algorithm was essentially "Locales broadcast their blocks of data to all the other locales in their row/col." But in Chapel, there isn't really a broadcast option (nor would we necessarily want to use one here if there were), so I'd implemented this by having all locales do a remote read of the block in question. Engin pointed out (some time ago) that this could be rewritten to avoid bottlenecks by skewing each locale by its row/col ID such that each block copy is a 1:1 communication rather than a sqrt(numLocales):1 communication, which could become a bottleneck. This implements that transformation. --- Signed-off-by: Brad Chamberlain --- test/studies/spsMatMatMult/MatMatMult.chpl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/studies/spsMatMatMult/MatMatMult.chpl b/test/studies/spsMatMatMult/MatMatMult.chpl index 0ba8950e37bd..81a72b1e64ab 100644 --- a/test/studies/spsMatMatMult/MatMatMult.chpl +++ b/test/studies/spsMatMatMult/MatMatMult.chpl @@ -50,15 +50,25 @@ module MatMatMult { ref targLocs = A.targetLocales(); + if (targLocs.dim(0) != targLocs.dim(1)) { + halt("sparseMatMatMult() currently assumes a square target locale array"); + } + + const numBlocks = targLocs.dim(0).size; + if countComms then startCommDiagnostics(); var time: stopwatch; time.start(); coforall (locRow, locCol) in targLocs.domain { on targLocs[locRow, locCol] { + var spsData: sparseMatDat; - for srcloc in targLocs.dim(0) { + for loc in targLocs.dim(0) { + // Skew the row/col we access to avoid communication bottlenecks + const srcloc = (loc + locRow)%numBlocks; + // Make a local copy of the remote blocks of A and B; on my branch // this will also make a local copy of the remote indices, so long // as these are 'const'/read-only