From a3ca643387ae132c350c66f9d903013f6f25ccf6 Mon Sep 17 00:00:00 2001
From: Brad Chamberlain <bradcray@users.noreply.github.com>
Date: Wed, 29 Oct 2025 14:31:26 -0700
Subject: [PATCH] Skew SUMMA communication to avoid bottlenecks

The way I learned the SUMMA algorithm was essentially "Locales
broadcast their blocks of data to all the other locales in their
row/col."  But in Chapel, there isn't really a broadcast option (nor
would we necessarily want to use one here if there were), so I'd
implemented this by having all locales do a remote read of the block
in question.

Engin pointed out (some time ago) that this could be rewritten to
avoid bottlenecks by skewing each locale by its row/col ID such that
each block copy is a 1:1 communication rather than a
sqrt(numLocales):1 communication, which could bottleneck.  This
implements that transformation.

---
Signed-off-by: Brad Chamberlain <bradcray@users.noreply.github.com>
---
 test/studies/spsMatMatMult/MatMatMult.chpl | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/studies/spsMatMatMult/MatMatMult.chpl b/test/studies/spsMatMatMult/MatMatMult.chpl
index 0ba8950e37bd..81a72b1e64ab 100644
--- a/test/studies/spsMatMatMult/MatMatMult.chpl
+++ b/test/studies/spsMatMatMult/MatMatMult.chpl
@@ -50,15 +50,25 @@ module MatMatMult {
 
     ref targLocs = A.targetLocales();
   
+    if (targLocs.dim(0) != targLocs.dim(1)) {
+      halt("sparseMatMatMult() currently assumes a square target locale array");
+    }
+    
+    const numBlocks = targLocs.dim(0).size;
+
     if countComms then startCommDiagnostics();
     var time: stopwatch;
     time.start();
 
     coforall (locRow, locCol) in targLocs.domain {
       on targLocs[locRow, locCol] {
+
         var spsData: sparseMatDat;
 
-        for srcloc in targLocs.dim(0) {
+        for loc in targLocs.dim(0) {
+          // Skew the row/col we access to avoid communication bottlenecks
+          const srcloc = (loc + locRow)%numBlocks;
+
           // Make a local copy of the remote blocks of A and B; on my branch
           // this will also make a local copy of the remote indices, so long
           // as these are 'const'/read-only