From e22589fdae7bff0ea7796d8dc98e7708623b8dcf Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:13:14 -0400 Subject: [PATCH 1/7] Timing printfs --- demos/demo_proj2.py | 8 ++++-- include/Ranges.h | 3 ++ python/proj/ranges.py | 8 ++++-- src/Projection.cxx | 65 +++++++++++++++++++++++++++++-------------- 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/demos/demo_proj2.py b/demos/demo_proj2.py index ae23207c..a0b12167 100644 --- a/demos/demo_proj2.py +++ b/demos/demo_proj2.py @@ -116,12 +116,16 @@ def get_pixelizor(emap): det_weights[n_det//2] = 0.0 # ... unless we mark it as noisy. # Then back to map. +print('setup cuts...', end='\n ... ') +with Timer(): + cuts = so3g.proj.RangesMatrix.ones(shape=(24, n_det, n_t)) + print('Create timestream...', end='\n ... ') with Timer(): - map1 = pe.to_map(None, ptg, ofs, resp, sig1, det_weights, None) + map1 = pe.to_map(None, ptg, ofs, resp, sig1, det_weights, cuts) # Get the weight map (matrix). -wmap1 = pe.to_weight_map(None, ptg, ofs, resp, det_weights, None) +wmap1 = pe.to_weight_map(None, ptg, ofs, resp, det_weights, cuts) wmap1[1,0] = wmap1[0,1] # fill in unpopulated entries... wmap1[2,0] = wmap1[0,2] wmap1[2,1] = wmap1[1,2] diff --git a/include/Ranges.h b/include/Ranges.h index 1b543b77..25ae7d1c 100644 --- a/include/Ranges.h +++ b/include/Ranges.h @@ -70,6 +70,9 @@ template vector> extract_ranges(const bp::object & ival_list) { vector> v(bp::len(ival_list)); for (int i=0; i> v(N); +// for (int i=0; i>(ival_list[i])(); return v; } diff --git a/python/proj/ranges.py b/python/proj/ranges.py index 108b3e78..199a3401 100644 --- a/python/proj/ranges.py +++ b/python/proj/ranges.py @@ -35,6 +35,7 @@ def __repr__(self): return 'RangesMatrix(' + ','.join(map(str, self.shape)) + ')' def __len__(self): + #return len(self.ranges) return self.shape[0] def copy(self): @@ -44,7 +45,7 @@ def copy(self): def zeros_like(self): return RangesMatrix([x.zeros_like() for x in self.ranges], child_shape=self.shape[1:]) - + def ones_like(self): return RangesMatrix([x.ones_like() for x in self.ranges], child_shape=self.shape[1:]) @@ -53,7 +54,7 @@ def buffer(self, buff): [x.buffer(buff) for x in self.ranges] ## just to make this work like Ranges.buffer() return self - + def buffered(self, buff): out = self.copy() [x.buffer(buff) for x in out.ranges] @@ -93,6 +94,9 @@ def __getitem__(self, index): new_rm = rm[..., :] """ + #if isinstance(index, (int, np.int32, np.int64)): + # return self.ranges[index] + if not isinstance(index, tuple): index = (index,) diff --git a/src/Projection.cxx b/src/Projection.cxx index fd1b2298..cc61a761 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -9,6 +9,8 @@ using namespace std; #include #include +#include + #ifdef _OPENMP # include #endif // ifdef _OPENMP @@ -1865,6 +1867,7 @@ vector>> derive_ranges( bp::object thread_intervals, int n_det, int n_time, std::string arg_name) { + auto start = std::chrono::high_resolution_clock::now(); // The first index of the returned object should correspond to // (OMP) execution thread; the second index is over detectors. // The input object can be any of the following: @@ -1880,31 +1883,45 @@ vector>> derive_ranges( vector>> ivals; if (isNone(thread_intervals)) { + cout << "A\n "; // << RangesInt32::cc_count << "\n"; // It's None. Generate a single bunch with a single-thread covering all samples auto r = RangesInt32(n_time).add_interval(0, n_time); + vector> v(1, vector(n_det, r)); +// cout << "A - " << RangesInt32::cc_count << "\n"; ivals.push_back(v); + } else if(bp::extract(thread_intervals[0]).check()) { + cout << "B\n"; // It's a RangesMatrix (ndet,nranges). Promote to single thread, single bunch ivals.push_back(vector>(1, extract_ranges(thread_intervals))); } else if(bp::extract(thread_intervals[0][0]).check()) { + cout << "C\n"; // It's a per-thread RangesMatrix (nthread,ndet,nranges). Promote to single bunch + //const int N = bp::len(thread_intervals); vector> bunch; for (int i=0; i(thread_intervals[i]); bunch.push_back(extract_ranges(thread_intervals[i])); ivals.push_back(bunch); } else if(bp::extract(thread_intervals[0][0][0]).check()) { + cout << "D\n"; // It's a full multi-bunch (nbunch,nthread,ndet,nranges) thing. - for (int i=0; i> bunch; - for (int j=0; j(thread_intervals[i][j])); + const int N = bp::len(thread_intervals); + for (int i=0; i> bunch(M); + for (int j=0; j(thread_intervals[i][j]); ivals.push_back(bunch); } } else { // This should not happen assert(false); } + auto end1 = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed1 = end1 - start; + // Check that these all have the right shape. Maybe consider a standard // for loop instead of foreach to give more useful error messages using // the index @@ -1925,6 +1942,12 @@ vector>> derive_ranges( } } } + + auto end2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed2 = end2 - end1; + std::cout << "Block1: " << elapsed1.count() << " ms\n"; + std::cout << "Block2: " << elapsed2.count() << " ms\n"; + return ivals; } @@ -2343,23 +2366,23 @@ int _index_count(const T &) { return T::index_count; } PYBINDINGS("so3g") { - EXPORT_PIX(Flat); - EXPORT_PIX(Quat); +// EXPORT_PIX(Flat); +// EXPORT_PIX(Quat); EXPORT_PIX(CAR); - EXPORT_PIX(CEA); - EXPORT_PIX(ARC); - EXPORT_PIX(SIN); - EXPORT_PIX(TAN); - EXPORT_PIX(ZEA); - - EXPORT_PRECOMP(ProjEng_Precomp_NonTiled); - EXPORT_PRECOMP(ProjEng_Precomp_Tiled); - - EXPORT_ENGINE(ProjEng_HP_T_NonTiled); - EXPORT_ENGINE(ProjEng_HP_QU_NonTiled); - EXPORT_ENGINE(ProjEng_HP_TQU_NonTiled); - EXPORT_ENGINE(ProjEng_HP_T_Tiled); - EXPORT_ENGINE(ProjEng_HP_QU_Tiled); - EXPORT_ENGINE(ProjEng_HP_TQU_Tiled); +// EXPORT_PIX(CEA); +// EXPORT_PIX(ARC); +// EXPORT_PIX(SIN); +// EXPORT_PIX(TAN); +// EXPORT_PIX(ZEA); +// +// EXPORT_PRECOMP(ProjEng_Precomp_NonTiled); +// EXPORT_PRECOMP(ProjEng_Precomp_Tiled); +// +// EXPORT_ENGINE(ProjEng_HP_T_NonTiled); +// EXPORT_ENGINE(ProjEng_HP_QU_NonTiled); +// EXPORT_ENGINE(ProjEng_HP_TQU_NonTiled); +// EXPORT_ENGINE(ProjEng_HP_T_Tiled); +// EXPORT_ENGINE(ProjEng_HP_QU_Tiled); +// EXPORT_ENGINE(ProjEng_HP_TQU_Tiled); } From 868f16d9f400c745f2ba91a407892901d91dbb0f Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:18:13 -0400 Subject: [PATCH 2/7] RangesMatrix: skip shape check on .full --- python/proj/ranges.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/proj/ranges.py b/python/proj/ranges.py index 199a3401..63a3c6cb 100644 --- a/python/proj/ranges.py +++ b/python/proj/ranges.py @@ -132,7 +132,7 @@ def __add__(self, x): elif self.shape[0] == x.shape[0]: return self.__class__([r + d for r, d in zip(self.ranges, x)], skip_shape_check=True) return self.__class__([r + x for r in self.ranges], skip_shape_check=True) - + def __mul__(self, x): if isinstance(x, Ranges): return self.__class__([d * x for d in self.ranges], skip_shape_check=True) @@ -251,7 +251,7 @@ def full(shape, fill_value): return r return RangesMatrix([RangesMatrix.full(shape[1:], fill_value) for i in range(shape[0])], - child_shape=shape[1:]) + child_shape=shape[1:], skip_shape_check=True) @classmethod def zeros(cls, shape): From b992a4b0783f53d05cfabc93a45e89d8b406382e Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:21:36 -0400 Subject: [PATCH 3/7] RangesMatrix: faster len and getitem --- python/proj/ranges.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/proj/ranges.py b/python/proj/ranges.py index 63a3c6cb..c16d3001 100644 --- a/python/proj/ranges.py +++ b/python/proj/ranges.py @@ -35,8 +35,7 @@ def __repr__(self): return 'RangesMatrix(' + ','.join(map(str, self.shape)) + ')' def __len__(self): - #return len(self.ranges) - return self.shape[0] + return len(self.ranges) def copy(self): return RangesMatrix([x.copy() for x in self.ranges], @@ -94,8 +93,9 @@ def __getitem__(self, index): new_rm = rm[..., :] """ - #if isinstance(index, (int, np.int32, np.int64)): - # return self.ranges[index] + # Short-circuit return if this is a simple integer index. + if isinstance(index, (int, np.int32, np.int64)): + return self.ranges[index] if not isinstance(index, tuple): index = (index,) From 6bb170791e0466806d145b0cc83410b18b88fc67 Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:24:18 -0400 Subject: [PATCH 4/7] Faster extract_ranges --- include/Ranges.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/Ranges.h b/include/Ranges.h index 25ae7d1c..985e2a64 100644 --- a/include/Ranges.h +++ b/include/Ranges.h @@ -68,11 +68,9 @@ class Ranges { // Support for working with RangesMatrix, which is basically just a list of Ranges template vector> extract_ranges(const bp::object & ival_list) { - vector> v(bp::len(ival_list)); - for (int i=0; i> v(N); -// for (int i=0; i> v(N); + for (int i=0; i>(ival_list[i])(); return v; } From fe23722a8949a62be683f2254abc35eefdaee00c Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:34:22 -0400 Subject: [PATCH 5/7] And the D block --- src/Projection.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Projection.cxx b/src/Projection.cxx index cc61a761..3b352545 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -1898,11 +1898,10 @@ vector>> derive_ranges( } else if(bp::extract(thread_intervals[0][0]).check()) { cout << "C\n"; // It's a per-thread RangesMatrix (nthread,ndet,nranges). Promote to single bunch - //const int N = bp::len(thread_intervals); - vector> bunch; - for (int i=0; i(thread_intervals[i]); - bunch.push_back(extract_ranges(thread_intervals[i])); + const int N = bp::len(thread_intervals); + vector> bunch(N); + for (int i=0; i(thread_intervals[i]); ivals.push_back(bunch); } else if(bp::extract(thread_intervals[0][0][0]).check()) { cout << "D\n"; @@ -1910,9 +1909,10 @@ vector>> derive_ranges( const int N = bp::len(thread_intervals); for (int i=0; i> bunch(M); for (int j=0; j(thread_intervals[i][j]); + bunch[j] = extract_ranges(ti_i[j]); ivals.push_back(bunch); } } else { From c1b25993de1add07ec7eebeda72a7def7eea68f2 Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:44:34 -0400 Subject: [PATCH 6/7] cleaning up ... --- src/Projection.cxx | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/Projection.cxx b/src/Projection.cxx index 3b352545..da177816 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -1867,7 +1867,6 @@ vector>> derive_ranges( bp::object thread_intervals, int n_det, int n_time, std::string arg_name) { - auto start = std::chrono::high_resolution_clock::now(); // The first index of the returned object should correspond to // (OMP) execution thread; the second index is over detectors. // The input object can be any of the following: @@ -1883,33 +1882,26 @@ vector>> derive_ranges( vector>> ivals; if (isNone(thread_intervals)) { - cout << "A\n "; // << RangesInt32::cc_count << "\n"; // It's None. Generate a single bunch with a single-thread covering all samples auto r = RangesInt32(n_time).add_interval(0, n_time); - vector> v(1, vector(n_det, r)); -// cout << "A - " << RangesInt32::cc_count << "\n"; ivals.push_back(v); - } else if(bp::extract(thread_intervals[0]).check()) { - cout << "B\n"; // It's a RangesMatrix (ndet,nranges). Promote to single thread, single bunch ivals.push_back(vector>(1, extract_ranges(thread_intervals))); } else if(bp::extract(thread_intervals[0][0]).check()) { - cout << "C\n"; // It's a per-thread RangesMatrix (nthread,ndet,nranges). Promote to single bunch - const int N = bp::len(thread_intervals); + int N = bp::len(thread_intervals); vector> bunch(N); for (int i=0; i(thread_intervals[i]); ivals.push_back(bunch); } else if(bp::extract(thread_intervals[0][0][0]).check()) { - cout << "D\n"; // It's a full multi-bunch (nbunch,nthread,ndet,nranges) thing. const int N = bp::len(thread_intervals); for (int i=0; i> bunch(M); for (int j=0; j(ti_i[j]); @@ -1919,9 +1911,6 @@ vector>> derive_ranges( // This should not happen assert(false); } - auto end1 = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed1 = end1 - start; - // Check that these all have the right shape. Maybe consider a standard // for loop instead of foreach to give more useful error messages using // the index @@ -1942,12 +1931,6 @@ vector>> derive_ranges( } } } - - auto end2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed2 = end2 - end1; - std::cout << "Block1: " << elapsed1.count() << " ms\n"; - std::cout << "Block2: " << elapsed2.count() << " ms\n"; - return ivals; } @@ -1980,8 +1963,12 @@ bp::object ProjectionEngine::to_map( // For multi-threading, the principle here is that we loop serially // over bunches, and then inside each block all threads loop over // all detectors in parallel, but the sample ranges are pixel-disjoint. + auto start = std::chrono::high_resolution_clock::now(); auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals"); + auto end2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed2 = end2 - start; + std::cout << "Block2: " << elapsed2.count() << " ms\n"; // First loop over serial bunches for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) { From 81f07a6872a1b59946efe5e3e2ce410c14d64e77 Mon Sep 17 00:00:00 2001 From: Matthew Hasselfield Date: Wed, 22 Apr 2026 12:47:37 -0400 Subject: [PATCH 7/7] final cleanup --- demos/demo_proj2.py | 8 ++------ src/Projection.cxx | 40 +++++++++++++++++----------------------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/demos/demo_proj2.py b/demos/demo_proj2.py index a0b12167..ae23207c 100644 --- a/demos/demo_proj2.py +++ b/demos/demo_proj2.py @@ -116,16 +116,12 @@ def get_pixelizor(emap): det_weights[n_det//2] = 0.0 # ... unless we mark it as noisy. # Then back to map. -print('setup cuts...', end='\n ... ') -with Timer(): - cuts = so3g.proj.RangesMatrix.ones(shape=(24, n_det, n_t)) - print('Create timestream...', end='\n ... ') with Timer(): - map1 = pe.to_map(None, ptg, ofs, resp, sig1, det_weights, cuts) + map1 = pe.to_map(None, ptg, ofs, resp, sig1, det_weights, None) # Get the weight map (matrix). -wmap1 = pe.to_weight_map(None, ptg, ofs, resp, det_weights, cuts) +wmap1 = pe.to_weight_map(None, ptg, ofs, resp, det_weights, None) wmap1[1,0] = wmap1[0,1] # fill in unpopulated entries... wmap1[2,0] = wmap1[0,2] wmap1[2,1] = wmap1[1,2] diff --git a/src/Projection.cxx b/src/Projection.cxx index da177816..ccc0a749 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -9,8 +9,6 @@ using namespace std; #include #include -#include - #ifdef _OPENMP # include #endif // ifdef _OPENMP @@ -1963,12 +1961,8 @@ bp::object ProjectionEngine::to_map( // For multi-threading, the principle here is that we loop serially // over bunches, and then inside each block all threads loop over // all detectors in parallel, but the sample ranges are pixel-disjoint. - auto start = std::chrono::high_resolution_clock::now(); auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals"); - auto end2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed2 = end2 - start; - std::cout << "Block2: " << elapsed2.count() << " ms\n"; // First loop over serial bunches for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) { @@ -2353,23 +2347,23 @@ int _index_count(const T &) { return T::index_count; } PYBINDINGS("so3g") { -// EXPORT_PIX(Flat); -// EXPORT_PIX(Quat); + EXPORT_PIX(Flat); + EXPORT_PIX(Quat); EXPORT_PIX(CAR); -// EXPORT_PIX(CEA); -// EXPORT_PIX(ARC); -// EXPORT_PIX(SIN); -// EXPORT_PIX(TAN); -// EXPORT_PIX(ZEA); -// -// EXPORT_PRECOMP(ProjEng_Precomp_NonTiled); -// EXPORT_PRECOMP(ProjEng_Precomp_Tiled); -// -// EXPORT_ENGINE(ProjEng_HP_T_NonTiled); -// EXPORT_ENGINE(ProjEng_HP_QU_NonTiled); -// EXPORT_ENGINE(ProjEng_HP_TQU_NonTiled); -// EXPORT_ENGINE(ProjEng_HP_T_Tiled); -// EXPORT_ENGINE(ProjEng_HP_QU_Tiled); -// EXPORT_ENGINE(ProjEng_HP_TQU_Tiled); + EXPORT_PIX(CEA); + EXPORT_PIX(ARC); + EXPORT_PIX(SIN); + EXPORT_PIX(TAN); + EXPORT_PIX(ZEA); + + EXPORT_PRECOMP(ProjEng_Precomp_NonTiled); + EXPORT_PRECOMP(ProjEng_Precomp_Tiled); + + EXPORT_ENGINE(ProjEng_HP_T_NonTiled); + EXPORT_ENGINE(ProjEng_HP_QU_NonTiled); + EXPORT_ENGINE(ProjEng_HP_TQU_NonTiled); + EXPORT_ENGINE(ProjEng_HP_T_Tiled); + EXPORT_ENGINE(ProjEng_HP_QU_Tiled); + EXPORT_ENGINE(ProjEng_HP_TQU_Tiled); }