From 2b7dd62aa024d7ce2963cf4e74f4fac277c26ff5 Mon Sep 17 00:00:00 2001
From: Gabriel Musat Mestre <gabriel.musatmestre@datadoghq.com>
Date: Sat, 23 May 2026 14:26:35 +0200
Subject: [PATCH] Support dynamic task count assignation

---
 .github/workflows/ci.yml                      |  43 +-
 Cargo.lock                                    |   1 +
 Cargo.toml                                    |   1 +
 benchmarks/cdk/bin/datafusion-bench.ts        |  10 +
 benchmarks/src/run.rs                         |   5 +
 src/common/mod.rs                             |   2 +
 src/common/recursion.rs                       |   1 +
 src/common/vec.rs                             |  80 +++
 src/coordinator/distributed.rs                |   8 +-
 src/coordinator/mod.rs                        |   1 +
 src/coordinator/prepare_dynamic_plan.rs       | 347 ++++++++++
 src/coordinator/prepare_static_plan.rs        |   1 +
 src/coordinator/query_coordinator.rs          |  52 +-
 src/distributed_ext.rs                        |  74 +++
 src/distributed_planner/distributed_config.rs |   9 +
 .../distributed_query_planner.rs              |   8 +
 .../inject_network_boundaries.rs              |  12 +-
 src/distributed_planner/mod.rs                |   6 +-
 src/distributed_planner/network_boundary.rs   |  61 +-
 .../prepare_network_boundaries.rs             |   7 +-
 .../benchmarks/shuffle_bench.rs               |   1 +
 .../benchmarks/transport_bench.rs             |   1 +
 src/execution_plans/mod.rs                    |   2 +
 src/execution_plans/network_broadcast.rs      |  13 +-
 src/execution_plans/network_coalesce.rs       |  20 +-
 src/execution_plans/network_shuffle.rs        |  18 +-
 src/execution_plans/sampler.rs                | 594 ++++++++++++++++++
 src/metrics/bytes_metric.rs                   |  11 +
 src/metrics/task_metrics_rewriter.rs          |   2 +
 src/protobuf/distributed_codec.rs             |  26 +-
 src/stage.rs                                  |  72 ++-
 src/work_unit_feed/remote_work_unit_feed.rs   |  10 +
 src/worker/generated/worker.rs                |  36 +-
 src/worker/impl_coordinator_channel.rs        |  36 +-
 src/worker/task_data.rs                       |   7 +-
 src/worker/worker.proto                       |  27 +
 tests/clickbench_correctness_test.rs          |   6 +-
 tests/metrics_collection.rs                   |  32 +
 tests/stateful_data_cleanup.rs                |  23 +-
 tests/tpcds_correctness_test.rs               |   6 +-
 tests/tpch_correctness_test.rs                |   6 +-
 41 files changed, 1597 insertions(+), 81 deletions(-)
 create mode 100644 src/common/vec.rs
 create mode 100644 src/coordinator/prepare_dynamic_plan.rs
 create mode 100644 src/execution_plans/sampler.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc217c7b..3c5da3fd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,19 +41,33 @@ jobs:
       - uses: ./.github/actions/setup
       - run: cargo test --features integration
 
-  tpch-test:
+  tpch-correctness-test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        planning_mode: [ "adaptive", "static" ]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup
+      - run: cargo test --features tpch --test tpch_correctness_test
+        env:
+          ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }}
+
+  tpch-plans-test:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/setup
-      - run: cargo test --features tpch --test 'tpch_*'
+      - run: cargo test --features tpch --test tpch_plans_test
 
   tpcds-correctness-test:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        shard: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
+        shard: [ "01", "02", "03", "04", "05", "06", "07", "08", "09", "10" ]
+        planning_mode: [ "adaptive", "static" ]
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/setup
@@ -62,6 +76,8 @@ jobs:
           path: testdata/tpcds/main.zip
           key: "main.zip"
       - run: cargo test --features tpcds --test tpcds_correctness_test shard${{ matrix.shard }}
+        env:
+          ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }}
 
   tpcds-plans-test:
     runs-on: ubuntu-latest
@@ -74,7 +90,24 @@ jobs:
           key: "main.zip"
       - run: cargo test --features tpcds --test tpcds_plans_test
 
-  clickbench-test:
+  clickbench-correctness-test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        planning_mode: [ "adaptive", "static" ]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup
+      - uses: actions/cache@v4
+        with:
+          path: testdata/clickbench/
+          key: "data"
+      - run: cargo test --features clickbench --test clickbench_correctness_test
+        env:
+          ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }}
+
+  clickbench-plans-test:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -83,7 +116,7 @@ jobs:
         with:
           path: testdata/clickbench/
           key: "data"
-      - run: cargo test --features clickbench --test 'clickbench_*'
+      - run: cargo test --features clickbench --test clickbench_plans_test
 
   format-check:
     runs-on: ubuntu-latest
diff --git a/Cargo.lock b/Cargo.lock
index ebe698d2..7044adaa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2208,6 +2208,7 @@ dependencies = [
  "insta",
  "itertools 0.14.0",
  "moka",
+ "num-traits",
  "object_store",
  "parquet",
  "pin-project",
diff --git a/Cargo.toml b/Cargo.toml
index 4d6e3e7e..fd5ceab2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,7 @@ moka = { version = "0.12", features = ["sync", "future"] }
 crossbeam-queue = "0.3"
 sysinfo = { version = "0.30", optional = true }
 sketches-ddsketch = { version = "0.3", features = ["use_serde"] }
+num-traits = "0.2"
 bincode = "1"
 tonic-prost = "0.14.2"
 
diff --git a/benchmarks/cdk/bin/datafusion-bench.ts b/benchmarks/cdk/bin/datafusion-bench.ts
index f5d15eea..f32af829 100644
--- a/benchmarks/cdk/bin/datafusion-bench.ts
+++ b/benchmarks/cdk/bin/datafusion-bench.ts
@@ -24,6 +24,8 @@ async function main() {
         .option('--max-tasks-per-stage <number>', 'Max tasks per stage', '0')
         .option('--repartition-file-min-size <number>', 'repartition_file_min_size DF option', '10485760' /* upstream default */)
         .option('--target-partitions <number>', 'target_partitions DF option', '8')
+        .option('--dynamic <boolean>', 'Use the dynamic task count assigner', 'false')
+        .option('--bytes-per-partition-per-second <number>', 'Target throughput in bytes per partition per second for the dynamic task count allocator', `${16 * 1024 * 1024}`)
         .option('--queries <string>', 'Specific queries to run', undefined)
         .option('--debug <boolean>', 'Print the generated plans to stdout')
         .option('--warmup <boolean>', 'Perform a warmup query before the benchmarks', 'true')
@@ -46,6 +48,8 @@ async function main() {
     const childrenIsolatorUnions = options.childrenIsolatorUnions === 'true' || options.childrenIsolatorUnions === 1
     const broadcastJoins = options.broadcastJoins === 'true' || options.broadcastJoins === 1
     const partialReduce = options.partialReduce === 'true' || options.partialReduce === 1
+    const dynamicTaskCount = options.dynamic === 'true' || options.dynamic === 1
+    const bytesPerPartitionPerSecond = parseInt(options.bytesPerPartitionPerSecond)
     const debug = options.debug === true || options.debug === 'true' || options.debug === 1
     const warmup = options.warmup === true || options.warmup === 'true' || options.warmup === 1
 
@@ -59,6 +63,8 @@ async function main() {
         compression,
         broadcastJoins,
         partialReduce,
+        dynamicTaskCount,
+        bytesPerPartitionPerSecond,
         maxTasksPerStage,
         repartitionFileMinSize,
         targetPartitions
@@ -98,6 +104,8 @@ class DataFusionRunner implements BenchmarkRunner {
         childrenIsolatorUnions: boolean;
         broadcastJoins: boolean;
         partialReduce: boolean;
+        dynamicTaskCount: boolean;
+        bytesPerPartitionPerSecond: number;
         maxTasksPerStage: number;
         repartitionFileMinSize: number;
         targetPartitions: number;
@@ -177,6 +185,8 @@ class DataFusionRunner implements BenchmarkRunner {
       SET distributed.children_isolator_unions=${this.options.childrenIsolatorUnions};
       SET distributed.broadcast_joins=${this.options.broadcastJoins};
       SET distributed.partial_reduce=${this.options.partialReduce};
+      SET distributed.dynamic_task_count=${this.options.dynamicTaskCount};
+      SET distributed.bytes_per_partition_per_second=${this.options.bytesPerPartitionPerSecond};
       SET distributed.max_tasks_per_stage=${this.options.maxTasksPerStage};
       SET datafusion.optimizer.repartition_file_min_size=${this.options.repartitionFileMinSize};
       SET datafusion.execution.target_partitions=${this.options.targetPartitions};
diff --git a/benchmarks/src/run.rs b/benchmarks/src/run.rs
index 94345f30..7ecaee98 100644
--- a/benchmarks/src/run.rs
+++ b/benchmarks/src/run.rs
@@ -106,6 +106,10 @@ pub struct RunOpt {
     #[structopt(long, default_value = "0")]
     max_tasks_per_stage: usize,
 
+    /// Activate dynamic task count
+    #[structopt(long)]
+    dynamic: bool,
+
     /// Number of iterations of each test run
     #[structopt(short = "i", long = "iterations", default_value = "5")]
     iterations: usize,
@@ -203,6 +207,7 @@ impl RunOpt {
             .with_distributed_cardinality_effect_task_scale_factor(
                 self.cardinality_task_sf.unwrap_or(1.0),
             )?
+            .with_distributed_dynamic_task_count(self.dynamic)?
             .with_distributed_compression(match self.compression.as_str() {
                 "zstd" => Some(CompressionType::ZSTD),
                 "lz4" => Some(CompressionType::LZ4_FRAME),
diff --git a/src/common/mod.rs b/src/common/mod.rs
index bf9ed549..18cb28a1 100644
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -5,6 +5,7 @@ mod recursion;
 mod task_context_helpers;
 mod time;
 mod uuid;
+mod vec;
 
 pub(crate) use children_helpers::require_one_child;
 pub(crate) use on_drop_stream::on_drop_stream;
@@ -13,3 +14,4 @@ pub(crate) use recursion::TreeNodeExt;
 pub(crate) use task_context_helpers::task_ctx_with_extension;
 pub(crate) use time::now_ns;
 pub(crate) use uuid::{deserialize_uuid, serialize_uuid};
+pub(crate) use vec::{element_wise_sum, vec_avg_reduce, vec_cast, vec_div, vec_mul};
diff --git a/src/common/recursion.rs b/src/common/recursion.rs
index 2f2b9463..5c16116e 100644
--- a/src/common/recursion.rs
+++ b/src/common/recursion.rs
@@ -589,6 +589,7 @@ mod tests {
                 query_id: uuid::Uuid::nil(),
                 num: 0,
                 workers: vec![],
+                runtime_stats: None,
             }))
             .unwrap()
     }
diff --git a/src/common/vec.rs b/src/common/vec.rs
new file mode 100644
index 00000000..ec62678b
--- /dev/null
+++ b/src/common/vec.rs
@@ -0,0 +1,80 @@
+use datafusion::common::internal_err;
+use datafusion::error::Result;
+use num_traits::AsPrimitive;
+use std::ops::{AddAssign, DivAssign, MulAssign};
+
+/// Builds a `Vec<O>` by casting every element of the `I` slice with `as`-style primitive casting.
+pub(crate) fn vec_cast<I, O>(input: &[I]) -> Vec<O>
+where
+    I: AsPrimitive<O>,
+    O: Copy + 'static,
+{
+    input.iter().copied().map(|value| value.as_()).collect()
+}
+
+/// Accumulates `other` into `one` position by position, casting each `O` to `I` via `AsPrimitive`.
+pub(crate) fn element_wise_sum<I, O>(mut one: Vec<I>, other: &[O]) -> Result<Vec<I>>
+where
+    I: AddAssign + Copy + 'static,
+    O: AsPrimitive<I> + 'static,
+{
+    if one.len() != other.len() {
+        return internal_err!("Cannot do an element wise sum of two vectors of different lengths");
+    }
+    for (acc, addend) in one.iter_mut().zip(other) {
+        *acc += addend.as_();
+    }
+    Ok(one)
+}
+
+/// Scales `one` in place by the scalar `other` (cast to `I` via `AsPrimitive`) and returns it.
+pub(crate) fn vec_mul<I, O>(mut one: Vec<I>, other: O) -> Vec<I>
+where
+    I: MulAssign + Copy + 'static,
+    O: AsPrimitive<I> + 'static,
+{
+    // The scalar does not change across elements, so it is cast a single time up front.
+    let factor: I = other.as_();
+    one.iter_mut().for_each(|el| *el *= factor);
+    one
+}
+
+/// Divides each element of `one` in place by the scalar `other` (cast to `I`) and returns it.
+pub(crate) fn vec_div<I, O>(mut one: Vec<I>, other: O) -> Vec<I>
+where
+    I: DivAssign + Copy + 'static,
+    O: AsPrimitive<I> + 'static,
+{
+    // The scalar does not change across elements, so it is cast a single time up front.
+    let divisor: I = other.as_();
+    one.iter_mut().for_each(|el| *el /= divisor);
+    one
+}
+
+/// Averages a collection of same-length `f32` vectors element-wise into a single vector.
+///
+/// Empty inner vecs are skipped and do not count as samples, so they cannot dilute the average.
+/// Returns an empty vec if every input is empty (or if there are no inputs at all).
+///
+/// # Errors
+/// Returns an internal error if two non-empty vecs have different lengths.
+pub(crate) fn vec_avg_reduce(vecs: Vec<Vec<f32>>) -> Result<Vec<f32>> {
+    // Only non-empty vecs are summed below, so only those may be counted in the divisor;
+    // dividing by `vecs.len()` would skew the average towards zero whenever a vec is skipped.
+    let sample_count = vecs.iter().filter(|v| !v.is_empty()).count();
+    let mut iter = vecs.into_iter().filter(|v| !v.is_empty());
+    let Some(mut acc) = iter.next() else {
+        return Ok(vec![]);
+    };
+    for v in iter {
+        if acc.len() != v.len() {
+            return internal_err!(
+                "vec_avg_reduce: length mismatch — first vec has {} elements, got {}",
+                acc.len(),
+                v.len()
+            );
+        }
+        acc = element_wise_sum(acc, &v)?;
+    }
+    Ok(vec_div(acc, sample_count as f32))
+}
diff --git a/src/coordinator/distributed.rs b/src/coordinator/distributed.rs
index fe1bbff3..d7d62a08 100644
--- a/src/coordinator/distributed.rs
+++ b/src/coordinator/distributed.rs
@@ -1,5 +1,7 @@
+use crate::DistributedConfig;
 use crate::common::{require_one_child, serialize_uuid};
 use crate::coordinator::metrics_store::MetricsStore;
+use crate::coordinator::prepare_dynamic_plan::prepare_dynamic_plan;
 use crate::coordinator::prepare_static_plan::prepare_static_plan;
 use crate::coordinator::query_coordinator::QueryCoordinator;
 use crate::distributed_planner::NetworkBoundaryExt;
@@ -198,7 +200,11 @@ impl ExecutionPlan for DistributedExec {
         builder.spawn(async move {
             let _guard = query_coordinator.end_query_guard();
 
-            let result = prepare_static_plan(&query_coordinator, &base_plan)?;
+            let d_cfg = DistributedConfig::from_config_options(context.session_config().options())?;
+            let result = match d_cfg.dynamic_task_count {
+                true => prepare_dynamic_plan(&query_coordinator, &base_plan).await?,
+                false => prepare_static_plan(&query_coordinator, &base_plan)?,
+            };
 
             plan_for_viz
                 .lock()
diff --git a/src/coordinator/mod.rs b/src/coordinator/mod.rs
index 8fe771d3..c1a8a8dd 100644
--- a/src/coordinator/mod.rs
+++ b/src/coordinator/mod.rs
@@ -1,6 +1,7 @@
 mod distributed;
 mod latency_metric;
 mod metrics_store;
+mod prepare_dynamic_plan;
 mod prepare_static_plan;
 mod query_coordinator;
 
diff --git a/src/coordinator/prepare_dynamic_plan.rs b/src/coordinator/prepare_dynamic_plan.rs
new file mode 100644
index 00000000..12d49002
--- /dev/null
+++ b/src/coordinator/prepare_dynamic_plan.rs
@@ -0,0 +1,347 @@
+use crate::TaskCountAnnotation::{Desired, Maximum};
+use crate::common::{TreeNodeExt, element_wise_sum, vec_avg_reduce, vec_div, vec_mul};
+use crate::coordinator::distributed::PreparedPlan;
+use crate::coordinator::query_coordinator::QueryCoordinator;
+use crate::distributed_planner::{
+    InjectNetworkBoundaryContext, NetworkBoundaryBuilderResult, ProducerHead, calculate_cost,
+    inject_network_boundaries,
+};
+use crate::execution_plans::SamplerExec;
+use crate::stage::{LocalStage, RemoteStage};
+use crate::worker::generated::worker as pb;
+use crate::{BytesCounterMetric, NetworkBoundaryExt, NetworkCoalesceExec, Stage};
+use dashmap::DashMap;
+use datafusion::common::stats::Precision;
+use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
+use datafusion::common::{Result, exec_err, plan_err};
+use datafusion::physical_plan::metrics::MetricsSet;
+use datafusion::physical_plan::{
+    ColumnStatistics, ExecutionPlan, ExecutionPlanProperties, Statistics,
+};
+use futures::{Stream, StreamExt};
+use std::any::TypeId;
+use std::sync::Arc;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+
+pub(super) async fn prepare_dynamic_plan(
+    query_coordinator: &QueryCoordinator,
+    base_plan: &Arc<dyn ExecutionPlan>,
+) -> Result<PreparedPlan> {
+    let plans_for_viz = Arc::new(PlanReconstructor::default());
+
+    let head_stage = inject_network_boundaries(
+        Arc::clone(base_plan),
+        |mut input_stage: LocalStage, nb_type: TypeId, nb_ctx: &InjectNetworkBoundaryContext| {
+            let mut metrics = MetricsSet::new();
+
+            // At this point, input_stage.plan has two kinds of leaf nodes:
+            // - The ones that naturally do not read from any children, like DataSourceExec
+            // - Network boundaries whose Stage was set to Stage::Remote by a previous iteration
+            //   of this same function.
+            // Both types of leaf nodes contain very valuable and accurate statistics that are used
+            // here for computing an estimation of the compute cost (measured in bytes):
+            // - DataSourceExec (or natural leaf nodes) contain stats pulled directly from their
+            //   data source, like parquet files.
+            // - Network boundaries contain statistics collected from runtime information, gathered
+            //   by the SamplerExec injected by this same function.
+            let compute_cost = calculate_cost(&input_stage.plan, nb_ctx.d_cfg)?;
+            metrics.push(BytesCounterMetric::new_metric("compute_cost", compute_cost));
+            let compute_based_task_count = compute_cost
+                .div_ceil(nb_ctx.d_cfg.bytes_per_partition_per_second.max(1))
+                .div_ceil(input_stage.plan.output_partitioning().partition_count())
+                .clamp(1, nb_ctx.max_tasks()?);
+            let task_count = nb_ctx
+                .task_count(&input_stage.plan)?
+                .merge(Desired(compute_based_task_count));
+
+            // Propagate the final task_count inferred based on runtime statistics and compute cost.
+            // Here is where leaf nodes are scaled up by TaskEstimator::scale_up_leaf_node, and the
+            // plan is finally left ready for distribution.
+            input_stage.plan = nb_ctx
+                .propagate_task_count_until_network_boundaries(&input_stage.plan, task_count)?;
+            input_stage.tasks = task_count.as_usize();
+            // In order to infer the compute cost of the stage above this one, a sampler is
+            // injected here to gather runtime statistics.
+            input_stage.plan = ProducerHead::insert_sampler(input_stage.plan)?;
+
+            let mut stage_coordinator = query_coordinator.stage_coordinator(&input_stage);
+
+            let mut workers = Vec::with_capacity(input_stage.tasks);
+            let mut load_info_rxs = Vec::with_capacity(input_stage.tasks);
+
+            let routed_urls = if input_stage.tasks == 1 {
+                // If there's an input stage with a single worker, and the current stage is also
+                // going to run in a single worker, we want to co-locate them so that unnecessary
+                // network transfers are avoided.
+                match stage_coordinator.find_input_stage_with_single_url() {
+                    Some(single_url) => vec![single_url],
+                    None => stage_coordinator.routed_urls()?,
+                }
+            } else {
+                stage_coordinator.routed_urls()?
+            };
+
+            for (i, routed_url) in routed_urls.into_iter().enumerate() {
+                workers.push(routed_url.clone());
+                // Spawns the task that feeds this subplan to this worker. There will be as
+                // many of these spawned tasks as workers.
+                let (worker_tx, worker_rx) = stage_coordinator.send_plan_task(i, routed_url)?;
+                load_info_rxs.push({
+                    let rx = stage_coordinator.worker_to_coordinator_task(i, worker_rx);
+                    UnboundedReceiverStream::new(rx)
+                });
+                stage_coordinator.coordinator_to_worker_task(i, worker_tx)?;
+            }
+
+            let plans_for_viz = Arc::clone(&plans_for_viz);
+            Ok(async move {
+                let (stats, consumer_tc) = if nb_type == TypeId::of::<NetworkCoalesceExec>() {
+                    (None, Maximum(1))
+                } else {
+                    let stats = gather_runtime_statistics(load_info_rxs, &input_stage.plan).await?;
+                    let sampled_bytes = *stats.total_byte_size.get_value().unwrap_or(&0);
+                    metrics.push(BytesCounterMetric::new_metric(
+                        "sampled_bytes",
+                        sampled_bytes,
+                    ));
+                    // Returning Desired(1) here is our way to tell the planner that we don't care
+                    // about the task count assigned to the network boundary in the consumer stage,
+                    // and we don't want it to affect other task count decisions.
+                    (Some(Arc::new(stats)), Desired(1))
+                };
+
+                // Capture the output partitioning of the (rescaled, sampler-wrapped) input plan
+                // before it's moved: the returned stage is remote and carries no plan to read it
+                // back from.
+                let input_properties = Arc::clone(input_stage.plan.properties());
+                plans_for_viz.insert(input_stage.num, input_stage.plan, metrics);
+                Ok(NetworkBoundaryBuilderResult {
+                    consumer_task_count: consumer_tc,
+                    input_stage: Stage::Remote(RemoteStage {
+                        query_id: input_stage.query_id,
+                        num: input_stage.num,
+                        workers,
+                        runtime_stats: stats,
+                    }),
+                    input_properties,
+                })
+            })
+        },
+        query_coordinator.session_config().options(),
+    )
+    .await?;
+
+    Ok(PreparedPlan {
+        plan_for_viz: plans_for_viz.reconstruct(&head_stage)?,
+        head_stage,
+    })
+}
+
+/// Reconstructs the plan dynamically as stages get transitioned to Remote and get sent to the
+/// respective workers.
+///
+/// As the [prepare_dynamic_plan] function recurses and progressively sends the plan to workers, the
+/// original plan gets modified, and subplans belonging to the different [Stage]s get lost as they
+/// get transitioned to [Stage::Remote].
+///
+/// This struct is in charge of tracking the [prepare_dynamic_plan] process and storing the final
+/// version of all the subplans so that it can be reconstructed into a full-blown plan for
+/// visualization purposes.
+#[derive(Default)]
+struct PlanReconstructor {
+    stage_map: DashMap<usize, (Arc<dyn ExecutionPlan>, MetricsSet)>,
+}
+
+impl PlanReconstructor {
+    /// Records the subplan and planning metrics of the stage numbered `stage`.
+    fn insert(&self, stage: usize, plan: Arc<dyn ExecutionPlan>, metrics_set: MetricsSet) {
+        self.stage_map.insert(stage, (plan, metrics_set));
+    }
+
+    /// Rebuilds the whole plan for visualization: each network boundary under `head_stage` gets
+    /// its input stage swapped for a [Stage::Local] holding the subplan recorded for that stage.
+    fn reconstruct(&self, head_stage: &Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+        let root = Arc::clone(head_stage);
+        let rebuilt = root.transform_down_with_task_count(1, |node, task_count| {
+            let Some(boundary) = node.as_network_boundary() else {
+                return Ok(Transformed::no(node));
+            };
+            let stage = boundary.input_stage();
+            let stage_num = stage.num();
+            let Some((_, (subplan, metrics_set))) = self.stage_map.remove(&stage_num) else {
+                return exec_err!(
+                    "Failed to retrieve plan for stage {} for visualization purposes",
+                    stage_num
+                );
+            };
+            let subplan = boundary.producer_head(task_count).insert(subplan)?;
+            let local = Stage::Local(LocalStage {
+                query_id: stage.query_id(),
+                num: stage_num,
+                plan: subplan,
+                tasks: stage.task_count(),
+                metrics_set,
+            });
+            Ok(Transformed::yes(boundary.with_input_stage(local)?))
+        })?;
+        Ok(rebuilt.data)
+    }
+}
+
+/// Extrapolates stage-wide [Statistics] from the [pb::LoadInfo] samples reported by its tasks.
+async fn gather_runtime_statistics(
+    per_task_load_info_stream: Vec<impl Stream<Item = pb::LoadInfo> + Unpin>,
+    plan: &Arc<dyn ExecutionPlan>,
+) -> Result<Statistics> {
+    const ESTIMATED_QUERY_TIME_S: usize = 10;
+    const BYTES_READY_SAMPLE_PERCENTAGE: f32 = 0.2;
+    const BYTES_PER_SECOND_SAMPLE_PERCENTAGE: f32 = 0.2;
+
+    let Some(sampler) = find_sampler(plan) else {
+        return plan_err!("Missing SamplerExec while gathering load report");
+    };
+    let n_cols = sampler.schema().fields.len();
+
+    fn apply_pct(value: usize, pct: f32) -> usize {
+        (value as f32 * pct).round() as usize
+    }
+
+    let partitions_per_task = sampler.partition_samplers.len();
+    let task_count = per_task_load_info_stream.len();
+    let total_partitions = partitions_per_task * task_count;
+
+    let mut partitions_with_bytes_per_second_done = 0;
+    let mut partitions_with_bytes_ready_done = 0;
+    let mut partitions_done = 0;
+    let mut rows_ready = 0;
+    let mut rows_per_second = 0;
+    let mut per_col_bytes_ready = vec![0usize; n_cols];
+    let mut per_col_bytes_per_second = vec![0usize; n_cols];
+
+    let mut ndv_pct = vec![];
+    let mut null_pct = vec![];
+
+    let mut load_info_stream = futures::stream::select_all(per_task_load_info_stream);
+    while let Some(load_info) = load_info_stream.next().await {
+        rows_per_second += load_info.rows_per_second as usize;
+        rows_ready += load_info.rows_ready as usize;
+        per_col_bytes_per_second = element_wise_sum(
+            per_col_bytes_per_second,
+            &load_info.per_column_bytes_per_second,
+        )?;
+        per_col_bytes_ready =
+            element_wise_sum(per_col_bytes_ready, &load_info.per_column_bytes_ready)?;
+        ndv_pct.push(load_info.per_column_ndv_percentage);
+        null_pct.push(load_info.per_column_null_percentage);
+
+        partitions_with_bytes_per_second_done +=
+            load_info.per_column_bytes_per_second.iter().any(|v| *v > 0) as usize;
+        partitions_with_bytes_ready_done +=
+            load_info.per_column_bytes_ready.iter().any(|v| *v > 0) as usize;
+        partitions_done += 1;
+
+        // Short circuit if we collected enough bytes_ready measurements.
+        if partitions_with_bytes_ready_done
+            >= apply_pct(total_partitions, BYTES_READY_SAMPLE_PERCENTAGE).max(1)
+        {
+            break;
+        }
+
+        // Short circuit if we collected enough bytes_per_second measurements.
+        if partitions_with_bytes_per_second_done
+            >= apply_pct(total_partitions, BYTES_PER_SECOND_SAMPLE_PERCENTAGE).max(1)
+        {
+            break;
+        }
+
+        // Short circuit if there are no further partitions remaining to sample from.
+        if partitions_done == total_partitions {
+            break;
+        }
+    }
+
+    if partitions_done == 0 {
+        return Ok(zero_stats(n_cols));
+    }
+    // Scale the sampled totals up from the partitions seen so far to all partitions.
+    let per_col_bytes_ready = vec_div(
+        vec_mul(per_col_bytes_ready, total_partitions),
+        partitions_done,
+    );
+    let per_col_bytes_per_second = vec_div(
+        vec_mul(per_col_bytes_per_second, total_partitions),
+        partitions_done,
+    );
+
+    let rows_ready = rows_ready * total_partitions / partitions_done;
+    let rows_per_second = rows_per_second * total_partitions / partitions_done;
+
+    let total_num_rows = rows_ready + rows_per_second * ESTIMATED_QUERY_TIME_S;
+
+    if total_num_rows == 0 {
+        return Ok(zero_stats(n_cols));
+    }
+
+    let per_col_byte_size = element_wise_sum(
+        per_col_bytes_ready,
+        &vec_mul(per_col_bytes_per_second, ESTIMATED_QUERY_TIME_S),
+    )?;
+    let total_byte_size: usize = per_col_byte_size.iter().sum();
+
+    let ndv_pct = vec_avg_reduce(ndv_pct)?;
+    if ndv_pct.len() != n_cols {
+        return plan_err!("Expected {n_cols} ndv values, but got {}", ndv_pct.len());
+    }
+    let null_pct = vec_avg_reduce(null_pct)?;
+    if null_pct.len() != n_cols {
+        return plan_err!("Expected {n_cols} null values, but got {}", null_pct.len());
+    }
+
+    Ok(Statistics {
+        num_rows: Precision::Inexact(total_num_rows),
+        total_byte_size: Precision::Inexact(total_byte_size),
+        column_statistics: ndv_pct
+            .into_iter()
+            .zip(null_pct)
+            .zip(per_col_byte_size)
+            .map(|((ndv, null), col_bytes)| ColumnStatistics {
+                null_count: Precision::Inexact((null * total_num_rows as f32) as usize),
+                distinct_count: Precision::Inexact((ndv * total_num_rows as f32) as usize),
+                byte_size: Precision::Inexact(col_bytes),
+                max_value: Precision::Absent,
+                min_value: Precision::Absent,
+                sum_value: Precision::Absent,
+            })
+            .collect(),
+    })
+}
+
+/// Returns the first [SamplerExec] found while walking `plan`, or `None` if there is none.
+fn find_sampler(plan: &Arc<dyn ExecutionPlan>) -> Option<&SamplerExec> {
+    let mut found = None;
+    plan.apply(|node| {
+        found = node.downcast_ref::<SamplerExec>();
+        // The walk stops at the first match; until then `found` stays `None`.
+        Ok(found.map_or(TreeNodeRecursion::Continue, |_| TreeNodeRecursion::Stop))
+    })
+    // The visitor closure above only ever returns `Ok`.
+    .expect("Cannot fail");
+    found
+}
+
+/// Builds [Statistics] with exact zero counts and sizes for `n_cols` columns (min/max/sum absent).
+fn zero_stats(n_cols: usize) -> Statistics {
+    let empty_column = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Absent,
+        min_value: Precision::Absent,
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Exact(0),
+        byte_size: Precision::Exact(0),
+    };
+    Statistics {
+        num_rows: Precision::Exact(0),
+        total_byte_size: Precision::Exact(0),
+        column_statistics: vec![empty_column; n_cols],
+    }
+}
diff --git a/src/coordinator/prepare_static_plan.rs b/src/coordinator/prepare_static_plan.rs
index 65d74276..3bfc2d2f 100644
--- a/src/coordinator/prepare_static_plan.rs
+++ b/src/coordinator/prepare_static_plan.rs
@@ -50,6 +50,7 @@ pub(super) fn prepare_static_plan(
                 query_id: stage.query_id,
                 num: stage.num,
                 workers,
+                runtime_stats: None,
             },
         ))?))
     })?;
diff --git a/src/coordinator/query_coordinator.rs b/src/coordinator/query_coordinator.rs
index 3d19c750..6f0010e6 100644
--- a/src/coordinator/query_coordinator.rs
+++ b/src/coordinator/query_coordinator.rs
@@ -12,17 +12,19 @@ use crate::worker::generated::worker::coordinator_to_worker_msg::Inner;
 use crate::worker::generated::worker::set_plan_request::WorkUnitFeedDeclaration;
 use crate::{
     BytesCounterMetric, BytesMetricExt, DISTRIBUTED_DATAFUSION_TASK_ID_LABEL, DistributedCodec,
-    DistributedConfig, DistributedTaskContext, DistributedWorkUnitFeedContext, TaskEstimator,
-    TaskKey, TaskRoutingContext, get_distributed_channel_resolver, get_distributed_worker_resolver,
+    DistributedConfig, DistributedTaskContext, DistributedWorkUnitFeedContext, NetworkBoundaryExt,
+    Stage, TaskEstimator, TaskKey, TaskRoutingContext, get_distributed_channel_resolver,
+    get_distributed_worker_resolver,
 };
 use datafusion::common::instant::Instant;
 use datafusion::common::runtime::JoinSet;
-use datafusion::common::tree_node::{Transformed, TreeNodeRecursion};
+use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
 use datafusion::common::{DataFusionError, exec_datafusion_err};
 use datafusion::common::{Result, exec_err};
 use datafusion::execution::TaskContext;
 use datafusion::physical_expr_common::metrics::{ExecutionPlanMetricsSet, Label, MetricBuilder};
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionConfig;
 use datafusion_proto::physical_plan::AsExecutionPlan;
 use datafusion_proto::protobuf::PhysicalPlanNode;
 use futures::{Stream, StreamExt};
@@ -88,6 +90,11 @@ impl QueryCoordinator {
         }
     }
 
+    /// Returns the [SessionConfig] for the current query, as held by `self.task_ctx`.
+    pub(super) fn session_config(&self) -> &SessionConfig {
+        self.task_ctx.session_config()
+    }
+
     /// returns a guard that, when dropped, it signals all the coordinator->worker connections that
     /// the query is finished, ending them, and propagating the EOS to the workers so that they can
     /// clean up any remaining state.
@@ -200,8 +207,8 @@ impl<'a> StageCoordinator<'a> {
             let mut worker_to_coordinator_stream = response.into_inner();
             while let Some(msg_or_err) = worker_to_coordinator_stream.next().await {
                 let msg = msg_or_err.map_err(|err| {
-                    tonic_status_to_datafusion_error(err).unwrap_or_else(|| {
-                        exec_datafusion_err!("Unknown error on worker to coordinator stream")
+                    tonic_status_to_datafusion_error(&err).unwrap_or_else(|| {
+                        exec_datafusion_err!("Unknown error on worker to coordinator stream: {err}")
                     })
                 })?;
                 if worker_to_coordinator_tx.send(msg).is_err() {
@@ -221,13 +228,15 @@ impl<'a> StageCoordinator<'a> {
         &mut self,
         task_i: usize,
         mut worker_to_coordinator_rx: UnboundedReceiver<pb::WorkerToCoordinatorMsg>,
-    ) {
+    ) -> UnboundedReceiver<pb::LoadInfo> {
         let task_key = TaskKey {
             query_id: serialize_uuid(&self.query_id),
             stage_id: self.stage_id as u64,
             task_number: task_i as u64,
         };
         let task_metrics = self.metrics_store.clone();
+        let (load_info_tx, load_info_rx) = tokio::sync::mpsc::unbounded_channel();
+        let mut load_info_tx_opt = Some(load_info_tx);
 
         // Cannot use self.join_set because that's tied to the lifetime of the query, and the
         // metrics collection process might outlive the query's lifetime.
@@ -242,9 +251,18 @@ impl<'a> StageCoordinator<'a> {
                             task_metrics.insert(task_key.clone(), pre_order_metrics);
                         }
                     }
+                    pb::worker_to_coordinator_msg::Inner::LoadInfo(load_info) => {
+                        if let Some(tx) = &load_info_tx_opt {
+                            let _ = tx.send(load_info);
+                        }
+                    }
+                    pb::worker_to_coordinator_msg::Inner::LoadInfoEos(_) => {
+                        let _ = load_info_tx_opt.take();
+                    }
                 }
             }
         });
+        load_info_rx
     }
 
     /// Spawns a background task in charge of sending messages to workers. Some things that are sent
@@ -401,6 +419,28 @@ impl<'a> StageCoordinator<'a> {
         }
         Ok(routed_urls)
     }
+
+    /// Returns the URL of the first [Stage::Remote] input stage found in `self.plan` that has
+    /// exactly one worker. The children of any other network boundary are not visited.
+    pub(super) fn find_input_stage_with_single_url(&self) -> Option<Url> {
+        let mut url = None;
+        self.plan
+            .apply(|node| {
+                let Some(boundary) = node.as_network_boundary() else {
+                    return Ok(TreeNodeRecursion::Continue);
+                };
+                match boundary.input_stage() {
+                    Stage::Remote(remote) if remote.workers.len() == 1 => {
+                        url = Some(remote.workers[0].clone());
+                        Ok(TreeNodeRecursion::Stop)
+                    }
+                    // Any other boundary: do not descend into its children.
+                    _ => Ok(TreeNodeRecursion::Jump),
+                }
+            })
+            .expect("Cannot fail");
+        url
+    }
 }
 
 fn keep_stream_alive<T: 'static>(notify: Arc<Notify>) -> impl Stream<Item = T> + 'static {
diff --git a/src/distributed_ext.rs b/src/distributed_ext.rs
index 17852a04..d33b18a6 100644
--- a/src/distributed_ext.rs
+++ b/src/distributed_ext.rs
@@ -577,6 +577,27 @@ pub trait DistributedExt: Sized {
         P: WorkUnitFeedProvider + 'static,
         P::WorkUnit: 'static,
         F: Fn(&T) -> Option<&WorkUnitFeed<P>> + Send + Sync + 'static;
+
+    /// Dynamically allocates tasks to the different stages based on runtime statistics
+    /// collected during execution.
+    fn with_distributed_dynamic_task_count(self, enabled: bool) -> Result<Self, DataFusionError>;
+
+    /// Same as [DistributedExt::with_distributed_dynamic_task_count] but with an in-place mutation.
+    fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>;
+
+    /// Target throughput in bytes per partition per second used by the dynamic task count
+    /// allocator to decide how many tasks to assign to each stage based on runtime statistics.
+    fn with_distributed_bytes_per_partition_per_second(
+        self,
+        bytes_per_partition_per_second: usize,
+    ) -> Result<Self, DataFusionError>;
+
+    /// Same as [DistributedExt::with_distributed_bytes_per_partition_per_second] but with an
+    /// in-place mutation.
+    fn set_distributed_bytes_per_partition_per_second(
+        &mut self,
+        bytes_per_partition_per_second: usize,
+    ) -> Result<(), DataFusionError>;
 }
 
 impl DistributedExt for SessionConfig {
@@ -722,6 +743,21 @@ impl DistributedExt for SessionConfig {
         })
     }
 
+    fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError> {
+        let d_cfg = DistributedConfig::from_config_options_mut(self.options_mut())?;
+        d_cfg.dynamic_task_count = enabled;
+        Ok(())
+    }
+
+    fn set_distributed_bytes_per_partition_per_second(
+        &mut self,
+        bytes_per_partition_per_second: usize,
+    ) -> Result<(), DataFusionError> {
+        let d_cfg = DistributedConfig::from_config_options_mut(self.options_mut())?;
+        d_cfg.bytes_per_partition_per_second = bytes_per_partition_per_second;
+        Ok(())
+    }
+
     delegate! {
         to self {
             #[call(set_distributed_option_extension)]
@@ -804,6 +840,14 @@ impl DistributedExt for SessionConfig {
                 P: WorkUnitFeedProvider + 'static,
                 P::WorkUnit: 'static,
                 F: Fn(&T) -> Option<&WorkUnitFeed<P>> + Send + Sync + 'static;
+
+            #[call(set_distributed_dynamic_task_count)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result<Self, DataFusionError>;
+
+            #[call(set_distributed_bytes_per_partition_per_second)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result<Self, DataFusionError>;
         }
     }
 }
@@ -915,6 +959,16 @@ impl DistributedExt for SessionStateBuilder {
                 P: WorkUnitFeedProvider + 'static,
                 P::WorkUnit: 'static,
                 F: Fn(&T) -> Option<&WorkUnitFeed<P>> + Send + Sync + 'static;
+
+            fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>;
+            #[call(set_distributed_dynamic_task_count)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result<Self, DataFusionError>;
+
+            fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>;
+            #[call(set_distributed_bytes_per_partition_per_second)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result<Self, DataFusionError>;
         }
     }
 }
@@ -1026,6 +1080,16 @@ impl DistributedExt for SessionState {
                 P: WorkUnitFeedProvider + 'static,
                 P::WorkUnit: 'static,
                 F: Fn(&T) -> Option<&WorkUnitFeed<P>> + Send + Sync + 'static;
+
+            fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>;
+            #[call(set_distributed_dynamic_task_count)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result<Self, DataFusionError>;
+
+            fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>;
+            #[call(set_distributed_bytes_per_partition_per_second)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result<Self, DataFusionError>;
         }
     }
 }
@@ -1137,6 +1201,16 @@ impl DistributedExt for SessionContext {
                 P: WorkUnitFeedProvider + 'static,
                 P::WorkUnit: 'static,
                 F: Fn(&T) -> Option<&WorkUnitFeed<P>> + Send + Sync + 'static;
+
+            fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>;
+            #[call(set_distributed_dynamic_task_count)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_dynamic_task_count(self, enabled: bool) -> Result<Self, DataFusionError>;
+
+            fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>;
+            #[call(set_distributed_bytes_per_partition_per_second)]
+            #[expr($?;Ok(self))]
+            fn with_distributed_bytes_per_partition_per_second(self, bytes_per_partition_per_second: usize) -> Result<Self, DataFusionError>;
         }
     }
 }
diff --git a/src/distributed_planner/distributed_config.rs b/src/distributed_planner/distributed_config.rs
index 6795fd95..c1e804e0 100644
--- a/src/distributed_planner/distributed_config.rs
+++ b/src/distributed_planner/distributed_config.rs
@@ -70,6 +70,15 @@ extensions_options! {
         /// should be used in serving the query. Some plans might not implement any kind of row count
         /// estimation, and this parameter sets the default estimated row count for those plans.
         pub default_estimated_row_count: Option<usize>, default = Some(0)
+        /// Calculates the task count of the different stages at execution time, based on runtime
+        /// information collected by sampling at the head of the stages.
+        ///
+        /// With this option enabled, the shape of the distributed plan is only known after fully
+        /// executing it, as it's dynamically created on the fly during execution.
+        pub dynamic_task_count: bool, default = false
+        /// If `dynamic_task_count` is enabled, this value is the amount of bytes/second each
+        /// partition is expected to handle. Lower values will result in greater parallelism.
+        pub bytes_per_partition_per_second: usize, default = 16 * 1024 * 1024
         /// Collection of [TaskEstimator]s that will be applied to leaf nodes in order to
         /// estimate how many tasks should be spawned for the [Stage] containing the leaf node.
         pub(crate) __private_task_estimator: CombinedTaskEstimator, default = CombinedTaskEstimator::default()
diff --git a/src/distributed_planner/distributed_query_planner.rs b/src/distributed_planner/distributed_query_planner.rs
index f5ac6168..8e2d8f2c 100644
--- a/src/distributed_planner/distributed_query_planner.rs
+++ b/src/distributed_planner/distributed_query_planner.rs
@@ -108,6 +108,14 @@ impl QueryPlanner for DistributedQueryPlanner {
 
         plan = insert_broadcast_execs(plan, cfg)?;
 
+        if d_cfg.dynamic_task_count {
+            // The task count will be decided dynamically at execution time.
+            return Ok(Arc::new(
+                DistributedExec::new(plan).with_metrics_collection(d_cfg.collect_metrics),
+            ));
+        }
+
+        // Compute per-node task counts and inject `Network*Exec` nodes at the stage boundaries.
         plan = inject_network_boundaries(plan, CardinalityBasedNetworkBoundaryBuilder, cfg).await?;
 
         plan = prepare_network_boundaries(plan)?;
diff --git a/src/distributed_planner/inject_network_boundaries.rs b/src/distributed_planner/inject_network_boundaries.rs
index 21898440..78fbe6b9 100644
--- a/src/distributed_planner/inject_network_boundaries.rs
+++ b/src/distributed_planner/inject_network_boundaries.rs
@@ -152,8 +152,9 @@ pub(crate) async fn inject_network_boundaries(
 
 #[derive(Clone)]
 pub(crate) struct InjectNetworkBoundaryContext<'a> {
+    pub(crate) d_cfg: &'a DistributedConfig,
+
     cfg: &'a ConfigOptions,
-    d_cfg: &'a DistributedConfig,
     nb_builder: &'a (dyn NetworkBoundaryBuilder + Send + Sync),
     task_counts: &'a Mutex<HashMap<usize, TaskCountAnnotation>>,
     query_id: Uuid,
@@ -161,7 +162,7 @@ pub(crate) struct InjectNetworkBoundaryContext<'a> {
 }
 
 impl<'a> InjectNetworkBoundaryContext<'a> {
-    fn max_tasks(&self) -> Result<usize> {
+    pub(crate) fn max_tasks(&self) -> Result<usize> {
         Ok(match self.d_cfg.max_tasks_per_stage {
             0 => self
                 .d_cfg
@@ -190,7 +191,7 @@ impl<'a> InjectNetworkBoundaryContext<'a> {
         plan
     }
 
-    fn task_count(&self, plan: &Arc<dyn ExecutionPlan>) -> Result<TaskCountAnnotation> {
+    pub(crate) fn task_count(&self, plan: &Arc<dyn ExecutionPlan>) -> Result<TaskCountAnnotation> {
         let Some(task_count) = self
             .task_counts
             .lock()
@@ -294,6 +295,7 @@ async fn _inject_network_boundaries(
                 num: nb_ctx.fetch_add_stage_id(),
                 plan: nb_ctx.plan_with_task_count(plan, task_count),
                 tasks: task_count.as_usize(),
+                metrics_set: Default::default(),
             };
             let result = nb_ctx
                 .nb_builder
@@ -323,6 +325,7 @@ async fn _inject_network_boundaries(
                 num: nb_ctx.fetch_add_stage_id(),
                 plan: nb_ctx.plan_with_task_count(plan, task_count),
                 tasks: task_count.as_usize(),
+                metrics_set: Default::default(),
             };
             let result = nb_ctx
                 .nb_builder
@@ -339,6 +342,7 @@ async fn _inject_network_boundaries(
                 num: nb_ctx.fetch_add_stage_id(),
                 plan: nb_ctx.plan_with_task_count(plan, task_count),
                 tasks: task_count.as_usize(),
+                metrics_set: Default::default(),
             };
             let result = nb_ctx
                 .nb_builder
@@ -409,7 +413,7 @@ async fn _inject_network_boundaries(
 /// - **Everything else**: recurse into children with the same `task_count`, then rebuild the
 ///   node with the rebuilt children.
 impl InjectNetworkBoundaryContext<'_> {
-    fn propagate_task_count_until_network_boundaries(
+    pub(crate) fn propagate_task_count_until_network_boundaries(
         &self,
         plan: &Arc<dyn ExecutionPlan>,
         task_count: TaskCountAnnotation,
diff --git a/src/distributed_planner/mod.rs b/src/distributed_planner/mod.rs
index f12c684c..18cfe6e0 100644
--- a/src/distributed_planner/mod.rs
+++ b/src/distributed_planner/mod.rs
@@ -11,8 +11,12 @@ mod statistics;
 mod task_estimator;
 
 pub use distributed_config::DistributedConfig;
+pub(crate) use inject_network_boundaries::{
+    InjectNetworkBoundaryContext, NetworkBoundaryBuilderResult, inject_network_boundaries,
+};
+pub(crate) use network_boundary::ProducerHead;
 pub use network_boundary::{NetworkBoundary, NetworkBoundaryExt};
-pub(crate) use network_boundary::{ProducerHead, insert_producer_head};
 pub use session_state_builder_ext::SessionStateBuilderExt;
+pub(crate) use statistics::calculate_cost;
 pub(crate) use task_estimator::set_distributed_task_estimator;
 pub use task_estimator::{TaskCountAnnotation, TaskEstimation, TaskEstimator, TaskRoutingContext};
diff --git a/src/distributed_planner/network_boundary.rs b/src/distributed_planner/network_boundary.rs
index 858a7e00..e6479e7e 100644
--- a/src/distributed_planner/network_boundary.rs
+++ b/src/distributed_planner/network_boundary.rs
@@ -1,3 +1,4 @@
+use crate::execution_plans::SamplerExec;
 use crate::{BroadcastExec, NetworkBroadcastExec, NetworkCoalesceExec, NetworkShuffleExec, Stage};
 use datafusion::common::Result;
 use datafusion::physical_expr::Partitioning;
@@ -13,7 +14,7 @@ pub trait NetworkBoundary: ExecutionPlan {
     /// information to perform any internal transformations necessary for distributed execution.
     ///
     /// Typically, [NetworkBoundary]s will use this call for transitioning from "Pending" to "ready".
-    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn ExecutionPlan>>;
+    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn NetworkBoundary>>;
 
     /// Returns the assigned input [Stage], if any.
     fn input_stage(&self) -> &Stage;
@@ -59,28 +60,40 @@ impl NetworkBoundaryExt for dyn ExecutionPlan {
     }
 }
 
-/// Ensures the head of the provided plan complies with the passed [ProducerHead] definition. This
-/// can be called both during planning and lazily at runtime.
-pub(crate) fn insert_producer_head(
-    input: Arc<dyn ExecutionPlan>,
-    head: ProducerHead,
-) -> Result<Arc<dyn ExecutionPlan>> {
-    let input = if let Some(r_exec) = input.downcast_ref::<RepartitionExec>() {
-        Arc::clone(r_exec.input())
-    } else if let Some(b_exec) = input.downcast_ref::<BroadcastExec>() {
-        Arc::clone(b_exec.input())
-    } else {
-        input
-    };
-    let plan = match head {
-        ProducerHead::None => input,
-        ProducerHead::BroadcastExec { output_partitions } => {
-            let partitions = input.output_partitioning().partition_count();
-            Arc::new(BroadcastExec::new(input, output_partitions / partitions))
-        }
-        ProducerHead::RepartitionExec { partitioning } => {
-            Arc::new(RepartitionExec::try_new(input, partitioning)?)
+impl ProducerHead {
+    /// Makes the head of `input` comply with this [ProducerHead], dropping a [RepartitionExec] or
+    /// [BroadcastExec] already at the top. Can be called during planning and lazily at runtime.
+    pub(crate) fn insert(self, input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+        let input = if let Some(r_exec) = input.downcast_ref::<RepartitionExec>() {
+            Arc::clone(r_exec.input())
+        } else if let Some(b_exec) = input.downcast_ref::<BroadcastExec>() {
+            Arc::clone(b_exec.input())
+        } else {
+            input
+        };
+        let plan = match self {
+            ProducerHead::None => input,
+            ProducerHead::BroadcastExec { output_partitions } => {
+                let partitions = input.output_partitioning().partition_count();
+                Arc::new(BroadcastExec::new(input, output_partitions / partitions))
+            }
+            ProducerHead::RepartitionExec { partitioning } => {
+                Arc::new(RepartitionExec::try_new(input, partitioning)?)
+            }
+        };
+        Ok(plan)
+    }
+
+    /// Injects a [SamplerExec] below a top-level [RepartitionExec]/[BroadcastExec]; else no-op.
+    pub(crate) fn insert_sampler(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+        if let Some(r_exec) = input.downcast_ref::<RepartitionExec>() {
+            let child = Arc::clone(r_exec.input());
+            input.with_new_children(vec![Arc::new(SamplerExec::new(child))])
+        } else if let Some(b_exec) = input.downcast_ref::<BroadcastExec>() {
+            let child = Arc::clone(b_exec.input());
+            input.with_new_children(vec![Arc::new(SamplerExec::new(child))])
+        } else {
+            Ok(input)
         }
-    };
-    Ok(plan)
+    }
 }
diff --git a/src/distributed_planner/prepare_network_boundaries.rs b/src/distributed_planner/prepare_network_boundaries.rs
index 1b6f1ac7..dd070793 100644
--- a/src/distributed_planner/prepare_network_boundaries.rs
+++ b/src/distributed_planner/prepare_network_boundaries.rs
@@ -1,5 +1,4 @@
 use crate::common::TreeNodeExt;
-use crate::distributed_planner::network_boundary::insert_producer_head;
 use crate::stage::LocalStage;
 use crate::{NetworkBoundaryExt, Stage};
 use datafusion::common::Result;
@@ -35,8 +34,9 @@ pub(crate) fn prepare_network_boundaries(
 
         // 2) Scale up the head node of the input stage in order to account for the amount of partition
         //    and consumer count above it.
-        let plan =
-            insert_producer_head(Arc::clone(&input_stage.plan), nb.producer_head(task_count))?;
+        let plan = nb
+            .producer_head(task_count)
+            .insert(Arc::clone(&input_stage.plan))?;
 
         // 3) Make sure the input stage can be uniquely identified with a stage index and query id.
         //    If there were already some `query_id` and `num` that's fine.
@@ -45,6 +45,7 @@ pub(crate) fn prepare_network_boundaries(
             num: stage_id,
             plan,
             tasks: input_stage.tasks,
+            metrics_set: Default::default(),
         }))?;
         stage_id += 1;
         Ok(Transformed::yes(nb))
diff --git a/src/execution_plans/benchmarks/shuffle_bench.rs b/src/execution_plans/benchmarks/shuffle_bench.rs
index 8406a052..4f46f1be 100644
--- a/src/execution_plans/benchmarks/shuffle_bench.rs
+++ b/src/execution_plans/benchmarks/shuffle_bench.rs
@@ -216,6 +216,7 @@ impl ShuffleFixture {
             query_id,
             num: 0,
             workers: self.input_stage_workers.clone(),
+            runtime_stats: None,
         });
 
         let mut join_set = JoinSet::default();
diff --git a/src/execution_plans/benchmarks/transport_bench.rs b/src/execution_plans/benchmarks/transport_bench.rs
index 847c307b..a386c7bd 100644
--- a/src/execution_plans/benchmarks/transport_bench.rs
+++ b/src/execution_plans/benchmarks/transport_bench.rs
@@ -266,6 +266,7 @@ impl TransportFixture {
             query_id,
             num: 0,
             workers: self.input_stage_tasks.clone(),
+            runtime_stats: None,
         });
 
         let mut join_set = JoinSet::default();
diff --git a/src/execution_plans/mod.rs b/src/execution_plans/mod.rs
index a1ea6316..ecdcbc35 100644
--- a/src/execution_plans/mod.rs
+++ b/src/execution_plans/mod.rs
@@ -6,6 +6,7 @@ mod metrics;
 mod network_broadcast;
 mod network_coalesce;
 mod network_shuffle;
+mod sampler;
 
 #[cfg(any(test, feature = "integration"))]
 pub mod benchmarks;
@@ -18,3 +19,4 @@ pub(crate) use metrics::MetricsWrapperExec;
 pub use network_broadcast::NetworkBroadcastExec;
 pub use network_coalesce::NetworkCoalesceExec;
 pub use network_shuffle::NetworkShuffleExec;
+pub use sampler::SamplerExec;
diff --git a/src/execution_plans/network_broadcast.rs b/src/execution_plans/network_broadcast.rs
index f9dee080..251ed29c 100644
--- a/src/execution_plans/network_broadcast.rs
+++ b/src/execution_plans/network_broadcast.rs
@@ -9,7 +9,7 @@ use datafusion::execution::{SendableRecordBatchStream, TaskContext};
 use datafusion::physical_expr_common::metrics::MetricsSet;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics,
 };
 use std::fmt::Formatter;
 use std::sync::Arc;
@@ -153,6 +153,7 @@ impl NetworkBroadcastExec {
                 num: 0,
                 plan: input,
                 tasks: producer_tasks,
+                metrics_set: Default::default(),
             }),
             input_properties,
         ))
@@ -160,7 +161,7 @@ impl NetworkBroadcastExec {
 }
 
 impl NetworkBoundary for NetworkBroadcastExec {
-    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn ExecutionPlan>> {
+    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn NetworkBoundary>> {
         let mut self_clone = self.clone();
         self_clone.worker_connections = WorkerConnectionPool::new(input_stage.task_count());
         self_clone.input_stage = input_stage;
@@ -268,4 +269,12 @@ impl ExecutionPlan for NetworkBroadcastExec {
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.worker_connections.metrics.clone_inner())
     }
+
+    /// Delegates to the input stage, forwarding the requested `partition` together with this
+    /// node's output partition count and schema.
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let out_partitions = self.properties.output_partitioning().partition_count();
+        let schema = self.schema();
+        self.input_stage.partition_statistics(partition, out_partitions, schema)
+    }
 }
diff --git a/src/execution_plans/network_coalesce.rs b/src/execution_plans/network_coalesce.rs
index 8fb06e74..1582d08c 100644
--- a/src/execution_plans/network_coalesce.rs
+++ b/src/execution_plans/network_coalesce.rs
@@ -12,7 +12,7 @@ use datafusion::physical_plan::limit::LocalLimitExec;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
     DisplayAs, DisplayFormatType, EmptyRecordBatchStream, ExecutionPlan, PlanProperties,
-    internal_err,
+    Statistics, internal_err,
 };
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
@@ -131,6 +131,7 @@ impl NetworkCoalesceExec {
                 num: 0,
                 plan: input,
                 tasks: producer_tasks,
+                metrics_set: Default::default(),
             }),
             input_properties,
             consumer_tasks,
@@ -157,6 +158,7 @@ impl NetworkCoalesceExec {
             num: local.num,
             plan: input_with_fetch,
             tasks: local.tasks,
+            metrics_set: Default::default(),
         });
         Ok(Arc::new(self_clone))
     }
@@ -167,7 +169,7 @@ impl NetworkBoundary for NetworkCoalesceExec {
         &self.input_stage
     }
 
-    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn ExecutionPlan>> {
+    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn NetworkBoundary>> {
         let mut self_clone = self.clone();
         self_clone.properties = scale_partitioning_props(self_clone.properties(), |p| {
             p * input_stage.task_count() / self_clone.input_stage.task_count().max(1)
@@ -247,10 +249,8 @@ impl ExecutionPlan for NetworkCoalesceExec {
             );
         }
 
-        let partitions_per_task = self
-            .properties()
-            .partitioning
-            .partition_count()
+        let out_partitions = self.properties().partitioning.partition_count();
+        let partitions_per_task = out_partitions
             .checked_div(
                 self.input_stage
                     .task_count()
@@ -311,6 +311,14 @@ impl ExecutionPlan for NetworkCoalesceExec {
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.worker_connections.metrics.clone_inner())
     }
+
+    /// Delegates to the input stage, forwarding the requested `partition` together with this
+    /// node's output partition count and schema.
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let out_partitions = self.properties.output_partitioning().partition_count();
+        let schema = self.schema();
+        self.input_stage.partition_statistics(partition, out_partitions, schema)
+    }
 }
 
 #[derive(Debug, Clone, Copy)]
diff --git a/src/execution_plans/network_shuffle.rs b/src/execution_plans/network_shuffle.rs
index 157cfd99..2a575b42 100644
--- a/src/execution_plans/network_shuffle.rs
+++ b/src/execution_plans/network_shuffle.rs
@@ -11,7 +11,9 @@ use datafusion::physical_expr::Partitioning;
 use datafusion::physical_expr_common::metrics::MetricsSet;
 use datafusion::physical_plan::repartition::RepartitionExec;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
-use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, Statistics,
+};
 use std::fmt::Formatter;
 use std::sync::Arc;
 use uuid::Uuid;
@@ -133,6 +135,7 @@ impl NetworkShuffleExec {
                 num: 0,
                 plan: input,
                 tasks: producer_tasks,
+                metrics_set: Default::default(),
             }),
             input_properties,
         ))
@@ -144,7 +147,7 @@ impl NetworkBoundary for NetworkShuffleExec {
         &self.input_stage
     }
 
-    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn ExecutionPlan>> {
+    fn with_input_stage(&self, input_stage: Stage) -> Result<Arc<dyn NetworkBoundary>> {
         let mut self_clone = self.clone();
         self_clone.worker_connections = WorkerConnectionPool::new(input_stage.task_count());
         self_clone.input_stage = input_stage;
@@ -217,7 +220,8 @@ impl ExecutionPlan for NetworkShuffleExec {
         };
 
         let task_context = DistributedTaskContext::from_ctx(&context);
-        let off = self.properties.partitioning.partition_count() * task_context.task_index;
+        let out_partitions = self.properties.partitioning.partition_count();
+        let off = out_partitions * task_context.task_index;
 
         let mut streams = Vec::with_capacity(remote_stage.workers.len());
         for input_task_index in 0..remote_stage.workers.len() {
@@ -242,4 +246,12 @@ impl ExecutionPlan for NetworkShuffleExec {
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.worker_connections.metrics.clone_inner())
     }
+
+    /// Delegates to the input stage, forwarding the requested `partition` together with this
+    /// node's output partition count and schema.
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let out_partitions = self.properties.output_partitioning().partition_count();
+        let schema = self.schema();
+        self.input_stage.partition_statistics(partition, out_partitions, schema)
+    }
 }
diff --git a/src/execution_plans/sampler.rs b/src/execution_plans/sampler.rs
new file mode 100644
index 00000000..0e120219
--- /dev/null
+++ b/src/execution_plans/sampler.rs
@@ -0,0 +1,594 @@
+use crate::common::{require_one_child, vec_cast};
+use crate::worker::generated::worker as pb;
+use crate::{
+    BytesCounterMetric, BytesMetricExt, GaugeMetricExt, LatencyMetricExt, MaxGaugeMetric,
+    MaxLatencyMetric, P50LatencyMetric,
+};
+use datafusion::arrow::array::Array;
+use datafusion::arrow::array::ArrayRef;
+use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::common::runtime::SpawnedTask;
+use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion::common::{DataFusionError, Result, exec_err};
+use datafusion::common::{HashSet, ScalarValue};
+use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::physical_expr_common::metrics::{Gauge, MetricValue, MetricsSet};
+use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
+use futures::{FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt};
+use std::collections::VecDeque;
+use std::fmt::{Debug, Formatter};
+use std::pin::Pin;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Arc, LazyLock, Mutex, OnceLock};
+use std::task::{Context, Poll};
+use std::time::Instant;
+use tokio::sync::oneshot;
+
+/// How many [RecordBatch]s to allow the input stream to yield synchronously (without yielding back
+/// to tokio) before short-circuiting buffering.
+const READY_CHUNK_LIMIT: usize = 256;
+/// Maximum read of bytes per second allowed to be emitted. Reads greater than this will be
+/// truncated to this max value, as it's assumed that [READY_CHUNK_LIMIT] was hit and no useful
+/// measurement can actually be emitted.
+const MAX_BYTES_PER_SECOND: usize = 512 * 1024 * 1024;
+/// Maximum number of rows per second allowed to be emitted. Reads greater than this will be
+/// truncated to this max value, as it's assumed that [READY_CHUNK_LIMIT] was hit and no useful
+/// measurement can actually be emitted.
+const MAX_ROWS_PER_SECOND: usize = 1024 * 1024;
+/// Maximum number of rows sampled from the peek queue when estimating per-column NDV.
+const NDV_MAX_ROWS_SAMPLE: usize = 1000;
+
+#[derive(Debug)]
+pub struct SamplerExec {
+    pub(crate) input: Arc<dyn ExecutionPlan>,
+    pub(crate) metric_set: ExecutionPlanMetricsSet,
+    pub(crate) partition_samplers: Vec<PartitionSampler>,
+    pub(crate) execution_started: Arc<AtomicBool>,
+}
+
+/// Metrics that quantify how long the sampler held data in memory before the consumer
+/// (real execution) attached, plus the peak accumulated size reached. All metrics are shared
+/// across the partition samplers; the latency metrics aggregate per-partition observations.
+#[derive(Debug, Clone)]
+pub(crate) struct SamplerExecMetrics {
+    /// Time since [SamplerExec::kick_off_first_sampler] was called until the first batch from
+    /// the input arrived
+    kick_off_to_fist_batch_p50: P50LatencyMetric,
+    kick_off_to_fist_batch_max: MaxLatencyMetric,
+    /// Time since [SamplerExec::kick_off_first_sampler] was called until the [pb::LoadInfo] message
+    /// was sent.
+    kick_off_to_load_info_sent_p50: P50LatencyMetric,
+    kick_off_to_load_info_sent_max: MaxLatencyMetric,
+    /// Time since [SamplerExec::kick_off_first_sampler] was called until the node was properly
+    /// executed with [SamplerExec::execute].
+    kick_off_to_execution_p50: P50LatencyMetric,
+    kick_off_to_execution_max: MaxLatencyMetric,
+    /// Maximum number of record batches peeked by a sampler.
+    max_batches_peeked: MaxGaugeMetric,
+    /// Peak memory accumulated by any partition sampler during the sampling phase.
+    max_mem_used: Gauge,
+    /// Bytes per second flowing through the sampler node.
+    bytes_per_sec: BytesCounterMetric,
+    /// Bytes ready at the moment of reporting load info.
+    bytes_ready: BytesCounterMetric,
+    /// Elapsed compute while sampling.
+    elapsed_compute: Time,
+}
+
+impl SamplerExecMetrics {
+    fn new(metric_set: &ExecutionPlanMetricsSet) -> Self {
+        let bdr = || MetricBuilder::new(metric_set);
+        Self {
+            kick_off_to_fist_batch_p50: bdr().p50_latency("kick_off_to_first_batch_p50"),
+            kick_off_to_fist_batch_max: bdr().max_latency("kick_off_to_first_batch_max"),
+            kick_off_to_load_info_sent_p50: bdr().p50_latency("kick_off_to_load_info_sent_p50"),
+            kick_off_to_load_info_sent_max: bdr().max_latency("kick_off_to_load_info_sent_max"),
+            kick_off_to_execution_p50: bdr().p50_latency("kick_off_to_execution_p50"),
+            kick_off_to_execution_max: bdr().max_latency("kick_off_to_execution_max"),
+            max_batches_peeked: bdr().max_gauge("max_batches_peeked"),
+            max_mem_used: bdr().global_gauge("max_mem_used"),
+            bytes_per_sec: bdr().bytes_counter("bytes_per_sec"),
+            bytes_ready: bdr().bytes_counter("bytes_ready"),
+            elapsed_compute: {
+                let time = Time::new();
+                bdr().build(MetricValue::ElapsedCompute(time.clone()));
+                time
+            },
+        }
+    }
+}
+
+impl SamplerExec {
+    pub(crate) fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+        let metric_set = ExecutionPlanMetricsSet::new();
+        let metric_set_clone = metric_set.clone();
+        // Metrics need to be lazily initialized, otherwise the coordinator side will register
+        // them when they are never relevant there, they are just relevant in workers.
+        //
+        // If we don't do this, the [SamplerExec] constructed during planning will register its
+        // own zeroed SamplerExecMetrics in the ExecutionPlanMetricsSet, even if the metrics we care
+        // about are just the ones collected in workers during execution.
+        let metrics: Arc<LazyLock<_, Box<dyn FnOnce() -> SamplerExecMetrics + Send>>> =
+            Arc::new(LazyLock::new(Box::new(move || {
+                SamplerExecMetrics::new(&metric_set_clone)
+            })));
+        let partitions = input.properties().partitioning.partition_count();
+        let execution_started = Arc::new(AtomicBool::new(false));
+        let mut samplers = Vec::with_capacity(partitions);
+        for i in 0..partitions {
+            samplers.push(PartitionSampler {
+                partition_idx: i,
+                input: Arc::clone(&input),
+                stream: Mutex::new(None),
+                metrics: Arc::clone(&metrics),
+                kick_off_at: Arc::new(OnceLock::new()),
+                first_batch_at: Arc::new(OnceLock::new()),
+                load_info_sent_at: Arc::new(OnceLock::new()),
+                execution_started: Arc::clone(&execution_started),
+            });
+        }
+        Self {
+            input,
+            metric_set,
+            partition_samplers: samplers,
+            execution_started,
+        }
+    }
+
+    /// Kicks off every partition sampler of the first [SamplerExec] visited in `plan`.
+    pub(crate) fn kick_off_first_sampler(
+        plan: Arc<dyn ExecutionPlan>,
+        ctx: Arc<TaskContext>,
+    ) -> Result<Vec<oneshot::Receiver<pb::LoadInfo>>> {
+        let mut receivers = vec![];
+        plan.apply(|node| {
+            let Some(sampler) = node.downcast_ref::<SamplerExec>() else {
+                return Ok(TreeNodeRecursion::Continue);
+            };
+            receivers.reserve(sampler.partition_samplers.len());
+            for partition_sampler in &sampler.partition_samplers {
+                receivers.push(partition_sampler.kick_off(Arc::clone(&ctx))?);
+            }
+            Ok(TreeNodeRecursion::Stop)
+        })?;
+        Ok(receivers)
+    }
+}
+
+pub(crate) struct PartitionSampler {
+    partition_idx: usize,
+    input: Arc<dyn ExecutionPlan>,
+    stream: Mutex<Option<SendableRecordBatchStream>>,
+    execution_started: Arc<AtomicBool>,
+
+    // Metrics state.
+    metrics: Arc<LazyLock<SamplerExecMetrics, Box<dyn FnOnce() -> SamplerExecMetrics + Send>>>,
+    /// Set when `kick_off` is invoked. Used at `execute()` time to record how long the
+    /// sampler sampled data before the consumer attached.
+    kick_off_at: Arc<OnceLock<Instant>>,
+    /// Set when the first batch is pushed into the peek buffer. Used at `execute()` time
+    /// to record the delay between kick-off and the first batch arriving.
+    first_batch_at: Arc<OnceLock<Instant>>,
+    /// Set right after the `LoadInfo` is handed to `sampling_tx.send()`, whether or not
+    /// the receiver is still alive. Used at `execute()` time to record the delay between
+    /// kick-off and the load info being sent.
+    load_info_sent_at: Arc<OnceLock<Instant>>,
+}
+
+impl Debug for PartitionSampler {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PartitionSampler").finish()
+    }
+}
+
+impl PartitionSampler {
+    /// Hands out the stream prepared by `kick_off` (at most once), first recording the
+    /// kick-off latency metrics when the sampler was kicked off.
+    fn start_stream(&self) -> Option<SendableRecordBatchStream> {
+        if let Some(started) = self.kick_off_at.get() {
+            let metrics = &self.metrics;
+
+            // Kick-off -> first batch buffered.
+            if let Some(first_batch) = self.first_batch_at.get() {
+                let gap = first_batch.saturating_duration_since(*started);
+                metrics.kick_off_to_fist_batch_p50.add_duration(gap);
+                metrics.kick_off_to_fist_batch_max.add_duration(gap);
+            }
+
+            // Kick-off -> pb::LoadInfo message sent.
+            if let Some(sent) = self.load_info_sent_at.get() {
+                let gap = sent.saturating_duration_since(*started);
+                metrics.kick_off_to_load_info_sent_p50.add_duration(gap);
+                metrics.kick_off_to_load_info_sent_max.add_duration(gap);
+            }
+
+            // Kick-off -> consumer attached (now).
+            let gap = started.elapsed();
+            metrics.kick_off_to_execution_p50.add_duration(gap);
+            metrics.kick_off_to_execution_max.add_duration(gap);
+        }
+
+        // `take` leaves `None` behind, so a second call for the same partition yields nothing.
+        let mut slot = self.stream.lock().unwrap();
+        slot.take()
+    }
+
+    /// Starts sampling this partition: executes the input, spawns a task that buffers the
+    /// batches that are immediately ready, measures how fast the next one arrives, and stores
+    /// the replayable stream for `start_stream`. The receiver yields the [pb::LoadInfo].
+    fn kick_off(&self, ctx: Arc<TaskContext>) -> Result<oneshot::Receiver<pb::LoadInfo>> {
+        let _ = self.kick_off_at.set(Instant::now());
+        let (sampling_tx, sampling_rx) = oneshot::channel();
+
+        let input = Arc::clone(&self.input);
+        let partition_idx = self.partition_idx;
+        let schema = input.schema();
+        let elapsed_compute = self.metrics.elapsed_compute.clone();
+        let first_batch_at = Arc::clone(&self.first_batch_at);
+        let n_cols = schema.fields.len();
+
+        let reporter = LoadInfoDropHandler {
+            load_info: zero_load_info(partition_idx, n_cols),
+            sampling_tx: Some(sampling_tx),
+            bytes_per_second_metric: self.metrics.bytes_per_sec.clone(),
+            load_info_sent_at: Arc::clone(&self.load_info_sent_at),
+            bytes_ready_metric: self.metrics.bytes_ready.clone(),
+            omit: Arc::clone(&self.execution_started),
+        };
+
+        let mut peek = RecordBatchPeek {
+            peek: VecDeque::new(),
+            n_cols,
+            max_mem_used: self.metrics.max_mem_used.clone(),
+            max_batches_peeked: self.metrics.max_batches_peeked.clone(),
+            memory_reservation: Arc::new(
+                MemoryConsumer::new(format!("PartitionSampler[{partition_idx}]"))
+                    .register(ctx.memory_pool()),
+            ),
+            first_batch_at: Arc::clone(&self.first_batch_at),
+        };
+
+        // Execute the input synchronously so any setup error surfaces before we
+        // spawn the producer task.
+        let mut input_stream = input.execute(partition_idx, ctx)?.fuse();
+
+        let task = SpawnedTask::spawn(async move {
+            // First, read at once all the RecordBatches that are ready to be yielded synchronously.
+            // Some input nodes accumulate data in-memory and then yield several RecordBatches
+            // at once synchronously (without Poll::Pending gaps in between).
+            let mut chunked = (&mut input_stream).ready_chunks(READY_CHUNK_LIMIT);
+            let Some(batches) = chunked.next().await else {
+                // Not a single RecordBatch was produced, so let bytes_per_second=0 be sent as-is.
+                return Ok(peek.chain(input_stream).boxed());
+            };
+            let _elapsed_compute_timer = elapsed_compute.timer();
+            for batch in batches {
+                let _ = first_batch_at.set(Instant::now());
+                peek.push(batch?);
+            }
+
+            // Peek whether there is more data to be produced.
+            if let Some(result) = input_stream.next().now_or_never() {
+                match result {
+                    Some(batch) => {
+                        // A batch was immediately available without hitting an async gap (the
+                        // input is still yielding synchronously). Store it so its rows are not
+                        // lost. Arrival velocity cannot be measured here, so assume the worst.
+                        peek.push(batch?);
+                        reporter.report(&peek, MAX_BYTES_PER_SECOND, MAX_ROWS_PER_SECOND);
+                    }
+                    // No more batches to read, so no velocity measurement.
+                    None => reporter.report(&peek, 0, 0),
+                }
+                return Ok(peek.chain(input_stream).boxed());
+            }
+
+            drop(_elapsed_compute_timer);
+
+            // Wait for an async gap in order to measure data velocity.
+            let poll_start = Instant::now();
+            let Some(batch) = input_stream.try_next().await? else {
+                let _elapsed_compute_timer = elapsed_compute.timer();
+                // The stream ended after an async gap (unlikely), so there is nothing to measure.
+                reporter.report(&peek, 0, 0);
+                return Ok(peek.chain(input_stream).boxed());
+            };
+            let _elapsed_compute_timer = elapsed_compute.timer();
+
+            // Read the gap once; clamp so a near-zero gap cannot exceed the documented maxima.
+            let secs = poll_start.elapsed().as_secs_f32();
+            let bytes_per_second =
+                ((record_batch_size(&batch) as f32 / secs) as usize).min(MAX_BYTES_PER_SECOND);
+            let rows_per_second =
+                ((batch.num_rows() as f32 / secs) as usize).min(MAX_ROWS_PER_SECOND);
+            peek.push(batch);
+
+            // Some RecordBatches were buffered, but there's more to be yielded: report both.
+            reporter.report(&peek, bytes_per_second, rows_per_second);
+            Ok(peek.chain(input_stream).boxed())
+        });
+
+        let stream = async move {
+            task.await
+                .map_err(|err| DataFusionError::Internal(err.to_string()))?
+        }
+        .try_flatten_stream();
+
+        self.stream
+            .lock()
+            .expect("poisoned lock")
+            .replace(Box::pin(RecordBatchStreamAdapter::new(schema, stream)));
+
+        Ok(sampling_rx)
+    }
+}
+
+/// Wraps a [pb::LoadInfo] and emits it on [Drop] through the provided [oneshot] channel.
+///
+/// Emitting on drop covers every exit path; nothing is sent once the `omit` flag is set.
+struct LoadInfoDropHandler {
+    omit: Arc<AtomicBool>,
+
+    load_info: pb::LoadInfo,
+    bytes_ready_metric: BytesCounterMetric,
+    bytes_per_second_metric: BytesCounterMetric,
+    sampling_tx: Option<oneshot::Sender<pb::LoadInfo>>,
+    load_info_sent_at: Arc<OnceLock<Instant>>,
+}
+
+impl LoadInfoDropHandler {
+    fn report(mut self, peek: &RecordBatchPeek, bps: usize, rps: usize) {
+        if self.omit.load(Ordering::Relaxed) {
+            return;
+        }
+
+        self.set_per_col_bytes_ready(peek.per_col_bytes_ready());
+        self.set_rows_ready(peek.rows_ready());
+        self.set_per_col_ndv(peek.per_col_ndv());
+        self.set_per_col_null_pct(peek.per_col_null_pct());
+        self.set_per_col_bytes_per_second(bps);
+        self.set_rows_per_second(rps)
+    }
+
+    fn set_per_col_bytes_ready(&mut self, bytes_ready: Vec<usize>) {
+        self.load_info.per_column_bytes_ready = vec_cast(&bytes_ready);
+        self.bytes_ready_metric.add_bytes(bytes_ready.iter().sum());
+    }
+
+    fn set_per_col_bytes_per_second(&mut self, total_bytes_per_second: usize) {
+        let per_col_ready: &[u64] = &self.load_info.per_column_bytes_ready;
+        let total_ready: u64 = per_col_ready.iter().sum();
+        let per_col: Vec<usize> = if total_ready == 0 {
+            vec![total_bytes_per_second / per_col_ready.len().max(1); per_col_ready.len()]
+        } else {
+            per_col_ready
+                .iter()
+                .map(|&ready| {
+                    (ready.saturating_mul(total_bytes_per_second as u64) / total_ready) as usize
+                })
+                .collect()
+        };
+        self.load_info.per_column_bytes_per_second = vec_cast(&per_col);
+        self.bytes_per_second_metric
+            .add_bytes(total_bytes_per_second);
+    }
+
+    fn set_rows_ready(&mut self, rows_ready: usize) {
+        self.load_info.rows_ready = rows_ready as u64;
+    }
+
+    fn set_rows_per_second(&mut self, rows_per_second: usize) {
+        self.load_info.rows_per_second = rows_per_second as u64;
+    }
+
+    fn set_per_col_ndv(&mut self, per_column_ndv: Vec<f32>) {
+        self.load_info.per_column_ndv_percentage = per_column_ndv;
+    }
+
+    fn set_per_col_null_pct(&mut self, per_column_null_pct: Vec<f32>) {
+        self.load_info.per_column_null_percentage = per_column_null_pct;
+    }
+}
+
+impl Drop for LoadInfoDropHandler {
+    /// Sends the accumulated load info unless `omit` is set or the sender is already gone.
+    fn drop(&mut self) {
+        let omitted = self.omit.load(Ordering::Relaxed);
+        let Some(tx) = self.sampling_tx.take().filter(|_| !omitted) else {
+            return;
+        };
+        let _ = tx.send(std::mem::take(&mut self.load_info));
+        let _ = self.load_info_sent_at.set(Instant::now());
+    }
+}
+
+fn zero_load_info(partition_idx: usize, n_cols: usize) -> pb::LoadInfo {
+    pb::LoadInfo {
+        partition: partition_idx as u64,
+        rows_per_second: 0,
+        rows_ready: 0,
+        per_column_bytes_per_second: vec![0; n_cols],
+        per_column_bytes_ready: vec![0; n_cols],
+        per_column_ndv_percentage: vec![0.0; n_cols],
+        per_column_null_percentage: vec![0.0; n_cols],
+    }
+}
+
+struct RecordBatchPeek {
+    peek: VecDeque<RecordBatch>,
+    n_cols: usize,
+    max_batches_peeked: MaxGaugeMetric,
+    max_mem_used: Gauge,
+    memory_reservation: Arc<MemoryReservation>,
+    first_batch_at: Arc<OnceLock<Instant>>,
+}
+
+impl RecordBatchPeek {
+    fn push(&mut self, batch: RecordBatch) {
+        let batch_size = record_batch_size(&batch);
+        if self.peek.is_empty() {
+            let _ = self.first_batch_at.set(Instant::now());
+        }
+        self.max_mem_used.add(batch_size);
+        self.memory_reservation.grow(batch_size);
+        self.peek.push_back(batch);
+        self.max_batches_peeked.set_max(self.peek.len());
+    }
+
+    fn per_col_bytes_ready(&self) -> Vec<usize> {
+        let mut result = vec![0; self.n_cols];
+        for batch in self.peek.iter() {
+            for (i, col) in batch.columns().iter().enumerate() {
+                result[i] += column_size(col)
+            }
+        }
+        result
+    }
+
+    fn rows_ready(&self) -> usize {
+        self.peek.iter().map(|batch| batch.num_rows()).sum()
+    }
+
+    fn per_col_ndv(&self) -> Vec<f32> {
+        let total_rows: usize = self.peek.iter().map(|b| b.num_rows()).sum();
+        if total_rows == 0 {
+            return vec![0.0; self.n_cols];
+        }
+
+        // Build the list of flat row indices to sample, sorted for cache-friendly access.
+        let sampled_indices: Vec<usize> = if total_rows <= NDV_MAX_ROWS_SAMPLE {
+            (0..total_rows).collect()
+        } else {
+            let mut indices =
+                rand::seq::index::sample(&mut rand::rng(), total_rows, NDV_MAX_ROWS_SAMPLE)
+                    .into_vec();
+            indices.sort_unstable();
+            indices
+        };
+        let rows_sampled = sampled_indices.len();
+
+        let mut sets: Vec<HashSet<ScalarValue>> = vec![HashSet::new(); self.n_cols];
+        let mut flat_base = 0usize;
+        let mut sample_pos = 0usize;
+
+        for batch in &self.peek {
+            let batch_end = flat_base + batch.num_rows();
+            while sample_pos < sampled_indices.len() && sampled_indices[sample_pos] < batch_end {
+                let row = sampled_indices[sample_pos] - flat_base;
+                for (col_idx, set) in sets.iter_mut().enumerate() {
+                    let col = batch.column(col_idx);
+                    if !col.is_null(row)
+                        && let Ok(v) = ScalarValue::try_from_array(col, row)
+                    {
+                        set.insert(v);
+                    }
+                }
+                sample_pos += 1;
+            }
+            if sample_pos >= sampled_indices.len() {
+                break;
+            }
+            flat_base = batch_end;
+        }
+
+        sets.into_iter()
+            .map(|s| s.len() as f32 / rows_sampled as f32)
+            .collect()
+    }
+
+    fn per_col_null_pct(&self) -> Vec<f32> {
+        let total_rows: usize = self.peek.iter().map(|b| b.num_rows()).sum();
+        if total_rows == 0 {
+            return vec![0.0; self.n_cols];
+        }
+        let mut counts = vec![0usize; self.n_cols];
+        for batch in &self.peek {
+            for (col_idx, count) in counts.iter_mut().enumerate() {
+                *count += batch.column(col_idx).null_count();
+            }
+        }
+        counts
+            .into_iter()
+            .map(|c| c as f32 / total_rows as f32)
+            .collect()
+    }
+}
+
+impl Stream for RecordBatchPeek {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        match self.as_mut().peek.pop_front() {
+            None => Poll::Ready(None),
+            Some(batch) => {
+                self.memory_reservation.shrink(record_batch_size(&batch));
+                Poll::Ready(Some(Ok(batch)))
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.peek.len(), Some(self.peek.len()))
+    }
+}
+
+fn column_size(arr: &ArrayRef) -> usize {
+    arr.to_data().get_slice_memory_size().unwrap_or(0)
+}
+
+fn record_batch_size(batch: &RecordBatch) -> usize {
+    batch.columns().iter().map(column_size).sum()
+}
+
+impl DisplayAs for SamplerExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        write!(
+            f,
+            "SamplerExec: partitions={}",
+            self.partition_samplers.len()
+        )
+    }
+}
+
+impl ExecutionPlan for SamplerExec {
+    fn name(&self) -> &str {
+        "SamplerExec"
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        self.input.properties()
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self::new(require_one_child(children)?)))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        self.execution_started.store(true, Ordering::Relaxed);
+        let Some(partition_sampler) = self.partition_samplers.get(partition) else {
+            return exec_err!("Partition {partition} not available in SamplerExec");
+        };
+        let Some(stream) = partition_sampler.start_stream() else {
+            return exec_err!("SamplerExec[{partition}] was not kicked off");
+        };
+        Ok(stream)
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metric_set.clone_inner())
+    }
+}
diff --git a/src/metrics/bytes_metric.rs b/src/metrics/bytes_metric.rs
index 3bdd7f5a..5d8c7e6c 100644
--- a/src/metrics/bytes_metric.rs
+++ b/src/metrics/bytes_metric.rs
@@ -5,6 +5,7 @@ use std::{
     sync::{Arc, atomic::AtomicUsize},
 };
 
+use datafusion::physical_plan::Metric;
 use datafusion::{
     common::human_readable_size,
     physical_plan::metrics::{CustomMetricValue, MetricBuilder, MetricValue},
@@ -50,6 +51,16 @@ impl Default for BytesCounterMetric {
 }
 
 impl BytesCounterMetric {
+    pub fn new_metric(name: impl Into<Cow<'static, str>>, bytes: usize) -> Arc<Metric> {
+        Arc::new(Metric::new(
+            MetricValue::Custom {
+                name: name.into(),
+                value: Arc::new(BytesCounterMetric::from_value(bytes)),
+            },
+            None,
+        ))
+    }
+
     pub fn from_value(bytes: usize) -> Self {
         Self {
             bytes: Arc::new(AtomicUsize::new(bytes)),
diff --git a/src/metrics/task_metrics_rewriter.rs b/src/metrics/task_metrics_rewriter.rs
index 3ba5c88c..87d93c96 100644
--- a/src/metrics/task_metrics_rewriter.rs
+++ b/src/metrics/task_metrics_rewriter.rs
@@ -81,6 +81,7 @@ pub async fn rewrite_distributed_plan_with_metrics(
                 num: stage.num,
                 plan: plan_with_metrics,
                 tasks: stage.tasks,
+                metrics_set: stage.metrics_set.clone(),
             }))?;
             let network_boundary =
                 MetricsWrapperExec::new(network_boundary, plan.metrics().unwrap_or_default());
@@ -415,6 +416,7 @@ mod tests {
             num: 2,
             plan,
             tasks: 4,
+            metrics_set: Default::default(),
         }
     }
 
diff --git a/src/protobuf/distributed_codec.rs b/src/protobuf/distributed_codec.rs
index 3cfec30a..1ea27541 100644
--- a/src/protobuf/distributed_codec.rs
+++ b/src/protobuf/distributed_codec.rs
@@ -1,9 +1,9 @@
 use super::get_distributed_user_codecs;
 use crate::NetworkShuffleExec;
-use crate::common::{deserialize_uuid, serialize_uuid};
+use crate::common::{deserialize_uuid, require_one_child, serialize_uuid};
 use crate::execution_plans::{
     BroadcastExec, ChildWeight, ChildrenIsolatorUnionExec, NetworkBroadcastExec,
-    NetworkCoalesceExec,
+    NetworkCoalesceExec, SamplerExec,
 };
 use crate::stage::{LocalStage, RemoteStage, Stage};
 use crate::worker::WorkerConnectionPool;
@@ -74,6 +74,7 @@ impl PhysicalExtensionCodec for DistributedCodec {
                     num: proto.num as usize,
                     plan: input,
                     tasks: proto.tasks.len(),
+                    metrics_set: Default::default(),
                 }))
             } else {
                 let mut worker_urls = Vec::with_capacity(proto.tasks.len());
@@ -90,6 +91,7 @@ impl PhysicalExtensionCodec for DistributedCodec {
                     query_id: deserialize_uuid(proto.query_id.as_ref())?,
                     num: proto.num as usize,
                     workers: worker_urls,
+                    runtime_stats: None,
                 }))
             }
         }
@@ -233,6 +235,9 @@ impl PhysicalExtensionCodec for DistributedCodec {
                         .collect(),
                 }))
             }
+            DistributedExecNode::Sampler(SamplerExecProto {}) => {
+                Ok(Arc::new(SamplerExec::new(require_one_child(inputs)?)))
+            }
         }
     }
 
@@ -349,6 +354,14 @@ impl PhysicalExtensionCodec for DistributedCodec {
                 node: Some(DistributedExecNode::ChildrenIsolatorUnion(inner)),
             };
 
+            wrapper.encode(buf).map_err(|e| proto_error(format!("{e}")))
+        } else if let Some(_node) = node.downcast_ref::<SamplerExec>() {
+            let inner = SamplerExecProto {};
+
+            let wrapper = DistributedExecProto {
+                node: Some(DistributedExecNode::Sampler(inner)),
+            };
+
             wrapper.encode(buf).map_err(|e| proto_error(format!("{e}")))
         } else {
             Err(proto_error(format!("Unexpected plan {}", node.name())))
@@ -380,7 +393,7 @@ pub struct ExecutionTaskProto {
 
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DistributedExecProto {
-    #[prost(oneof = "DistributedExecNode", tags = "1, 2, 3, 4, 5, 6")]
+    #[prost(oneof = "DistributedExecNode", tags = "1, 2, 3, 4, 5, 6, 7")]
     pub node: Option<DistributedExecNode>,
 }
 
@@ -397,6 +410,8 @@ pub enum DistributedExecNode {
     NetworkBroadcast(NetworkBroadcastExecProto),
     #[prost(message, tag = "6")]
     Broadcast(BroadcastExecProto),
+    #[prost(message, tag = "7")]
+    Sampler(SamplerExecProto),
 }
 
 /// Protobuf representation of the [NetworkShuffleExec] physical node. It serves as
@@ -509,6 +524,9 @@ pub struct BroadcastExecProto {
     pub consumer_task_count: u64,
 }
 
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct SamplerExecProto {}
+
 fn new_network_broadcast_exec(
     partitioning: Partitioning,
     schema: SchemaRef,
@@ -547,6 +565,7 @@ mod tests {
             query_id: Default::default(),
             num: 0,
             workers: vec![],
+            runtime_stats: None,
         })
     }
 
@@ -556,6 +575,7 @@ mod tests {
             num: 0,
             plan: empty_exec(),
             tasks: 1,
+            metrics_set: Default::default(),
         })
     }
 
diff --git a/src/stage.rs b/src/stage.rs
index 0545d718..3ac39e7a 100644
--- a/src/stage.rs
+++ b/src/stage.rs
@@ -1,13 +1,15 @@
 use crate::coordinator::{DistributedExec, MetricsStore};
 use crate::execution_plans::{DistributedLeafExec, NetworkCoalesceExec};
 use crate::metrics::DISTRIBUTED_DATAFUSION_TASK_ID_LABEL;
-use datafusion::common::{HashMap, config_err};
+use datafusion::common::{HashMap, Statistics, config_err};
 use datafusion::common::{exec_err, plan_err};
 use datafusion::error::Result;
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::metrics::{Label, Metric, MetricsSet};
-use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable};
+use datafusion::physical_plan::{
+    ColumnStatistics, ExecutionPlan, ExecutionPlanProperties, displayable,
+};
 use itertools::Either;
 use std::collections::VecDeque;
 use std::sync::Arc;
@@ -84,6 +86,8 @@ pub struct LocalStage {
     pub plan: Arc<dyn ExecutionPlan>,
     /// The number of tasks the stage has.
     pub tasks: usize,
+    /// Metrics collected by the coordinator
+    pub metrics_set: MetricsSet,
 }
 
 impl LocalStage {
@@ -107,6 +111,8 @@ pub struct RemoteStage {
     pub num: usize,
     /// The worker URLs to which queries should be issued.
     pub workers: Vec<Url>,
+    /// Statistics collected at runtime, if any.
+    pub runtime_stats: Option<Arc<Statistics>>,
 }
 
 impl Stage {
@@ -137,6 +143,63 @@ impl Stage {
             Self::Remote(_) => None,
         }
     }
+
+    pub fn metrics(&self) -> MetricsSet {
+        match self {
+            Self::Remote(_) => MetricsSet::new(), // remote stages yield an empty set
+            Self::Local(local) => local.metrics_set.clone(),
+        }
+    }
+
+    /// Returns statistics for the whole stage when `partition` is `None`, or for one partition.
+    /// Remote stages without runtime statistics report unknown statistics for `schema`.
+    pub fn partition_statistics(
+        &self,
+        partition: Option<usize>,
+        partition_count: usize,
+        schema: SchemaRef,
+    ) -> Result<Arc<Statistics>> {
+        let remote = match self {
+            Stage::Local(local) => return local.plan.partition_statistics(partition),
+            Stage::Remote(remote) => remote,
+        };
+        let Some(runtime_stats) = &remote.runtime_stats else {
+            return Ok(Arc::new(Statistics::new_unknown(&schema)));
+        };
+        if partition.is_none() {
+            return Ok(Arc::clone(runtime_stats));
+        }
+        // Even split across partitions; `max(1)` keeps the factor finite for a zero count.
+        let share = 1.0 / partition_count.max(1) as f32;
+        Ok(Arc::new(multiply_stats(runtime_stats, share)))
+    }
+}
+
+/// Builds a copy of `stats` with row, byte, null and distinct counts scaled by `f`.
+///
+/// Per-column min, max and sum values are reset to `Precision::Absent`.
+fn multiply_stats(stats: &Statistics, f: f32) -> Statistics {
+    let scale_column = |col: &ColumnStatistics| ColumnStatistics {
+        null_count: multiply_precision(col.null_count, f),
+        max_value: Precision::Absent,
+        min_value: Precision::Absent,
+        sum_value: Precision::Absent,
+        distinct_count: multiply_precision(col.distinct_count, f),
+        byte_size: multiply_precision(col.byte_size, f),
+    };
+    Statistics {
+        num_rows: multiply_precision(stats.num_rows, f),
+        total_byte_size: multiply_precision(stats.total_byte_size, f),
+        column_statistics: stats.column_statistics.iter().map(scale_column).collect(),
+    }
+}
+
+/// Scales the value held by `p` by `f`, truncating towards zero.
+///
+/// The result is always reported as `Inexact`: a scaled statistic is an estimate even when
+/// the input was `Exact`. `Absent` stays `Absent`.
+fn multiply_precision(p: Precision<usize>, f: f32) -> Precision<usize> {
+    p.map(|v| (v as f32 * f) as usize).to_inexact()
 }
 
 #[derive(Debug, Clone, Copy, PartialEq)]
@@ -161,7 +224,9 @@ use crate::metrics::proto::metric_proto_to_df;
 use crate::worker::generated::worker as pb;
 use crate::{DistributedMetricsFormat, NetworkShuffleExec, rewrite_distributed_plan_with_metrics};
 use crate::{NetworkBoundary, NetworkBoundaryExt};
+use datafusion::arrow::datatypes::SchemaRef;
 use datafusion::common::DataFusionError;
+use datafusion::common::stats::Precision;
 use datafusion::physical_expr::Partitioning;
 /// Be able to display a nice tree for stages.
 ///
@@ -373,7 +438,7 @@ fn gather_stage_header_metrics(stage: &Stage, metrics_store: &MetricsStore) -> M
         stage_id: stage.num() as u64,
         task_number: 0,
     };
-    let mut all_metrics = MetricsSet::new();
+    let mut all_metrics = stage.metrics();
     while let Some(metrics_set) = metrics_store.get(&task_key).and_then(|v| v.task_metrics) {
         for mut metric in metrics_set.metrics {
             metric.labels.push(pb::Label {
@@ -573,6 +638,7 @@ pub fn display_plan_graphviz(plan: Arc<dyn ExecutionPlan>) -> Result<String> {
             num: max_num + 1,
             plan: plan.clone(),
             tasks: 1,
+            metrics_set: MetricsSet::new(),
         });
         all_stages.insert(0, &head_stage);
 
diff --git a/src/work_unit_feed/remote_work_unit_feed.rs b/src/work_unit_feed/remote_work_unit_feed.rs
index f914f228..1526508c 100644
--- a/src/work_unit_feed/remote_work_unit_feed.rs
+++ b/src/work_unit_feed/remote_work_unit_feed.rs
@@ -38,8 +38,18 @@ pub(crate) struct RemoteWorkUnitFeedRegistry {
 impl RemoteWorkUnitFeedRegistry {
     /// Creates all the receivers and senders for a specific [WorkUnit] Feed id. One feed per
     /// partition is created.
+    ///
+    /// Calling this twice with the same `id` is a coordinator bug — duplicate declarations
+    /// mean two plan nodes share a UUID, which would cause "already consumed" when both
+    /// nodes call `feed()`. We skip rather than overwrite so the coordinator-side duplicate
+    /// detection in `task_specialized_plan` surfaces the real error first.
     pub(crate) fn add(&mut self, id: Uuid, partitions: usize) {
         for partition in 0..partitions {
+            // Skip if already registered; overwriting would silently drop the existing
+            // receiver and cause a confusing "already consumed" error at execution time.
+            if self.receivers.contains_key(&(id, partition)) {
+                continue;
+            }
             let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
             self.receivers.insert((id, partition), Mutex::new(Some(rx)));
             self.senders.insert((id, partition), tx);
diff --git a/src/worker/generated/worker.rs b/src/worker/generated/worker.rs
index fe7a0137..290261ed 100644
--- a/src/worker/generated/worker.rs
+++ b/src/worker/generated/worker.rs
@@ -24,7 +24,7 @@ pub mod coordinator_to_worker_msg {
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct WorkerToCoordinatorMsg {
-    #[prost(oneof = "worker_to_coordinator_msg::Inner", tags = "1")]
+    #[prost(oneof = "worker_to_coordinator_msg::Inner", tags = "1, 2, 3")]
     pub inner: ::core::option::Option<worker_to_coordinator_msg::Inner>,
 }
 /// Nested message and enum types in `WorkerToCoordinatorMsg`.
@@ -37,6 +37,12 @@ pub mod worker_to_coordinator_msg {
         /// metrics\[i\] is the set of metrics for plan node i in pre-order traversal order.
         #[prost(message, tag = "1")]
         TaskMetrics(super::TaskMetrics),
+        /// Load information reported by a task. This information is used for dynamically
+        /// sizing the number of workers involved in a query.
+        #[prost(message, tag = "2")]
+        LoadInfo(super::LoadInfo),
+        #[prost(bool, tag = "3")]
+        LoadInfoEos(bool),
     }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
@@ -52,6 +58,34 @@ pub struct TaskMetrics {
     #[prost(message, optional, tag = "2")]
     pub task_metrics: ::core::option::Option<MetricsSet>,
 }
+/// Load information reported for a specific partition, describing the amount of data
+/// flowing through the plan.
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct LoadInfo {
+    /// The partition index to which this message belongs.
+    #[prost(uint64, tag = "1")]
+    pub partition: u64,
+    /// The number of rows ready to be returned.
+    #[prost(uint64, tag = "2")]
+    pub rows_ready: u64,
+    /// The estimated velocity at which rows will flow through the node. If all the rows were
+    /// already accumulated, they will be reported by `rows_ready`, and this field will be 0.
+    #[prost(uint64, tag = "3")]
+    pub rows_per_second: u64,
+    /// The number of bytes ready to be returned per column.
+    #[prost(uint64, repeated, tag = "4")]
+    pub per_column_bytes_ready: ::prost::alloc::vec::Vec<u64>,
+    /// The estimated velocity at which data will flow through each column. If all the bytes
+    /// were already accumulated, `per_column_bytes_ready` reports them and this field is 0.
+    #[prost(uint64, repeated, tag = "5")]
+    pub per_column_bytes_per_second: ::prost::alloc::vec::Vec<u64>,
+    /// Approximate ratio of NDV for each column.
+    #[prost(float, repeated, tag = "6")]
+    pub per_column_ndv_percentage: ::prost::alloc::vec::Vec<f32>,
+    /// Approximate ratio of null count for each column.
+    #[prost(float, repeated, tag = "7")]
+    pub per_column_null_percentage: ::prost::alloc::vec::Vec<f32>,
+}
 #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct GetWorkerInfoRequest {}
 #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
diff --git a/src/worker/impl_coordinator_channel.rs b/src/worker/impl_coordinator_channel.rs
index d1c2097f..fcb3944a 100644
--- a/src/worker/impl_coordinator_channel.rs
+++ b/src/worker/impl_coordinator_channel.rs
@@ -1,4 +1,5 @@
 use crate::common::deserialize_uuid;
+use crate::execution_plans::SamplerExec;
 use crate::work_unit_feed::{RemoteWorkUnitFeedRegistry, set_work_unit_received_time};
 use crate::worker::LocalWorkerContext;
 use crate::worker::generated::worker::coordinator_to_worker_msg::Inner;
@@ -17,6 +18,7 @@ use datafusion::execution::SessionStateBuilder;
 use datafusion::prelude::SessionConfig;
 use datafusion_proto::physical_plan::AsExecutionPlan;
 use datafusion_proto::protobuf::PhysicalPlanNode;
+use futures::stream::FuturesUnordered;
 use futures::{FutureExt, StreamExt, TryStreamExt};
 use std::sync::atomic::AtomicUsize;
 use std::sync::{Arc, OnceLock};
@@ -55,6 +57,7 @@ impl Worker {
         }
 
         let (metrics_tx, metrics_rx) = oneshot::channel();
+        let mut load_info_rxs = vec![];
 
         let task_data = || async {
             let headers = grpc_headers.into_headers();
@@ -98,6 +101,8 @@ impl Worker {
             for hook in self.hooks.on_plan.iter() {
                 plan = hook(plan, session_state.config())?;
             }
+            load_info_rxs =
+                SamplerExec::kick_off_first_sampler(Arc::clone(&plan), Arc::clone(&task_ctx))?;
 
             // Initialize partition count to the number of partitions in the stage
             let total_partitions = plan.properties().partitioning.partition_count();
@@ -172,19 +177,34 @@ impl Worker {
             tokio::spawn(async move { task_data_entries.invalidate(&key).await });
         });
 
+        let load_info_stream = FuturesUnordered::from_iter(load_info_rxs)
+            .filter_map(async |load_info_or_channel_dropped| {
+                // This error can only happen if the pb::LoadInfo sender was dropped, which is fine.
+                let load_info = load_info_or_channel_dropped.ok()?;
+                Some(Ok(WorkerToCoordinatorMsg {
+                    inner: Some(worker_to_coordinator_msg::Inner::LoadInfo(load_info)),
+                }))
+            })
+            .chain(futures::stream::once(async move {
+                Ok(WorkerToCoordinatorMsg {
+                    inner: Some(worker_to_coordinator_msg::Inner::LoadInfoEos(true)),
+                })
+            }));
+
         // Stream back the metrics once the task finishes executing.
         // The oneshot receiver resolves when impl_execute_task sends the collected
         // metrics after all partitions have finished or been dropped.
         let metrics_stream = metrics_rx.into_stream();
-        let metrics_stream = metrics_stream.filter_map(|task_metrics| async move {
-            match task_metrics {
-                Ok(task_metrics) => Some(WorkerToCoordinatorMsg {
-                    inner: Some(worker_to_coordinator_msg::Inner::TaskMetrics(task_metrics)),
-                }),
-                Err(_) => None, // channel dropped without sending any message
-            }
+        let metrics_stream = metrics_stream.filter_map(async |task_metrics_or_channel_dropped| {
+            let task_metrics = task_metrics_or_channel_dropped.ok()?;
+            Some(Ok(WorkerToCoordinatorMsg {
+                inner: Some(worker_to_coordinator_msg::Inner::TaskMetrics(task_metrics)),
+            }))
         });
-        Ok(Response::new(metrics_stream.map(Ok).boxed()))
+
+        Ok(Response::new(
+            futures::stream::select(load_info_stream, metrics_stream).boxed(),
+        ))
     }
 }
 
diff --git a/src/worker/task_data.rs b/src/worker/task_data.rs
index 28f5ca5d..97b2e806 100644
--- a/src/worker/task_data.rs
+++ b/src/worker/task_data.rs
@@ -1,6 +1,7 @@
 use crate::MaxLatencyMetric;
-use crate::common::{OnceLockResult, now_ns};
-use crate::distributed_planner::{ProducerHead, insert_producer_head};
+use crate::common::OnceLockResult;
+use crate::common::now_ns;
+use crate::distributed_planner::ProducerHead;
 use crate::worker::generated::worker as pb;
 use datafusion::common::{DataFusionError, Result};
 use datafusion::execution::TaskContext;
@@ -134,7 +135,7 @@ impl TaskData {
             let producer_head =
                 ProducerHead::from_proto(producer_head, &self.base_plan.schema(), &self.task_ctx)?;
 
-            let plan = insert_producer_head(Arc::clone(&self.base_plan), producer_head)?;
+            let plan = producer_head.insert(Arc::clone(&self.base_plan))?;
 
             self.num_partitions_remaining.store(
                 plan.output_partitioning().partition_count(),
diff --git a/src/worker/worker.proto b/src/worker/worker.proto
index bc1e3412..ef691b59 100644
--- a/src/worker/worker.proto
+++ b/src/worker/worker.proto
@@ -33,6 +33,12 @@ message WorkerToCoordinatorMsg {
     // ensuring metrics are never lost due to early stream termination.
     // metrics[i] is the set of metrics for plan node i in pre-order traversal order.
     TaskMetrics task_metrics = 1;
+
+    // Load information reported by a task. This information is used for dynamically
+    // sizing the number of workers involved in a query.
+    LoadInfo load_info = 2;
+
+    bool load_info_eos = 3;
   }
 }
 
@@ -47,6 +53,27 @@ message TaskMetrics {
   MetricsSet task_metrics = 2;
 }
 
+// Load information reported for a specific partition, describing the amount of data
+// flowing through the plan.
+message LoadInfo {
+  // The partition index to which this message belongs.
+  uint64 partition = 1;
+  // The number of rows ready to be returned.
+  uint64 rows_ready = 2;
+  // The estimated velocity at which rows will flow through the node. If all the rows were
+  // already accumulated, they will be reported by `rows_ready`, and this field will be 0.
+  uint64 rows_per_second = 3;
+  // The number of bytes ready to be returned per column.
+  repeated uint64 per_column_bytes_ready = 4;
+  // The estimated velocity at which data will flow through each column. If all the bytes
+  // were already accumulated, `per_column_bytes_ready` reports them and this field is 0.
+  repeated uint64 per_column_bytes_per_second = 5;
+  // Approximate ratio of NDV for each column.
+  repeated float per_column_ndv_percentage = 6;
+  // Approximate ratio of null count for each column.
+  repeated float per_column_null_percentage = 7;
+}
+
 message GetWorkerInfoRequest {}
 
 message GetWorkerInfoResponse {
diff --git a/tests/clickbench_correctness_test.rs b/tests/clickbench_correctness_test.rs
index 9df2a8d3..acf3daa8 100644
--- a/tests/clickbench_correctness_test.rs
+++ b/tests/clickbench_correctness_test.rs
@@ -18,6 +18,7 @@ mod tests {
     use std::sync::Arc;
     use tokio::sync::OnceCell;
 
+    const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE";
     const NUM_WORKERS: usize = 4;
     const PARTITIONS: usize = 3;
     const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1;
@@ -289,12 +290,15 @@ mod tests {
             .options_mut()
             .execution
             .target_partitions = PARTITIONS;
-        let d_ctx = d_ctx
+        let mut d_ctx = d_ctx
             .with_distributed_file_scan_config_bytes_per_partition(
                 FILE_SCAN_CONFIG_BYTES_PER_PARTITION,
             )?
             .with_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)?
             .with_distributed_broadcast_joins(true)?;
+        if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" {
+            d_ctx.set_distributed_dynamic_task_count(true)?;
+        }
 
         register_tables(&s_ctx, &data_dir).await?;
         register_tables(&d_ctx, &data_dir).await?;
diff --git a/tests/metrics_collection.rs b/tests/metrics_collection.rs
index ccb43a57..40d05532 100644
--- a/tests/metrics_collection.rs
+++ b/tests/metrics_collection.rs
@@ -6,6 +6,7 @@ mod tests {
     use datafusion::common::{Result, assert_contains};
     use datafusion::execution::SessionState;
     use datafusion::physical_plan::display::DisplayableExecutionPlan;
+    use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
     use datafusion::physical_plan::{ExecutionPlan, execute_stream};
     use datafusion::prelude::SessionContext;
     use datafusion_distributed::test_utils::localhost::start_localhost_context;
@@ -341,6 +342,37 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_metrics_collection_dynamic() -> Result<(), Box<dyn std::error::Error>> {
+        let (mut d_ctx, _guard, _) = start_localhost_context(3, DefaultSessionBuilder).await;
+        d_ctx.set_distributed_dynamic_task_count(true)?;
+
+        let query =
+            r#"SELECT count(*), "RainToday" FROM weather GROUP BY "RainToday" ORDER BY count(*)"#;
+
+        let s_ctx = SessionContext::default();
+        let (s_physical, mut d_physical) = execute(&s_ctx, &d_ctx, query).await?;
+        d_physical = rewrite_with_metrics(d_physical, DistributedMetricsFormat::Aggregated).await;
+        println!("{}", display_plan_ascii(s_physical.as_ref(), true));
+        println!("{}", display_plan_ascii(d_physical.as_ref(), true));
+
+        assert_metrics_equal::<DataSourceExec, DistributedLeafExec>(
+            ["output_rows", "output_bytes"],
+            &s_physical,
+            &d_physical,
+            0,
+        );
+
+        assert_metrics_equal::<SortPreservingMergeExec, SortPreservingMergeExec>(
+            ["output_rows", "output_bytes"],
+            &s_physical,
+            &d_physical,
+            0,
+        );
+
+        Ok(())
+    }
+
     /// Looks for an [ExecutionPlan] that matches the provided type parameter `T1` in
     /// the left node and `T2` in the right node and compares its metrics.
     /// There might be more than one, so `index` determines which one is compared.
diff --git a/tests/stateful_data_cleanup.rs b/tests/stateful_data_cleanup.rs
index a3fe7bca..fae892d2 100644
--- a/tests/stateful_data_cleanup.rs
+++ b/tests/stateful_data_cleanup.rs
@@ -20,13 +20,18 @@ mod tests {
     const TPCH_DATA_PARTS: usize = 16;
     const CARDINALITY_TASK_COUNT_FACTOR: f64 = 1.0;
 
-    #[test_case(false; "metrics_disabled")]
-    #[test_case(true; "metrics_enabled")]
+    #[test_case((false, false); "metrics_disabled_static_planner")]
+    #[test_case((true, false); "metrics_enabled_static_planner")]
+    #[test_case((false, true); "metrics_disabled_dynamic_planner")]
+    #[test_case((true, true); "metrics_enabled_dynamic_planner")]
     #[tokio::test(flavor = "multi_thread")]
-    async fn no_pending_tasks_if_dynamic_query_completes(collect_metrics: bool) -> Result<()> {
+    async fn no_pending_tasks_if_dynamic_query_completes(
+        (collect_metrics, adaptive): (bool, bool),
+    ) -> Result<()> {
         let (mut d_ctx, _guard, workers) =
             start_localhost_context(NUM_WORKERS, DefaultSessionBuilder).await;
         d_ctx.set_distributed_metrics_collection(collect_metrics)?;
+        d_ctx.set_distributed_dynamic_task_count(adaptive)?;
 
         run_tpch_query(d_ctx, "q1").await?;
 
@@ -35,10 +40,18 @@ mod tests {
         Ok(())
     }
 
+    #[test_case((false, false); "metrics_disabled_static_planner")]
+    #[test_case((true, false); "metrics_enabled_static_planner")]
+    #[test_case((false, true); "metrics_disabled_dynamic_planner")]
+    #[test_case((true, true); "metrics_enabled_dynamic_planner")]
     #[tokio::test(flavor = "multi_thread")]
-    async fn no_pending_tasks_if_query_aborts() -> Result<()> {
-        let (d_ctx, _guard, workers) =
+    async fn no_pending_tasks_if_query_aborts(
+        (collect_metrics, adaptive): (bool, bool),
+    ) -> Result<()> {
+        let (mut d_ctx, _guard, workers) =
             start_localhost_context(NUM_WORKERS, DefaultSessionBuilder).await;
+        d_ctx.set_distributed_metrics_collection(collect_metrics)?;
+        d_ctx.set_distributed_dynamic_task_count(adaptive)?;
 
         let _ = timeout(Duration::from_millis(100), run_tpch_query(d_ctx, "q1")).await;
 
diff --git a/tests/tpcds_correctness_test.rs b/tests/tpcds_correctness_test.rs
index bb6011a7..e4baeba9 100644
--- a/tests/tpcds_correctness_test.rs
+++ b/tests/tpcds_correctness_test.rs
@@ -18,6 +18,7 @@ mod tests {
     use std::sync::Arc;
     use tokio::sync::OnceCell;
 
+    const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE";
     const NUM_WORKERS: usize = 4;
     const PARTITIONS: usize = 3;
     const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1;
@@ -575,12 +576,15 @@ mod tests {
             .options_mut()
             .execution
             .target_partitions = PARTITIONS;
-        let d_ctx = d_ctx
+        let mut d_ctx = d_ctx
             .with_distributed_file_scan_config_bytes_per_partition(
                 FILE_SCAN_CONFIG_BYTES_PER_PARTITION,
             )?
             .with_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)?
             .with_distributed_broadcast_joins(true)?;
+        if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" {
+            d_ctx.set_distributed_dynamic_task_count(true)?;
+        }
 
         register_tables(&s_ctx, &data_dir).await?;
         register_tables(&d_ctx, &data_dir).await?;
diff --git a/tests/tpch_correctness_test.rs b/tests/tpch_correctness_test.rs
index 3b6bc5a4..3ae089dc 100644
--- a/tests/tpch_correctness_test.rs
+++ b/tests/tpch_correctness_test.rs
@@ -12,6 +12,7 @@ mod tests {
     use std::path::Path;
     use tokio::sync::OnceCell;
 
+    const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE";
     const NUM_WORKERS: usize = 4;
     const PARTITIONS: usize = 6;
     const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1;
@@ -139,7 +140,10 @@ mod tests {
     // in a non-distributed manner. For each query, it asserts that the results are identical.
     async fn test_tpch_query(sql: String) -> Result<(), Box<dyn Error>> {
         let d_ctx = start_in_memory_context(NUM_WORKERS, DefaultSessionBuilder).await;
-        let d_ctx = d_ctx.with_distributed_broadcast_joins(true)?;
+        let mut d_ctx = d_ctx.with_distributed_broadcast_joins(true)?;
+        if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" {
+            d_ctx.set_distributed_dynamic_task_count(true)?;
+        }
 
         let d_ctx = d_ctx
             .with_distributed_file_scan_config_bytes_per_partition(