From 2b7dd62aa024d7ce2963cf4e74f4fac277c26ff5 Mon Sep 17 00:00:00 2001 From: Gabriel Musat Mestre Date: Sat, 23 May 2026 14:26:35 +0200 Subject: [PATCH] Support dynamic task count assignation --- .github/workflows/ci.yml | 43 +- Cargo.lock | 1 + Cargo.toml | 1 + benchmarks/cdk/bin/datafusion-bench.ts | 10 + benchmarks/src/run.rs | 5 + src/common/mod.rs | 2 + src/common/recursion.rs | 1 + src/common/vec.rs | 80 +++ src/coordinator/distributed.rs | 8 +- src/coordinator/mod.rs | 1 + src/coordinator/prepare_dynamic_plan.rs | 347 ++++++++++ src/coordinator/prepare_static_plan.rs | 1 + src/coordinator/query_coordinator.rs | 52 +- src/distributed_ext.rs | 74 +++ src/distributed_planner/distributed_config.rs | 9 + .../distributed_query_planner.rs | 8 + .../inject_network_boundaries.rs | 12 +- src/distributed_planner/mod.rs | 6 +- src/distributed_planner/network_boundary.rs | 61 +- .../prepare_network_boundaries.rs | 7 +- .../benchmarks/shuffle_bench.rs | 1 + .../benchmarks/transport_bench.rs | 1 + src/execution_plans/mod.rs | 2 + src/execution_plans/network_broadcast.rs | 13 +- src/execution_plans/network_coalesce.rs | 20 +- src/execution_plans/network_shuffle.rs | 18 +- src/execution_plans/sampler.rs | 594 ++++++++++++++++++ src/metrics/bytes_metric.rs | 11 + src/metrics/task_metrics_rewriter.rs | 2 + src/protobuf/distributed_codec.rs | 26 +- src/stage.rs | 72 ++- src/work_unit_feed/remote_work_unit_feed.rs | 10 + src/worker/generated/worker.rs | 36 +- src/worker/impl_coordinator_channel.rs | 36 +- src/worker/task_data.rs | 7 +- src/worker/worker.proto | 27 + tests/clickbench_correctness_test.rs | 6 +- tests/metrics_collection.rs | 32 + tests/stateful_data_cleanup.rs | 23 +- tests/tpcds_correctness_test.rs | 6 +- tests/tpch_correctness_test.rs | 6 +- 41 files changed, 1597 insertions(+), 81 deletions(-) create mode 100644 src/common/vec.rs create mode 100644 src/coordinator/prepare_dynamic_plan.rs create mode 100644 src/execution_plans/sampler.rs diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc217c7b..3c5da3fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,19 +41,33 @@ jobs: - uses: ./.github/actions/setup - run: cargo test --features integration - tpch-test: + tpch-correctness-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + planning_mode: [ "adaptive", "static" ] + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup + - run: cargo test --features tpch --test tpch_correctness_test + env: + ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }} + + tpch-plans-test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup - - run: cargo test --features tpch --test 'tpch_*' + - run: cargo test --features tpch --test tpch_plans_test tpcds-correctness-test: runs-on: ubuntu-latest strategy: fail-fast: false matrix: - shard: ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + shard: [ "01", "02", "03", "04", "05", "06", "07", "08", "09", "10" ] + planning_mode: [ "adaptive", "static" ] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup @@ -62,6 +76,8 @@ jobs: path: testdata/tpcds/main.zip key: "main.zip" - run: cargo test --features tpcds --test tpcds_correctness_test shard${{ matrix.shard }} + env: + ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }} tpcds-plans-test: runs-on: ubuntu-latest @@ -74,7 +90,24 @@ jobs: key: "main.zip" - run: cargo test --features tpcds --test tpcds_plans_test - clickbench-test: + clickbench-correctness-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + planning_mode: [ "adaptive", "static" ] + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup + - uses: actions/cache@v4 + with: + path: testdata/clickbench/ + key: "data" + - run: cargo test --features clickbench --test clickbench_correctness_test + env: + ADAPTIVE: ${{ matrix.planning_mode == 'adaptive' }} + + clickbench-plans-test: runs-on: 
ubuntu-latest steps: - uses: actions/checkout@v4 @@ -83,7 +116,7 @@ jobs: with: path: testdata/clickbench/ key: "data" - - run: cargo test --features clickbench --test 'clickbench_*' + - run: cargo test --features clickbench --test clickbench_plans_test format-check: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index ebe698d2..7044adaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2208,6 +2208,7 @@ dependencies = [ "insta", "itertools 0.14.0", "moka", + "num-traits", "object_store", "parquet", "pin-project", diff --git a/Cargo.toml b/Cargo.toml index 4d6e3e7e..fd5ceab2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ moka = { version = "0.12", features = ["sync", "future"] } crossbeam-queue = "0.3" sysinfo = { version = "0.30", optional = true } sketches-ddsketch = { version = "0.3", features = ["use_serde"] } +num-traits = "0.2" bincode = "1" tonic-prost = "0.14.2" diff --git a/benchmarks/cdk/bin/datafusion-bench.ts b/benchmarks/cdk/bin/datafusion-bench.ts index f5d15eea..f32af829 100644 --- a/benchmarks/cdk/bin/datafusion-bench.ts +++ b/benchmarks/cdk/bin/datafusion-bench.ts @@ -24,6 +24,8 @@ async function main() { .option('--max-tasks-per-stage ', 'Max tasks per stage', '0') .option('--repartition-file-min-size ', 'repartition_file_min_size DF option', '10485760' /* upstream default */) .option('--target-partitions ', 'target_partitions DF option', '8') + .option('--dynamic ', 'Use the dynamic task count assigner', 'false') + .option('--bytes-per-partition-per-second ', 'Target throughput in bytes per partition per second for the dynamic task count allocator', `${16 * 1024 * 1024}`) .option('--queries ', 'Specific queries to run', undefined) .option('--debug ', 'Print the generated plans to stdout') .option('--warmup ', 'Perform a warmup query before the benchmarks', 'true') @@ -46,6 +48,8 @@ async function main() { const childrenIsolatorUnions = options.childrenIsolatorUnions === 'true' || options.childrenIsolatorUnions === 1 const 
broadcastJoins = options.broadcastJoins === 'true' || options.broadcastJoins === 1 const partialReduce = options.partialReduce === 'true' || options.partialReduce === 1 + const dynamicTaskCount = options.dynamic === 'true' || options.dynamic === 1 + const bytesPerPartitionPerSecond = parseInt(options.bytesPerPartitionPerSecond) const debug = options.debug === true || options.debug === 'true' || options.debug === 1 const warmup = options.warmup === true || options.warmup === 'true' || options.warmup === 1 @@ -59,6 +63,8 @@ async function main() { compression, broadcastJoins, partialReduce, + dynamicTaskCount, + bytesPerPartitionPerSecond, maxTasksPerStage, repartitionFileMinSize, targetPartitions @@ -98,6 +104,8 @@ class DataFusionRunner implements BenchmarkRunner { childrenIsolatorUnions: boolean; broadcastJoins: boolean; partialReduce: boolean; + dynamicTaskCount: boolean; + bytesPerPartitionPerSecond: number; maxTasksPerStage: number; repartitionFileMinSize: number; targetPartitions: number; @@ -177,6 +185,8 @@ class DataFusionRunner implements BenchmarkRunner { SET distributed.children_isolator_unions=${this.options.childrenIsolatorUnions}; SET distributed.broadcast_joins=${this.options.broadcastJoins}; SET distributed.partial_reduce=${this.options.partialReduce}; + SET distributed.dynamic_task_count=${this.options.dynamicTaskCount}; + SET distributed.bytes_per_partition_per_second=${this.options.bytesPerPartitionPerSecond}; SET distributed.max_tasks_per_stage=${this.options.maxTasksPerStage}; SET datafusion.optimizer.repartition_file_min_size=${this.options.repartitionFileMinSize}; SET datafusion.execution.target_partitions=${this.options.targetPartitions}; diff --git a/benchmarks/src/run.rs b/benchmarks/src/run.rs index 94345f30..7ecaee98 100644 --- a/benchmarks/src/run.rs +++ b/benchmarks/src/run.rs @@ -106,6 +106,10 @@ pub struct RunOpt { #[structopt(long, default_value = "0")] max_tasks_per_stage: usize, + /// Activate dynamic task count + 
#[structopt(long)] + dynamic: bool, + /// Number of iterations of each test run #[structopt(short = "i", long = "iterations", default_value = "5")] iterations: usize, @@ -203,6 +207,7 @@ impl RunOpt { .with_distributed_cardinality_effect_task_scale_factor( self.cardinality_task_sf.unwrap_or(1.0), )? + .with_distributed_dynamic_task_count(self.dynamic)? .with_distributed_compression(match self.compression.as_str() { "zstd" => Some(CompressionType::ZSTD), "lz4" => Some(CompressionType::LZ4_FRAME), diff --git a/src/common/mod.rs b/src/common/mod.rs index bf9ed549..18cb28a1 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -5,6 +5,7 @@ mod recursion; mod task_context_helpers; mod time; mod uuid; +mod vec; pub(crate) use children_helpers::require_one_child; pub(crate) use on_drop_stream::on_drop_stream; @@ -13,3 +14,4 @@ pub(crate) use recursion::TreeNodeExt; pub(crate) use task_context_helpers::task_ctx_with_extension; pub(crate) use time::now_ns; pub(crate) use uuid::{deserialize_uuid, serialize_uuid}; +pub(crate) use vec::{element_wise_sum, vec_avg_reduce, vec_cast, vec_div, vec_mul}; diff --git a/src/common/recursion.rs b/src/common/recursion.rs index 2f2b9463..5c16116e 100644 --- a/src/common/recursion.rs +++ b/src/common/recursion.rs @@ -589,6 +589,7 @@ mod tests { query_id: uuid::Uuid::nil(), num: 0, workers: vec![], + runtime_stats: None, })) .unwrap() } diff --git a/src/common/vec.rs b/src/common/vec.rs new file mode 100644 index 00000000..ec62678b --- /dev/null +++ b/src/common/vec.rs @@ -0,0 +1,80 @@ +use datafusion::common::internal_err; +use datafusion::error::Result; +use num_traits::AsPrimitive; +use std::ops::{AddAssign, DivAssign, MulAssign}; + +/// Converts a slice of type `I` to a `Vec` using `as`-style primitive casting. 
+pub(crate) fn vec_cast(input: &[I]) -> Vec +where + I: AsPrimitive, + O: Copy + 'static, +{ + input.iter().map(|v| v.as_()).collect() +} + +/// Adds each element of `other` into the corresponding element of `one`, converting types via `AsPrimitive`. +pub(crate) fn element_wise_sum(mut one: Vec, other: &[O]) -> Result> +where + I: AddAssign + Copy + 'static, + O: AsPrimitive + 'static, +{ + if one.len() != other.len() { + return internal_err!("Cannot do an element wise sum of two vectors of different lengths"); + } + for i in 0..one.len() { + one[i] += other[i].as_(); + } + Ok(one) +} + +/// Multiplies every element of `one` by the scalar `other`, converting types via `AsPrimitive`. +pub(crate) fn vec_mul(mut one: Vec, other: O) -> Vec +where + I: MulAssign + Copy + 'static, + O: AsPrimitive + 'static, +{ + for el in one.iter_mut() { + *el *= other.as_(); + } + one +} + +/// Divides every element of `one` by the scalar `other`, converting types via `AsPrimitive`. +pub(crate) fn vec_div(mut one: Vec, other: O) -> Vec +where + I: DivAssign + Copy + 'static, + O: AsPrimitive + 'static, +{ + for el in one.iter_mut() { + *el /= other.as_(); + } + one +} + +/// Reduces a collection of same-length `f32` vectors into a single vector by averaging element-wise. +/// Empty inner vecs are skipped; returns an empty vec if all inputs are empty. 
+pub(crate) fn vec_avg_reduce(vecs: Vec>) -> Result> { + let sample_count = vecs.len(); + let mut iter = vecs.into_iter(); + let mut acc = loop { + let Some(v) = iter.next() else { + return Ok(vec![]); + }; + if !v.is_empty() { + break v; + } + }; + for v in iter { + if v.is_empty() { + continue; + } else if acc.len() != v.len() { + return internal_err!( + "vec_avg_reduce: length mismatch — first vec has {} elements, got {}", + acc.len(), + v.len() + ); + } + acc = element_wise_sum(acc, &v)?; + } + Ok(vec_div(acc, sample_count as f32)) +} diff --git a/src/coordinator/distributed.rs b/src/coordinator/distributed.rs index fe1bbff3..d7d62a08 100644 --- a/src/coordinator/distributed.rs +++ b/src/coordinator/distributed.rs @@ -1,5 +1,7 @@ +use crate::DistributedConfig; use crate::common::{require_one_child, serialize_uuid}; use crate::coordinator::metrics_store::MetricsStore; +use crate::coordinator::prepare_dynamic_plan::prepare_dynamic_plan; use crate::coordinator::prepare_static_plan::prepare_static_plan; use crate::coordinator::query_coordinator::QueryCoordinator; use crate::distributed_planner::NetworkBoundaryExt; @@ -198,7 +200,11 @@ impl ExecutionPlan for DistributedExec { builder.spawn(async move { let _guard = query_coordinator.end_query_guard(); - let result = prepare_static_plan(&query_coordinator, &base_plan)?; + let d_cfg = DistributedConfig::from_config_options(context.session_config().options())?; + let result = match d_cfg.dynamic_task_count { + true => prepare_dynamic_plan(&query_coordinator, &base_plan).await?, + false => prepare_static_plan(&query_coordinator, &base_plan)?, + }; plan_for_viz .lock() diff --git a/src/coordinator/mod.rs b/src/coordinator/mod.rs index 8fe771d3..c1a8a8dd 100644 --- a/src/coordinator/mod.rs +++ b/src/coordinator/mod.rs @@ -1,6 +1,7 @@ mod distributed; mod latency_metric; mod metrics_store; +mod prepare_dynamic_plan; mod prepare_static_plan; mod query_coordinator; diff --git a/src/coordinator/prepare_dynamic_plan.rs 
b/src/coordinator/prepare_dynamic_plan.rs new file mode 100644 index 00000000..12d49002 --- /dev/null +++ b/src/coordinator/prepare_dynamic_plan.rs @@ -0,0 +1,347 @@ +use crate::TaskCountAnnotation::{Desired, Maximum}; +use crate::common::{TreeNodeExt, element_wise_sum, vec_avg_reduce, vec_div, vec_mul}; +use crate::coordinator::distributed::PreparedPlan; +use crate::coordinator::query_coordinator::QueryCoordinator; +use crate::distributed_planner::{ + InjectNetworkBoundaryContext, NetworkBoundaryBuilderResult, ProducerHead, calculate_cost, + inject_network_boundaries, +}; +use crate::execution_plans::SamplerExec; +use crate::stage::{LocalStage, RemoteStage}; +use crate::worker::generated::worker as pb; +use crate::{BytesCounterMetric, NetworkBoundaryExt, NetworkCoalesceExec, Stage}; +use dashmap::DashMap; +use datafusion::common::stats::Precision; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion::common::{Result, exec_err, plan_err}; +use datafusion::physical_plan::metrics::MetricsSet; +use datafusion::physical_plan::{ + ColumnStatistics, ExecutionPlan, ExecutionPlanProperties, Statistics, +}; +use futures::{Stream, StreamExt}; +use std::any::TypeId; +use std::sync::Arc; +use tokio_stream::wrappers::UnboundedReceiverStream; + +pub(super) async fn prepare_dynamic_plan( + query_coordinator: &QueryCoordinator, + base_plan: &Arc, +) -> Result { + let plans_for_viz = Arc::new(PlanReconstructor::default()); + + let head_stage = inject_network_boundaries( + Arc::clone(base_plan), + |mut input_stage: LocalStage, nb_type: TypeId, nb_ctx: &InjectNetworkBoundaryContext| { + let mut metrics = MetricsSet::new(); + + // At this point, input_stage.plan has two kind of leaf nodes: + // - The ones that naturally do not read from any children, like DataSourceExec + // - Network boundaries whose Stage was set to Stage::Remote by a previous iteration + // of this same function. 
+ // Both types of leaf nodes contain very valuable and accurate statistics that are used + // here for computing an estimation of the compute cost (measured in bytes): + // - DataSourceExec (or natural leaf nodes) contain stats pulled directly from their + // data source, like parquet files. + // - Network boundaries contain statistics collected from runtime information, gathered + // by the SamplerExec injected by this same function. + let compute_cost = calculate_cost(&input_stage.plan, nb_ctx.d_cfg)?; + metrics.push(BytesCounterMetric::new_metric("compute_cost", compute_cost)); + let compute_based_task_count = compute_cost + .div_ceil(nb_ctx.d_cfg.bytes_per_partition_per_second.max(1)) + .div_ceil(input_stage.plan.output_partitioning().partition_count()) + .clamp(1, nb_ctx.max_tasks()?); + let task_count = nb_ctx + .task_count(&input_stage.plan)? + .merge(Desired(compute_based_task_count)); + + // Propagate the final task_count inferred based on runtime statistics and compute cost. + // Here is where leaf nodes are scaled up by TaskEstimator::scale_up_leaf_node, and the + // plan is finally left ready for distribution. + input_stage.plan = nb_ctx + .propagate_task_count_until_network_boundaries(&input_stage.plan, task_count)?; + input_stage.tasks = task_count.as_usize(); + // In order to infer the compute the cost of the stage above this one, here a sampler + // is injected to gather runtime statistics. + input_stage.plan = ProducerHead::insert_sampler(input_stage.plan)?; + + let mut stage_coordinator = query_coordinator.stage_coordinator(&input_stage); + + let mut workers = Vec::with_capacity(input_stage.tasks); + let mut load_info_rxs = Vec::with_capacity(input_stage.tasks); + + let routed_urls = if input_stage.tasks == 1 { + // If there's an input stage with a single worker, and the current stage is also + // going to run in a single worker, we want to co-locate them so that unnecessary + // network transfers are avoided. 
+ match stage_coordinator.find_input_stage_with_single_url() { + Some(single_url) => vec![single_url], + None => stage_coordinator.routed_urls()?, + } + } else { + stage_coordinator.routed_urls()? + }; + + for (i, routed_url) in routed_urls.into_iter().enumerate() { + workers.push(routed_url.clone()); + // Spawns the task that feeds this subplan to this worker. There will be as + // many as this spawned tasks as workers. + let (worker_tx, worker_rx) = stage_coordinator.send_plan_task(i, routed_url)?; + load_info_rxs.push({ + let rx = stage_coordinator.worker_to_coordinator_task(i, worker_rx); + UnboundedReceiverStream::new(rx) + }); + stage_coordinator.coordinator_to_worker_task(i, worker_tx)?; + } + + let plans_for_viz = Arc::clone(&plans_for_viz); + Ok(async move { + let (stats, consumer_tc) = if nb_type == TypeId::of::() { + (None, Maximum(1)) + } else { + let stats = gather_runtime_statistics(load_info_rxs, &input_stage.plan).await?; + let sampled_bytes = *stats.total_byte_size.get_value().unwrap_or(&0); + metrics.push(BytesCounterMetric::new_metric( + "sampled_bytes", + sampled_bytes, + )); + // returning Desired(1) here is our way to tell the planner that we don't care + // about the task count assigned to the network boundary in the consumer stage, + // and we don't want it to affect other task count decisions. + (Some(Arc::new(stats)), Desired(1)) + }; + + // Capture the output partitioning of the (rescaled, sampler-wrapped) input plan + // before it's moved: the returned stage is remote and carries no plan to read it + // back from. 
+ let input_properties = Arc::clone(input_stage.plan.properties()); + plans_for_viz.insert(input_stage.num, input_stage.plan, metrics); + Ok(NetworkBoundaryBuilderResult { + consumer_task_count: consumer_tc, + input_stage: Stage::Remote(RemoteStage { + query_id: input_stage.query_id, + num: input_stage.num, + workers, + runtime_stats: stats, + }), + input_properties, + }) + }) + }, + query_coordinator.session_config().options(), + ) + .await?; + + Ok(PreparedPlan { + plan_for_viz: plans_for_viz.reconstruct(&head_stage)?, + head_stage, + }) +} + +/// Reconstructs the plan dynamically as stages get transitioned to Remote and get sent to the +/// respective workers. +/// +/// As the [prepare_dynamic_plan] function recurses and progressively sends the plan to workers, the +/// original plan gets modified, and subplans belong to the different [Stage]s get lost as they get +/// transitioned to [Stage::Remote]. +/// +/// This struct is in charge of tracking the [prepare_dynamic_plan] process and storing the final +/// version of all the subplans so that it can be reconstructed into a fully blown plan for +/// visualization purposes. 
+#[derive(Default)] +struct PlanReconstructor { + stage_map: DashMap, MetricsSet)>, +} + +impl PlanReconstructor { + fn insert(&self, stage: usize, plan: Arc, metrics_set: MetricsSet) { + self.stage_map.insert(stage, (plan, metrics_set)); + } + + fn reconstruct(&self, head_stage: &Arc) -> Result> { + let head_stage = Arc::clone(head_stage); + let reconstructed = head_stage.transform_down_with_task_count(1, |plan, tc| { + let Some(nb) = plan.as_network_boundary() else { + return Ok(Transformed::no(plan)); + }; + let input_stage = nb.input_stage(); + let Some((_, entry)) = self.stage_map.remove(&input_stage.num()) else { + return exec_err!( + "Failed to retrieve plan for stage {} for visualization purposes", + input_stage.num() + ); + }; + let (plan_for_viz, metrics_set) = entry; + + let plan_for_viz = nb.producer_head(tc).insert(plan_for_viz)?; + + let nb = nb.with_input_stage(Stage::Local(LocalStage { + query_id: input_stage.query_id(), + num: input_stage.num(), + plan: plan_for_viz, + tasks: input_stage.task_count(), + metrics_set, + }))?; + + Ok(Transformed::yes(nb)) + })?; + Ok(reconstructed.data) + } +} + +/// Estimates the bytes per second flowing through a stage by reading sample information. 
+async fn gather_runtime_statistics( + per_task_load_info_stream: Vec + Unpin>, + plan: &Arc, +) -> Result { + const ESTIMATED_QUERY_TIME_S: usize = 10; + const BYTES_READY_SAMPLE_PERCENTAGE: f32 = 0.2; + const BYTES_PER_SECOND_SAMPLE_PERCENTAGE: f32 = 0.2; + + let Some(sampler) = find_sampler(plan) else { + return plan_err!("Mising SamplerExec while gathering load report"); + }; + let n_cols = sampler.schema().fields.len(); + + fn apply_pct(value: usize, pct: f32) -> usize { + (value as f32 * pct).round() as usize + } + + let partitions_per_task = sampler.partition_samplers.len(); + let task_count = per_task_load_info_stream.len(); + let total_partitions = partitions_per_task * task_count; + + let mut partitions_with_bytes_per_second_done = 0; + let mut partitions_with_bytes_ready_done = 0; + let mut partitions_done = 0; + let mut rows_ready = 0; + let mut rows_per_second = 0; + let mut per_col_bytes_ready = vec![0usize; n_cols]; + let mut per_col_bytes_per_second = vec![0usize; n_cols]; + + let mut ndv_pct = vec![]; + let mut null_pct = vec![]; + + let mut load_info_stream = futures::stream::select_all(per_task_load_info_stream); + while let Some(load_info) = load_info_stream.next().await { + rows_per_second += load_info.rows_per_second as usize; + rows_ready += load_info.rows_ready as usize; + per_col_bytes_per_second = element_wise_sum( + per_col_bytes_per_second, + &load_info.per_column_bytes_per_second, + )?; + per_col_bytes_ready = + element_wise_sum(per_col_bytes_ready, &load_info.per_column_bytes_ready)?; + ndv_pct.push(load_info.per_column_ndv_percentage); + null_pct.push(load_info.per_column_null_percentage); + + partitions_with_bytes_per_second_done += + load_info.per_column_bytes_per_second.iter().any(|v| *v > 0) as usize; + partitions_with_bytes_ready_done += + load_info.per_column_bytes_ready.iter().any(|v| *v > 0) as usize; + partitions_done += 1; + + // Short circuit if we collected enough bytes_ready measurements. 
+ if partitions_with_bytes_ready_done + >= apply_pct(total_partitions, BYTES_READY_SAMPLE_PERCENTAGE).max(1) + { + break; + } + + // Short circuit if we collected enough bytes_per_second measurements. + if partitions_with_bytes_per_second_done + >= apply_pct(total_partitions, BYTES_PER_SECOND_SAMPLE_PERCENTAGE).max(1) + { + break; + } + + // Short circuit if there are no further partitions remaining to sample from. + if partitions_done == total_partitions { + break; + } + } + + if partitions_done == 0 { + return Ok(zero_stats(plan.schema().fields.len())); + } + + let per_col_bytes_ready = vec_div( + vec_mul(per_col_bytes_ready, total_partitions), + partitions_done, + ); + let per_col_bytes_per_second = vec_div( + vec_mul(per_col_bytes_per_second, total_partitions), + partitions_done, + ); + + let rows_ready = rows_ready * total_partitions / partitions_done; + let rows_per_second = rows_per_second * total_partitions / partitions_done; + + let total_num_rows = rows_ready + rows_per_second * ESTIMATED_QUERY_TIME_S; + + if total_num_rows == 0 { + return Ok(zero_stats(n_cols)); + } + + let per_col_byte_size = element_wise_sum( + per_col_bytes_ready, + &vec_mul(per_col_bytes_per_second, ESTIMATED_QUERY_TIME_S), + )?; + let total_byte_size: usize = per_col_byte_size.iter().sum(); + + let ndv_pct = vec_avg_reduce(ndv_pct)?; + if ndv_pct.len() != n_cols { + return plan_err!("Expected {n_cols} ndv values, but got {}", ndv_pct.len()); + } + let null_pct = vec_avg_reduce(null_pct)?; + if null_pct.len() != n_cols { + return plan_err!("Expected {n_cols} null values, but got {}", null_pct.len()); + } + + Ok(Statistics { + num_rows: Precision::Inexact(total_num_rows), + total_byte_size: Precision::Inexact(total_byte_size), + column_statistics: ndv_pct + .into_iter() + .zip(null_pct) + .zip(per_col_byte_size) + .map(|((ndv, null), col_bytes)| ColumnStatistics { + null_count: Precision::Inexact((null * total_num_rows as f32) as usize), + distinct_count: Precision::Inexact((ndv * 
total_num_rows as f32) as usize), + byte_size: Precision::Inexact(col_bytes), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + }) + .collect(), + }) +} + +fn find_sampler(plan: &Arc) -> Option<&SamplerExec> { + let mut sampler = None; + plan.apply(|plan| { + if let Some(node) = plan.downcast_ref::() { + sampler = Some(node); + return Ok(TreeNodeRecursion::Stop); + }; + Ok(TreeNodeRecursion::Continue) + }) + .expect("Cannot fail"); + sampler +} + +fn zero_stats(n_cols: usize) -> Statistics { + Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: (0..n_cols) + .map(|_| ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: Precision::Exact(0), + byte_size: Precision::Exact(0), + }) + .collect(), + } +} diff --git a/src/coordinator/prepare_static_plan.rs b/src/coordinator/prepare_static_plan.rs index 65d74276..3bfc2d2f 100644 --- a/src/coordinator/prepare_static_plan.rs +++ b/src/coordinator/prepare_static_plan.rs @@ -50,6 +50,7 @@ pub(super) fn prepare_static_plan( query_id: stage.query_id, num: stage.num, workers, + runtime_stats: None, }, ))?)) })?; diff --git a/src/coordinator/query_coordinator.rs b/src/coordinator/query_coordinator.rs index 3d19c750..6f0010e6 100644 --- a/src/coordinator/query_coordinator.rs +++ b/src/coordinator/query_coordinator.rs @@ -12,17 +12,19 @@ use crate::worker::generated::worker::coordinator_to_worker_msg::Inner; use crate::worker::generated::worker::set_plan_request::WorkUnitFeedDeclaration; use crate::{ BytesCounterMetric, BytesMetricExt, DISTRIBUTED_DATAFUSION_TASK_ID_LABEL, DistributedCodec, - DistributedConfig, DistributedTaskContext, DistributedWorkUnitFeedContext, TaskEstimator, - TaskKey, TaskRoutingContext, get_distributed_channel_resolver, get_distributed_worker_resolver, + DistributedConfig, 
DistributedTaskContext, DistributedWorkUnitFeedContext, NetworkBoundaryExt, + Stage, TaskEstimator, TaskKey, TaskRoutingContext, get_distributed_channel_resolver, + get_distributed_worker_resolver, }; use datafusion::common::instant::Instant; use datafusion::common::runtime::JoinSet; -use datafusion::common::tree_node::{Transformed, TreeNodeRecursion}; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; use datafusion::common::{DataFusionError, exec_datafusion_err}; use datafusion::common::{Result, exec_err}; use datafusion::execution::TaskContext; use datafusion::physical_expr_common::metrics::{ExecutionPlanMetricsSet, Label, MetricBuilder}; use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionConfig; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf::PhysicalPlanNode; use futures::{Stream, StreamExt}; @@ -88,6 +90,11 @@ impl QueryCoordinator { } } + /// Returns the [SessionConfig] for the current query. + pub(super) fn session_config(&self) -> &SessionConfig { + self.task_ctx.session_config() + } + /// returns a guard that, when dropped, it signals all the coordinator->worker connections that /// the query is finished, ending them, and propagating the EOS to the workers so that they can /// clean up any remaining state. 
@@ -200,8 +207,8 @@ impl<'a> StageCoordinator<'a> { let mut worker_to_coordinator_stream = response.into_inner(); while let Some(msg_or_err) = worker_to_coordinator_stream.next().await { let msg = msg_or_err.map_err(|err| { - tonic_status_to_datafusion_error(err).unwrap_or_else(|| { - exec_datafusion_err!("Unknown error on worker to coordinator stream") + tonic_status_to_datafusion_error(&err).unwrap_or_else(|| { + exec_datafusion_err!("Unknown error on worker to coordinator stream: {err}") }) })?; if worker_to_coordinator_tx.send(msg).is_err() { @@ -221,13 +228,15 @@ impl<'a> StageCoordinator<'a> { &mut self, task_i: usize, mut worker_to_coordinator_rx: UnboundedReceiver, - ) { + ) -> UnboundedReceiver { let task_key = TaskKey { query_id: serialize_uuid(&self.query_id), stage_id: self.stage_id as u64, task_number: task_i as u64, }; let task_metrics = self.metrics_store.clone(); + let (load_info_tx, load_info_rx) = tokio::sync::mpsc::unbounded_channel(); + let mut load_info_tx_opt = Some(load_info_tx); // Cannot use self.join_set because that's tied to the lifetime of the query, and the // metrics collection process might outlive the query's lifetime. @@ -242,9 +251,18 @@ impl<'a> StageCoordinator<'a> { task_metrics.insert(task_key.clone(), pre_order_metrics); } } + pb::worker_to_coordinator_msg::Inner::LoadInfo(load_info) => { + if let Some(tx) = &load_info_tx_opt { + let _ = tx.send(load_info); + } + } + pb::worker_to_coordinator_msg::Inner::LoadInfoEos(_) => { + let _ = load_info_tx_opt.take(); + } } } }); + load_info_rx } /// Spawns a background task in charge of sending messages to workers. 
Some things that are sent @@ -401,6 +419,28 @@ impl<'a> StageCoordinator<'a> { } Ok(routed_urls) } + + pub(super) fn find_input_stage_with_single_url(&self) -> Option { + let mut single_stage_url = None; + self.plan + .apply(|plan| { + let Some(nb) = plan.as_network_boundary() else { + return Ok(TreeNodeRecursion::Continue); + }; + + if let Stage::Remote(remote) = nb.input_stage() + && remote.workers.len() == 1 + { + single_stage_url = Some(remote.workers[0].clone()); + return Ok(TreeNodeRecursion::Stop); + } + + Ok(TreeNodeRecursion::Jump) + }) + .expect("Cannot fail"); + + single_stage_url + } } fn keep_stream_alive(notify: Arc) -> impl Stream + 'static { diff --git a/src/distributed_ext.rs b/src/distributed_ext.rs index 17852a04..d33b18a6 100644 --- a/src/distributed_ext.rs +++ b/src/distributed_ext.rs @@ -577,6 +577,27 @@ pub trait DistributedExt: Sized { P: WorkUnitFeedProvider + 'static, P::WorkUnit: 'static, F: Fn(&T) -> Option<&WorkUnitFeed

> + Send + Sync + 'static; + + /// Dynamically allocates tasks to the different stages based on runtime statistics + /// collected during execution. + fn with_distributed_dynamic_task_count(self, enabled: bool) -> Result; + + /// Same as [DistributedExt::with_distributed_dynamic_task_count] but with an in-place mutation. + fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>; + + /// Target throughput in bytes per partition per second used by the dynamic task count + /// allocator to decide how many tasks to assign to each stage based on runtime statistics. + fn with_distributed_bytes_per_partition_per_second( + self, + bytes_per_partition_per_second: usize, + ) -> Result; + + /// Same as [DistributedExt::with_distributed_bytes_per_partition_per_second] but with an + /// in-place mutation. + fn set_distributed_bytes_per_partition_per_second( + &mut self, + bytes_per_partition_per_second: usize, + ) -> Result<(), DataFusionError>; } impl DistributedExt for SessionConfig { @@ -722,6 +743,21 @@ impl DistributedExt for SessionConfig { }) } + fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError> { + let d_cfg = DistributedConfig::from_config_options_mut(self.options_mut())?; + d_cfg.dynamic_task_count = enabled; + Ok(()) + } + + fn set_distributed_bytes_per_partition_per_second( + &mut self, + bytes_per_partition_per_second: usize, + ) -> Result<(), DataFusionError> { + let d_cfg = DistributedConfig::from_config_options_mut(self.options_mut())?; + d_cfg.bytes_per_partition_per_second = bytes_per_partition_per_second; + Ok(()) + } + delegate! { to self { #[call(set_distributed_option_extension)] @@ -804,6 +840,14 @@ impl DistributedExt for SessionConfig { P: WorkUnitFeedProvider + 'static, P::WorkUnit: 'static, F: Fn(&T) -> Option<&WorkUnitFeed

> + Send + Sync + 'static; + + #[call(set_distributed_dynamic_task_count)] + #[expr($?;Ok(self))] + fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result; + + #[call(set_distributed_bytes_per_partition_per_second)] + #[expr($?;Ok(self))] + fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result; } } } @@ -915,6 +959,16 @@ impl DistributedExt for SessionStateBuilder { P: WorkUnitFeedProvider + 'static, P::WorkUnit: 'static, F: Fn(&T) -> Option<&WorkUnitFeed

> + Send + Sync + 'static; + + fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>; + #[call(set_distributed_dynamic_task_count)] + #[expr($?;Ok(self))] + fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result; + + fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>; + #[call(set_distributed_bytes_per_partition_per_second)] + #[expr($?;Ok(self))] + fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result; } } } @@ -1026,6 +1080,16 @@ impl DistributedExt for SessionState { P: WorkUnitFeedProvider + 'static, P::WorkUnit: 'static, F: Fn(&T) -> Option<&WorkUnitFeed

> + Send + Sync + 'static; + + fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>; + #[call(set_distributed_dynamic_task_count)] + #[expr($?;Ok(self))] + fn with_distributed_dynamic_task_count(mut self, enabled: bool) -> Result; + + fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>; + #[call(set_distributed_bytes_per_partition_per_second)] + #[expr($?;Ok(self))] + fn with_distributed_bytes_per_partition_per_second(mut self, bytes_per_partition_per_second: usize) -> Result; } } } @@ -1137,6 +1201,16 @@ impl DistributedExt for SessionContext { P: WorkUnitFeedProvider + 'static, P::WorkUnit: 'static, F: Fn(&T) -> Option<&WorkUnitFeed

> + Send + Sync + 'static; + + fn set_distributed_dynamic_task_count(&mut self, enabled: bool) -> Result<(), DataFusionError>; + #[call(set_distributed_dynamic_task_count)] + #[expr($?;Ok(self))] + fn with_distributed_dynamic_task_count(self, enabled: bool) -> Result; + + fn set_distributed_bytes_per_partition_per_second(&mut self, bytes_per_partition_per_second: usize) -> Result<(), DataFusionError>; + #[call(set_distributed_bytes_per_partition_per_second)] + #[expr($?;Ok(self))] + fn with_distributed_bytes_per_partition_per_second(self, bytes_per_partition_per_second: usize) -> Result; } } } diff --git a/src/distributed_planner/distributed_config.rs b/src/distributed_planner/distributed_config.rs index 6795fd95..c1e804e0 100644 --- a/src/distributed_planner/distributed_config.rs +++ b/src/distributed_planner/distributed_config.rs @@ -70,6 +70,15 @@ extensions_options! { /// should be used in serving the query. Some plans might not implement any kind of row count /// estimation, and this parameter sets the default estimated row count for those plans. pub default_estimated_row_count: Option, default = Some(0) + /// Calculates the task count of the different stages at execution time, based on runtime + /// information collected by sampling at the head of the stages. + /// + /// With this option enabled, the shape of the distributed plan is only known after fully + /// executing it, as it's dynamically created on the fly during execution. + pub dynamic_task_count: bool, default = false + /// If `dynamic_task_count` is enabled, this value is the amount of bytes/second each + /// partition is expected to handle. Lower values will result in greater parallelism. + pub bytes_per_partition_per_second: usize, default = 16 * 1024 * 1024 /// Collection of [TaskEstimator]s that will be applied to leaf nodes in order to /// estimate how many tasks should be spawned for the [Stage] containing the leaf node. 
pub(crate) __private_task_estimator: CombinedTaskEstimator, default = CombinedTaskEstimator::default() diff --git a/src/distributed_planner/distributed_query_planner.rs b/src/distributed_planner/distributed_query_planner.rs index f5ac6168..8e2d8f2c 100644 --- a/src/distributed_planner/distributed_query_planner.rs +++ b/src/distributed_planner/distributed_query_planner.rs @@ -108,6 +108,14 @@ impl QueryPlanner for DistributedQueryPlanner { plan = insert_broadcast_execs(plan, cfg)?; + if d_cfg.dynamic_task_count { + // The task count will be decided dynamically at execution time. + return Ok(Arc::new( + DistributedExec::new(plan).with_metrics_collection(d_cfg.collect_metrics), + )); + } + + // Compute per-node task counts and inject `Network*Exec` nodes at the stage boundaries. plan = inject_network_boundaries(plan, CardinalityBasedNetworkBoundaryBuilder, cfg).await?; plan = prepare_network_boundaries(plan)?; diff --git a/src/distributed_planner/inject_network_boundaries.rs b/src/distributed_planner/inject_network_boundaries.rs index 21898440..78fbe6b9 100644 --- a/src/distributed_planner/inject_network_boundaries.rs +++ b/src/distributed_planner/inject_network_boundaries.rs @@ -152,8 +152,9 @@ pub(crate) async fn inject_network_boundaries( #[derive(Clone)] pub(crate) struct InjectNetworkBoundaryContext<'a> { + pub(crate) d_cfg: &'a DistributedConfig, + cfg: &'a ConfigOptions, - d_cfg: &'a DistributedConfig, nb_builder: &'a (dyn NetworkBoundaryBuilder + Send + Sync), task_counts: &'a Mutex>, query_id: Uuid, @@ -161,7 +162,7 @@ pub(crate) struct InjectNetworkBoundaryContext<'a> { } impl<'a> InjectNetworkBoundaryContext<'a> { - fn max_tasks(&self) -> Result { + pub(crate) fn max_tasks(&self) -> Result { Ok(match self.d_cfg.max_tasks_per_stage { 0 => self .d_cfg @@ -190,7 +191,7 @@ impl<'a> InjectNetworkBoundaryContext<'a> { plan } - fn task_count(&self, plan: &Arc) -> Result { + pub(crate) fn task_count(&self, plan: &Arc) -> Result { let Some(task_count) = self 
.task_counts .lock() @@ -294,6 +295,7 @@ async fn _inject_network_boundaries( num: nb_ctx.fetch_add_stage_id(), plan: nb_ctx.plan_with_task_count(plan, task_count), tasks: task_count.as_usize(), + metrics_set: Default::default(), }; let result = nb_ctx .nb_builder @@ -323,6 +325,7 @@ async fn _inject_network_boundaries( num: nb_ctx.fetch_add_stage_id(), plan: nb_ctx.plan_with_task_count(plan, task_count), tasks: task_count.as_usize(), + metrics_set: Default::default(), }; let result = nb_ctx .nb_builder @@ -339,6 +342,7 @@ async fn _inject_network_boundaries( num: nb_ctx.fetch_add_stage_id(), plan: nb_ctx.plan_with_task_count(plan, task_count), tasks: task_count.as_usize(), + metrics_set: Default::default(), }; let result = nb_ctx .nb_builder @@ -409,7 +413,7 @@ async fn _inject_network_boundaries( /// - **Everything else**: recurse into children with the same `task_count`, then rebuild the /// node with the rebuilt children. impl InjectNetworkBoundaryContext<'_> { - fn propagate_task_count_until_network_boundaries( + pub(crate) fn propagate_task_count_until_network_boundaries( &self, plan: &Arc, task_count: TaskCountAnnotation, diff --git a/src/distributed_planner/mod.rs b/src/distributed_planner/mod.rs index f12c684c..18cfe6e0 100644 --- a/src/distributed_planner/mod.rs +++ b/src/distributed_planner/mod.rs @@ -11,8 +11,12 @@ mod statistics; mod task_estimator; pub use distributed_config::DistributedConfig; +pub(crate) use inject_network_boundaries::{ + InjectNetworkBoundaryContext, NetworkBoundaryBuilderResult, inject_network_boundaries, +}; +pub(crate) use network_boundary::ProducerHead; pub use network_boundary::{NetworkBoundary, NetworkBoundaryExt}; -pub(crate) use network_boundary::{ProducerHead, insert_producer_head}; pub use session_state_builder_ext::SessionStateBuilderExt; +pub(crate) use statistics::calculate_cost; pub(crate) use task_estimator::set_distributed_task_estimator; pub use task_estimator::{TaskCountAnnotation, TaskEstimation, TaskEstimator, 
TaskRoutingContext}; diff --git a/src/distributed_planner/network_boundary.rs b/src/distributed_planner/network_boundary.rs index 858a7e00..e6479e7e 100644 --- a/src/distributed_planner/network_boundary.rs +++ b/src/distributed_planner/network_boundary.rs @@ -1,3 +1,4 @@ +use crate::execution_plans::SamplerExec; use crate::{BroadcastExec, NetworkBroadcastExec, NetworkCoalesceExec, NetworkShuffleExec, Stage}; use datafusion::common::Result; use datafusion::physical_expr::Partitioning; @@ -13,7 +14,7 @@ pub trait NetworkBoundary: ExecutionPlan { /// information to perform any internal transformations necessary for distributed execution. /// /// Typically, [NetworkBoundary]s will use this call for transitioning from "Pending" to "ready". - fn with_input_stage(&self, input_stage: Stage) -> Result>; + fn with_input_stage(&self, input_stage: Stage) -> Result>; /// Returns the assigned input [Stage], if any. fn input_stage(&self) -> &Stage; @@ -59,28 +60,40 @@ impl NetworkBoundaryExt for dyn ExecutionPlan { } } -/// Ensures the head of the provided plan complies with the passed [ProducerHead] definition. This -/// can be called both during planning and lazily at runtime. -pub(crate) fn insert_producer_head( - input: Arc, - head: ProducerHead, -) -> Result> { - let input = if let Some(r_exec) = input.downcast_ref::() { - Arc::clone(r_exec.input()) - } else if let Some(b_exec) = input.downcast_ref::() { - Arc::clone(b_exec.input()) - } else { - input - }; - let plan = match head { - ProducerHead::None => input, - ProducerHead::BroadcastExec { output_partitions } => { - let partitions = input.output_partitioning().partition_count(); - Arc::new(BroadcastExec::new(input, output_partitions / partitions)) - } - ProducerHead::RepartitionExec { partitioning } => { - Arc::new(RepartitionExec::try_new(input, partitioning)?) +impl ProducerHead { + /// Ensures the head of the provided plan complies with the passed [ProducerHead] definition. 
This + /// can be called both during planning and lazily at runtime. + pub(crate) fn insert(self, input: Arc) -> Result> { + let input = if let Some(r_exec) = input.downcast_ref::() { + Arc::clone(r_exec.input()) + } else if let Some(b_exec) = input.downcast_ref::() { + Arc::clone(b_exec.input()) + } else { + input + }; + let plan = match self { + ProducerHead::None => input, + ProducerHead::BroadcastExec { output_partitions } => { + let partitions = input.output_partitioning().partition_count(); + Arc::new(BroadcastExec::new(input, output_partitions / partitions)) + } + ProducerHead::RepartitionExec { partitioning } => { + Arc::new(RepartitionExec::try_new(input, partitioning)?) + } + }; + Ok(plan) + } + + /// Injects a [SamplerExec] right below a [RepartitionExec] or [BroadcastExec]. + pub(crate) fn insert_sampler(input: Arc) -> Result> { + if let Some(r_exec) = input.downcast_ref::() { + let child = Arc::clone(r_exec.input()); + input.with_new_children(vec![Arc::new(SamplerExec::new(child))]) + } else if let Some(b_exec) = input.downcast_ref::() { + let child = Arc::clone(b_exec.input()); + input.with_new_children(vec![Arc::new(SamplerExec::new(child))]) + } else { + Ok(input) } - }; - Ok(plan) + } } diff --git a/src/distributed_planner/prepare_network_boundaries.rs b/src/distributed_planner/prepare_network_boundaries.rs index 1b6f1ac7..dd070793 100644 --- a/src/distributed_planner/prepare_network_boundaries.rs +++ b/src/distributed_planner/prepare_network_boundaries.rs @@ -1,5 +1,4 @@ use crate::common::TreeNodeExt; -use crate::distributed_planner::network_boundary::insert_producer_head; use crate::stage::LocalStage; use crate::{NetworkBoundaryExt, Stage}; use datafusion::common::Result; @@ -35,8 +34,9 @@ pub(crate) fn prepare_network_boundaries( // 2) Scale up the head node of the input stage in order to account for the amount of partition // and consumer count above it. 
- let plan = - insert_producer_head(Arc::clone(&input_stage.plan), nb.producer_head(task_count))?; + let plan = nb + .producer_head(task_count) + .insert(Arc::clone(&input_stage.plan))?; // 3) Make sure the input stage can be uniquely identified with a stage index and query id. // If there were already some `query_id` and `num` that's fine. @@ -45,6 +45,7 @@ pub(crate) fn prepare_network_boundaries( num: stage_id, plan, tasks: input_stage.tasks, + metrics_set: Default::default(), }))?; stage_id += 1; Ok(Transformed::yes(nb)) diff --git a/src/execution_plans/benchmarks/shuffle_bench.rs b/src/execution_plans/benchmarks/shuffle_bench.rs index 8406a052..4f46f1be 100644 --- a/src/execution_plans/benchmarks/shuffle_bench.rs +++ b/src/execution_plans/benchmarks/shuffle_bench.rs @@ -216,6 +216,7 @@ impl ShuffleFixture { query_id, num: 0, workers: self.input_stage_workers.clone(), + runtime_stats: None, }); let mut join_set = JoinSet::default(); diff --git a/src/execution_plans/benchmarks/transport_bench.rs b/src/execution_plans/benchmarks/transport_bench.rs index 847c307b..a386c7bd 100644 --- a/src/execution_plans/benchmarks/transport_bench.rs +++ b/src/execution_plans/benchmarks/transport_bench.rs @@ -266,6 +266,7 @@ impl TransportFixture { query_id, num: 0, workers: self.input_stage_tasks.clone(), + runtime_stats: None, }); let mut join_set = JoinSet::default(); diff --git a/src/execution_plans/mod.rs b/src/execution_plans/mod.rs index a1ea6316..ecdcbc35 100644 --- a/src/execution_plans/mod.rs +++ b/src/execution_plans/mod.rs @@ -6,6 +6,7 @@ mod metrics; mod network_broadcast; mod network_coalesce; mod network_shuffle; +mod sampler; #[cfg(any(test, feature = "integration"))] pub mod benchmarks; @@ -18,3 +19,4 @@ pub(crate) use metrics::MetricsWrapperExec; pub use network_broadcast::NetworkBroadcastExec; pub use network_coalesce::NetworkCoalesceExec; pub use network_shuffle::NetworkShuffleExec; +pub use sampler::SamplerExec; diff --git 
a/src/execution_plans/network_broadcast.rs b/src/execution_plans/network_broadcast.rs index f9dee080..251ed29c 100644 --- a/src/execution_plans/network_broadcast.rs +++ b/src/execution_plans/network_broadcast.rs @@ -9,7 +9,7 @@ use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_expr_common::metrics::MetricsSet; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics, }; use std::fmt::Formatter; use std::sync::Arc; @@ -153,6 +153,7 @@ impl NetworkBroadcastExec { num: 0, plan: input, tasks: producer_tasks, + metrics_set: Default::default(), }), input_properties, )) @@ -160,7 +161,7 @@ impl NetworkBroadcastExec { } impl NetworkBoundary for NetworkBroadcastExec { - fn with_input_stage(&self, input_stage: Stage) -> Result> { + fn with_input_stage(&self, input_stage: Stage) -> Result> { let mut self_clone = self.clone(); self_clone.worker_connections = WorkerConnectionPool::new(input_stage.task_count()); self_clone.input_stage = input_stage; @@ -268,4 +269,12 @@ impl ExecutionPlan for NetworkBroadcastExec { fn metrics(&self) -> Option { Some(self.worker_connections.metrics.clone_inner()) } + + fn partition_statistics(&self, partition: Option) -> Result> { + self.input_stage.partition_statistics( + partition, + self.properties.output_partitioning().partition_count(), + self.schema(), + ) + } } diff --git a/src/execution_plans/network_coalesce.rs b/src/execution_plans/network_coalesce.rs index 8fb06e74..1582d08c 100644 --- a/src/execution_plans/network_coalesce.rs +++ b/src/execution_plans/network_coalesce.rs @@ -12,7 +12,7 @@ use datafusion::physical_plan::limit::LocalLimitExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, 
EmptyRecordBatchStream, ExecutionPlan, PlanProperties, - internal_err, + Statistics, internal_err, }; use std::fmt::{Debug, Formatter}; use std::sync::Arc; @@ -131,6 +131,7 @@ impl NetworkCoalesceExec { num: 0, plan: input, tasks: producer_tasks, + metrics_set: Default::default(), }), input_properties, consumer_tasks, @@ -157,6 +158,7 @@ impl NetworkCoalesceExec { num: local.num, plan: input_with_fetch, tasks: local.tasks, + metrics_set: Default::default(), }); Ok(Arc::new(self_clone)) } @@ -167,7 +169,7 @@ impl NetworkBoundary for NetworkCoalesceExec { &self.input_stage } - fn with_input_stage(&self, input_stage: Stage) -> Result> { + fn with_input_stage(&self, input_stage: Stage) -> Result> { let mut self_clone = self.clone(); self_clone.properties = scale_partitioning_props(self_clone.properties(), |p| { p * input_stage.task_count() / self_clone.input_stage.task_count().max(1) @@ -247,10 +249,8 @@ impl ExecutionPlan for NetworkCoalesceExec { ); } - let partitions_per_task = self - .properties() - .partitioning - .partition_count() + let out_partitions = self.properties().partitioning.partition_count(); + let partitions_per_task = out_partitions .checked_div( self.input_stage .task_count() @@ -311,6 +311,14 @@ impl ExecutionPlan for NetworkCoalesceExec { fn metrics(&self) -> Option { Some(self.worker_connections.metrics.clone_inner()) } + + fn partition_statistics(&self, partition: Option) -> Result> { + self.input_stage.partition_statistics( + partition, + self.properties.output_partitioning().partition_count(), + self.schema(), + ) + } } #[derive(Debug, Clone, Copy)] diff --git a/src/execution_plans/network_shuffle.rs b/src/execution_plans/network_shuffle.rs index 157cfd99..2a575b42 100644 --- a/src/execution_plans/network_shuffle.rs +++ b/src/execution_plans/network_shuffle.rs @@ -11,7 +11,9 @@ use datafusion::physical_expr::Partitioning; use datafusion::physical_expr_common::metrics::MetricsSet; use datafusion::physical_plan::repartition::RepartitionExec; use 
datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, Statistics, +}; use std::fmt::Formatter; use std::sync::Arc; use uuid::Uuid; @@ -133,6 +135,7 @@ impl NetworkShuffleExec { num: 0, plan: input, tasks: producer_tasks, + metrics_set: Default::default(), }), input_properties, )) @@ -144,7 +147,7 @@ impl NetworkBoundary for NetworkShuffleExec { &self.input_stage } - fn with_input_stage(&self, input_stage: Stage) -> Result> { + fn with_input_stage(&self, input_stage: Stage) -> Result> { let mut self_clone = self.clone(); self_clone.worker_connections = WorkerConnectionPool::new(input_stage.task_count()); self_clone.input_stage = input_stage; @@ -217,7 +220,8 @@ impl ExecutionPlan for NetworkShuffleExec { }; let task_context = DistributedTaskContext::from_ctx(&context); - let off = self.properties.partitioning.partition_count() * task_context.task_index; + let out_partitions = self.properties.partitioning.partition_count(); + let off = out_partitions * task_context.task_index; let mut streams = Vec::with_capacity(remote_stage.workers.len()); for input_task_index in 0..remote_stage.workers.len() { @@ -242,4 +246,12 @@ impl ExecutionPlan for NetworkShuffleExec { fn metrics(&self) -> Option { Some(self.worker_connections.metrics.clone_inner()) } + + fn partition_statistics(&self, partition: Option) -> Result> { + self.input_stage.partition_statistics( + partition, + self.properties.output_partitioning().partition_count(), + self.schema(), + ) + } } diff --git a/src/execution_plans/sampler.rs b/src/execution_plans/sampler.rs new file mode 100644 index 00000000..0e120219 --- /dev/null +++ b/src/execution_plans/sampler.rs @@ -0,0 +1,594 @@ +use crate::common::{require_one_child, vec_cast}; +use crate::worker::generated::worker as pb; +use crate::{ + BytesCounterMetric, 
BytesMetricExt, GaugeMetricExt, LatencyMetricExt, MaxGaugeMetric, + MaxLatencyMetric, P50LatencyMetric, +}; +use datafusion::arrow::array::Array; +use datafusion::arrow::array::ArrayRef; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::runtime::SpawnedTask; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion::common::{DataFusionError, Result, exec_err}; +use datafusion::common::{HashSet, ScalarValue}; +use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr_common::metrics::{Gauge, MetricValue, MetricsSet}; +use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use futures::{FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt}; +use std::collections::VecDeque; +use std::fmt::{Debug, Formatter}; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, LazyLock, Mutex, OnceLock}; +use std::task::{Context, Poll}; +use std::time::Instant; +use tokio::sync::oneshot; + +/// How many [RecordBatch]s to allow the input stream to yield synchronously (without yielding back +/// to tokio) before short-circuiting buffering. +const READY_CHUNK_LIMIT: usize = 256; +/// Maximum read of bytes per second allowed to be emitted. Reads greater than this will be +/// truncated to this max value, as it's assumed that [READY_CHUNK_LIMIT] was hit and no useful +/// measurement can actually be emitted. +const MAX_BYTES_PER_SECOND: usize = 512 * 1024 * 1024; +/// Maximum number of rows per second allowed to be emitted. 
Reads greater than this will be +/// truncated to this max value, as it's assumed that [READY_CHUNK_LIMIT] was hit and no useful +/// measurement can actually be emitted. +const MAX_ROWS_PER_SECOND: usize = 1024 * 1024; +/// Maximum number of rows sampled from the peek queue when estimating per-column NDV. +const NDV_MAX_ROWS_SAMPLE: usize = 1000; + +#[derive(Debug)] +pub struct SamplerExec { + pub(crate) input: Arc, + pub(crate) metric_set: ExecutionPlanMetricsSet, + pub(crate) partition_samplers: Vec, + pub(crate) execution_started: Arc, +} + +/// Metrics that quantify how long the sampler held data in memory before the consumer +/// (real execution) attached, plus the peak accumulated size reached. All metrics are shared +/// across the partition samplers; the latency metrics aggregate per-partition observations. +#[derive(Debug, Clone)] +pub(crate) struct SamplerExecMetrics { + /// Time since [SamplerExec::kick_off_first_sampler] was called until the first batch from + /// the input arrived + kick_off_to_fist_batch_p50: P50LatencyMetric, + kick_off_to_fist_batch_max: MaxLatencyMetric, + /// Time since [SamplerExec::kick_off_first_sampler] was called until the [pb::LoadInfo] message + /// was sent. + kick_off_to_load_info_sent_p50: P50LatencyMetric, + kick_off_to_load_info_sent_max: MaxLatencyMetric, + /// Time since [SamplerExec::kick_off_first_sampler] was called until the node was properly + /// executed with [SamplerExec::execute]. + kick_off_to_execution_p50: P50LatencyMetric, + kick_off_to_execution_max: MaxLatencyMetric, + /// Maximum number of record batches peeked by a sampler. + max_batches_peeked: MaxGaugeMetric, + /// Peak memory accumulated by any partition sampler during the sampling phase. + max_mem_used: Gauge, + /// Bytes per second flowing through the sampler node. + bytes_per_sec: BytesCounterMetric, + /// Bytes ready at the moment of reporting load info. + bytes_ready: BytesCounterMetric, + /// Elapsed compute while sampling. 
+ elapsed_compute: Time, +} + +impl SamplerExecMetrics { + fn new(metric_set: &ExecutionPlanMetricsSet) -> Self { + let bdr = || MetricBuilder::new(metric_set); + Self { + kick_off_to_fist_batch_p50: bdr().p50_latency("kick_off_to_first_batch_p50"), + kick_off_to_fist_batch_max: bdr().max_latency("kick_off_to_first_batch_max"), + kick_off_to_load_info_sent_p50: bdr().p50_latency("kick_off_to_load_info_sent_p50"), + kick_off_to_load_info_sent_max: bdr().max_latency("kick_off_to_load_info_sent_max"), + kick_off_to_execution_p50: bdr().p50_latency("kick_off_to_execution_p50"), + kick_off_to_execution_max: bdr().max_latency("kick_off_to_execution_max"), + max_batches_peeked: bdr().max_gauge("max_batches_peeked"), + max_mem_used: bdr().global_gauge("max_mem_used"), + bytes_per_sec: bdr().bytes_counter("bytes_per_sec"), + bytes_ready: bdr().bytes_counter("bytes_ready"), + elapsed_compute: { + let time = Time::new(); + bdr().build(MetricValue::ElapsedCompute(time.clone())); + time + }, + } + } +} + +impl SamplerExec { + pub(crate) fn new(input: Arc) -> Self { + let metric_set = ExecutionPlanMetricsSet::new(); + let metric_set_clone = metric_set.clone(); + // Metrics need to be lazily initialized, otherwise the coordinator side will register + // them when they are never relevant there, they are just relevant in workers. + // + // If we don't do this, the [SamplerExec] constructed during planning will register its + // own zeroed SamplerExecMetrics in the ExecutionPlanMetricsSet, even if the metrics we care + // about are just the ones collected in workers during execution. 
+ let metrics: Arc SamplerExecMetrics + Send>>> = + Arc::new(LazyLock::new(Box::new(move || { + SamplerExecMetrics::new(&metric_set_clone) + }))); + let partitions = input.properties().partitioning.partition_count(); + let execution_started = Arc::new(AtomicBool::new(false)); + let mut samplers = Vec::with_capacity(partitions); + for i in 0..partitions { + samplers.push(PartitionSampler { + partition_idx: i, + input: Arc::clone(&input), + stream: Mutex::new(None), + metrics: Arc::clone(&metrics), + kick_off_at: Arc::new(OnceLock::new()), + first_batch_at: Arc::new(OnceLock::new()), + load_info_sent_at: Arc::new(OnceLock::new()), + execution_started: Arc::clone(&execution_started), + }); + } + Self { + input, + metric_set, + partition_samplers: samplers, + execution_started, + } + } + + pub(crate) fn kick_off_first_sampler( + plan: Arc, + ctx: Arc, + ) -> Result>> { + let mut receivers = vec![]; + plan.apply(|plan| { + let Some(sampler) = plan.downcast_ref::() else { + return Ok(TreeNodeRecursion::Continue); + }; + receivers.reserve(sampler.partition_samplers.len()); + for partition_sampler in &sampler.partition_samplers { + let rx = partition_sampler.kick_off(Arc::clone(&ctx))?; + receivers.push(rx); + } + Ok(TreeNodeRecursion::Stop) + })?; + Ok(receivers) + } +} + +pub(crate) struct PartitionSampler { + partition_idx: usize, + input: Arc, + stream: Mutex>, + execution_started: Arc, + + // Metrics state. + metrics: Arc SamplerExecMetrics + Send>>>, + /// Set when `kick_off` is invoked. Used at `execute()` time to record how long the + /// sampler sampled data before the consumer attached. + kick_off_at: Arc>, + /// Set the first time the producer task emits a `LoadInfo`. Used at `execute()` time + /// to record the gap between the first sample and the consumer starting. + first_batch_at: Arc>, + /// Set immediately after `sampling_tx.send()` succeeds. 
Used to measure the full + /// round-trip: LoadInfo sent → coordinator collects votes → downstream plan dispatched + /// → consumer calls execute(). + load_info_sent_at: Arc>, +} + +impl Debug for PartitionSampler { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PartitionSampler").finish() + } +} + +impl PartitionSampler { + fn start_stream(&self) -> Option { + let Some(kick_off_at) = self.kick_off_at.get() else { + return self.stream.lock().unwrap().take(); + }; + + // Time since this sampler was kicked off until the first batch arrived. + if let Some(t) = self.first_batch_at.get() { + let delay = t.saturating_duration_since(*kick_off_at); + self.metrics.kick_off_to_fist_batch_p50.add_duration(delay); + self.metrics.kick_off_to_fist_batch_max.add_duration(delay); + } + + // Time since the sampler was kicked off until the pb::LoadInfo message was sent. + if let Some(t) = self.load_info_sent_at.get() { + let delay = t.saturating_duration_since(*kick_off_at); + self.metrics + .kick_off_to_load_info_sent_p50 + .add_duration(delay); + self.metrics + .kick_off_to_load_info_sent_max + .add_duration(delay); + } + + // Time since the sampler was kicked off until it started executing. 
+ let delay = kick_off_at.elapsed(); + self.metrics.kick_off_to_execution_p50.add_duration(delay); + self.metrics.kick_off_to_execution_max.add_duration(delay); + + self.stream.lock().unwrap().take() + } + + fn kick_off(&self, ctx: Arc) -> Result> { + let _ = self.kick_off_at.set(Instant::now()); + let (sampling_tx, sampling_rx) = oneshot::channel(); + + let input = Arc::clone(&self.input); + let partition_idx = self.partition_idx; + let schema = input.schema(); + let elapsed_compute = self.metrics.elapsed_compute.clone(); + let first_batch_at = Arc::clone(&self.first_batch_at); + let n_cols = self.input.schema().fields.len(); + + let reporter = LoadInfoDropHandler { + load_info: zero_load_info(partition_idx, n_cols), + sampling_tx: Some(sampling_tx), + bytes_per_second_metric: self.metrics.bytes_per_sec.clone(), + load_info_sent_at: Arc::clone(&self.load_info_sent_at), + bytes_ready_metric: self.metrics.bytes_ready.clone(), + omit: Arc::clone(&self.execution_started), + }; + + let mut peek = RecordBatchPeek { + peek: VecDeque::new(), + n_cols, + max_mem_used: self.metrics.max_mem_used.clone(), + max_batches_peeked: self.metrics.max_batches_peeked.clone(), + memory_reservation: Arc::new( + MemoryConsumer::new(format!("PartitionSampler[{partition_idx}]")) + .register(ctx.memory_pool()), + ), + first_batch_at: Arc::clone(&self.first_batch_at), + }; + + // Execute the input synchronously so any setup error surfaces before we + // spawn the producer task. + let mut input_stream = input.execute(partition_idx, ctx)?.fuse(); + + let task = SpawnedTask::spawn(async move { + // First, read at once all the RecordBatches that are ready to be yielded synchronously. + // Some downstream nodes will accumulate data in-memory, and will then yield several + // RecordBatches at once synchronously (without Poll::Pending gaps in between). 
+ let mut chunked = (&mut input_stream).ready_chunks(READY_CHUNK_LIMIT); + let Some(batches) = chunked.next().await else { + // Not a single RecordBatch was produced, so let bytes_per_second=0 be sent as-is. + return Ok(peek.chain(input_stream).boxed()); + }; + let _elapsed_compute_timer = elapsed_compute.timer(); + for batch in batches { + let _ = first_batch_at.set(Instant::now()); + peek.push(batch?); + } + + // Peek whether there is more data to be produced. + if let Some(result) = input_stream.next().now_or_never() { + return if let Some(batch) = result { + // A batch was immediately available without hitting an async gap (the input is + // still yielding synchronously). store it so its rows are not lost. We cannot + // measure a meaningful arrival velocity in this case, so as before, assume the + // worst. + peek.push(batch?); + reporter.report(&peek, MAX_BYTES_PER_SECOND, MAX_ROWS_PER_SECOND); + Ok(peek.chain(input_stream).boxed()) + } else { + // No more batches to read, so no velocity measurement. + reporter.report(&peek, 0, 0); + Ok(peek.chain(input_stream).boxed()) + }; + } + + drop(_elapsed_compute_timer); + + // Wait for an async gap in order to measure data velocity. + let poll_start = Instant::now(); + let Some(batch) = input_stream.try_next().await? else { + let _elapsed_compute_timer = elapsed_compute.timer(); + // The last message was somehow the last message in the stream, but the stream did + // not end immediately. This is an unlikely scenario. + reporter.report(&peek, 0, 0); + return Ok(peek.chain(input_stream).boxed()); + }; + let _elapsed_compute_timer = elapsed_compute.timer(); + + let bytes_per_second = + (record_batch_size(&batch) as f32 / poll_start.elapsed().as_secs_f32()) as usize; + let rows_per_second = + (batch.num_rows() as f32 / poll_start.elapsed().as_secs_f32()) as usize; + + peek.push(batch); + + // Some RecordBatches where buffered, but there's more to be yielded, so both + // bytes_per_second and bytes_ready can be reported. 
+ reporter.report(&peek, bytes_per_second, rows_per_second); + + Ok(peek.chain(input_stream).boxed()) + }); + + let stream = async move { + task.await + .map_err(|err| DataFusionError::Internal(err.to_string()))? + } + .try_flatten_stream(); + + self.stream + .lock() + .expect("poisoned lock") + .replace(Box::pin(RecordBatchStreamAdapter::new(schema, stream))); + + Ok(sampling_rx) + } +} + +/// Wraps a [pb::LoadInfo] and emits it on [Drop] through the provided [oneshot] channel. +/// +/// Emitting on drop ensures that it's always emitted. +struct LoadInfoDropHandler { + omit: Arc, + + load_info: pb::LoadInfo, + bytes_ready_metric: BytesCounterMetric, + bytes_per_second_metric: BytesCounterMetric, + sampling_tx: Option>, + load_info_sent_at: Arc>, +} + +impl LoadInfoDropHandler { + fn report(mut self, peek: &RecordBatchPeek, bps: usize, rps: usize) { + if self.omit.load(Ordering::Relaxed) { + return; + } + + self.set_per_col_bytes_ready(peek.per_col_bytes_ready()); + self.set_rows_ready(peek.rows_ready()); + self.set_per_col_ndv(peek.per_col_ndv()); + self.set_per_col_null_pct(peek.per_col_null_pct()); + self.set_per_col_bytes_per_second(bps); + self.set_rows_per_second(rps) + } + + fn set_per_col_bytes_ready(&mut self, bytes_ready: Vec) { + self.load_info.per_column_bytes_ready = vec_cast(&bytes_ready); + self.bytes_ready_metric.add_bytes(bytes_ready.iter().sum()); + } + + fn set_per_col_bytes_per_second(&mut self, total_bytes_per_second: usize) { + let per_col_ready: &[u64] = &self.load_info.per_column_bytes_ready; + let total_ready: u64 = per_col_ready.iter().sum(); + let per_col: Vec = if total_ready == 0 { + vec![total_bytes_per_second / per_col_ready.len().max(1); per_col_ready.len()] + } else { + per_col_ready + .iter() + .map(|&ready| { + (ready.saturating_mul(total_bytes_per_second as u64) / total_ready) as usize + }) + .collect() + }; + self.load_info.per_column_bytes_per_second = vec_cast(&per_col); + self.bytes_per_second_metric + 
.add_bytes(total_bytes_per_second); + } + + fn set_rows_ready(&mut self, rows_ready: usize) { + self.load_info.rows_ready = rows_ready as u64; + } + + fn set_rows_per_second(&mut self, rows_per_second: usize) { + self.load_info.rows_per_second = rows_per_second as u64; + } + + fn set_per_col_ndv(&mut self, per_column_ndv: Vec) { + self.load_info.per_column_ndv_percentage = per_column_ndv; + } + + fn set_per_col_null_pct(&mut self, per_column_null_pct: Vec) { + self.load_info.per_column_null_percentage = per_column_null_pct; + } +} + +impl Drop for LoadInfoDropHandler { + fn drop(&mut self) { + if self.omit.load(Ordering::Relaxed) { + return; + } + if let Some(sampling_tx) = self.sampling_tx.take() { + let _ = sampling_tx.send(std::mem::take(&mut self.load_info)); + let _ = self.load_info_sent_at.set(Instant::now()); + } + } +} + +fn zero_load_info(partition_idx: usize, n_cols: usize) -> pb::LoadInfo { + pb::LoadInfo { + partition: partition_idx as u64, + rows_per_second: 0, + rows_ready: 0, + per_column_bytes_per_second: vec![0; n_cols], + per_column_bytes_ready: vec![0; n_cols], + per_column_ndv_percentage: vec![0.0; n_cols], + per_column_null_percentage: vec![0.0; n_cols], + } +} + +struct RecordBatchPeek { + peek: VecDeque, + n_cols: usize, + max_batches_peeked: MaxGaugeMetric, + max_mem_used: Gauge, + memory_reservation: Arc, + first_batch_at: Arc>, +} + +impl RecordBatchPeek { + fn push(&mut self, batch: RecordBatch) { + let batch_size = record_batch_size(&batch); + if self.peek.is_empty() { + let _ = self.first_batch_at.set(Instant::now()); + } + self.max_mem_used.add(batch_size); + self.memory_reservation.grow(batch_size); + self.peek.push_back(batch); + self.max_batches_peeked.set_max(self.peek.len()); + } + + fn per_col_bytes_ready(&self) -> Vec { + let mut result = vec![0; self.n_cols]; + for batch in self.peek.iter() { + for (i, col) in batch.columns().iter().enumerate() { + result[i] += column_size(col) + } + } + result + } + + fn rows_ready(&self) -> 
usize { + self.peek.iter().map(|batch| batch.num_rows()).sum() + } + + fn per_col_ndv(&self) -> Vec { + let total_rows: usize = self.peek.iter().map(|b| b.num_rows()).sum(); + if total_rows == 0 { + return vec![0.0; self.n_cols]; + } + + // Build the list of flat row indices to sample, sorted for cache-friendly access. + let sampled_indices: Vec = if total_rows <= NDV_MAX_ROWS_SAMPLE { + (0..total_rows).collect() + } else { + let mut indices = + rand::seq::index::sample(&mut rand::rng(), total_rows, NDV_MAX_ROWS_SAMPLE) + .into_vec(); + indices.sort_unstable(); + indices + }; + let rows_sampled = sampled_indices.len(); + + let mut sets: Vec> = vec![HashSet::new(); self.n_cols]; + let mut flat_base = 0usize; + let mut sample_pos = 0usize; + + for batch in &self.peek { + let batch_end = flat_base + batch.num_rows(); + while sample_pos < sampled_indices.len() && sampled_indices[sample_pos] < batch_end { + let row = sampled_indices[sample_pos] - flat_base; + for (col_idx, set) in sets.iter_mut().enumerate() { + let col = batch.column(col_idx); + if !col.is_null(row) + && let Ok(v) = ScalarValue::try_from_array(col, row) + { + set.insert(v); + } + } + sample_pos += 1; + } + if sample_pos >= sampled_indices.len() { + break; + } + flat_base = batch_end; + } + + sets.into_iter() + .map(|s| s.len() as f32 / rows_sampled as f32) + .collect() + } + + fn per_col_null_pct(&self) -> Vec { + let total_rows: usize = self.peek.iter().map(|b| b.num_rows()).sum(); + if total_rows == 0 { + return vec![0.0; self.n_cols]; + } + let mut counts = vec![0usize; self.n_cols]; + for batch in &self.peek { + for (col_idx, count) in counts.iter_mut().enumerate() { + *count += batch.column(col_idx).null_count(); + } + } + counts + .into_iter() + .map(|c| c as f32 / total_rows as f32) + .collect() + } +} + +impl Stream for RecordBatchPeek { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + match self.as_mut().peek.pop_front() { + None => 
Poll::Ready(None), + Some(batch) => { + self.memory_reservation.shrink(record_batch_size(&batch)); + Poll::Ready(Some(Ok(batch))) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.peek.len(), Some(self.peek.len())) + } +} + +fn column_size(arr: &ArrayRef) -> usize { + arr.to_data().get_slice_memory_size().unwrap_or(0) +} + +fn record_batch_size(batch: &RecordBatch) -> usize { + batch.columns().iter().map(column_size).sum() +} + +impl DisplayAs for SamplerExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "SamplerExec: partitions={}", + self.partition_samplers.len() + ) + } +} + +impl ExecutionPlan for SamplerExec { + fn name(&self) -> &str { + "SamplerExec" + } + + fn properties(&self) -> &Arc { + self.input.properties() + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(Self::new(require_one_child(children)?))) + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> Result { + self.execution_started.store(true, Ordering::Relaxed); + let Some(partition_sampler) = self.partition_samplers.get(partition) else { + return exec_err!("Partition {partition} not available in SamplerExec"); + }; + let Some(stream) = partition_sampler.start_stream() else { + return exec_err!("SamplerExec[{partition}] was not kicked off"); + }; + Ok(stream) + } + + fn metrics(&self) -> Option { + Some(self.metric_set.clone_inner()) + } +} diff --git a/src/metrics/bytes_metric.rs b/src/metrics/bytes_metric.rs index 3bdd7f5a..5d8c7e6c 100644 --- a/src/metrics/bytes_metric.rs +++ b/src/metrics/bytes_metric.rs @@ -5,6 +5,7 @@ use std::{ sync::{Arc, atomic::AtomicUsize}, }; +use datafusion::physical_plan::Metric; use datafusion::{ common::human_readable_size, physical_plan::metrics::{CustomMetricValue, MetricBuilder, MetricValue}, @@ -50,6 +51,16 @@ impl Default for BytesCounterMetric { } impl 
BytesCounterMetric { + pub fn new_metric(name: impl Into>, bytes: usize) -> Arc { + Arc::new(Metric::new( + MetricValue::Custom { + name: name.into(), + value: Arc::new(BytesCounterMetric::from_value(bytes)), + }, + None, + )) + } + pub fn from_value(bytes: usize) -> Self { Self { bytes: Arc::new(AtomicUsize::new(bytes)), diff --git a/src/metrics/task_metrics_rewriter.rs b/src/metrics/task_metrics_rewriter.rs index 3ba5c88c..87d93c96 100644 --- a/src/metrics/task_metrics_rewriter.rs +++ b/src/metrics/task_metrics_rewriter.rs @@ -81,6 +81,7 @@ pub async fn rewrite_distributed_plan_with_metrics( num: stage.num, plan: plan_with_metrics, tasks: stage.tasks, + metrics_set: stage.metrics_set.clone(), }))?; let network_boundary = MetricsWrapperExec::new(network_boundary, plan.metrics().unwrap_or_default()); @@ -415,6 +416,7 @@ mod tests { num: 2, plan, tasks: 4, + metrics_set: Default::default(), } } diff --git a/src/protobuf/distributed_codec.rs b/src/protobuf/distributed_codec.rs index 3cfec30a..1ea27541 100644 --- a/src/protobuf/distributed_codec.rs +++ b/src/protobuf/distributed_codec.rs @@ -1,9 +1,9 @@ use super::get_distributed_user_codecs; use crate::NetworkShuffleExec; -use crate::common::{deserialize_uuid, serialize_uuid}; +use crate::common::{deserialize_uuid, require_one_child, serialize_uuid}; use crate::execution_plans::{ BroadcastExec, ChildWeight, ChildrenIsolatorUnionExec, NetworkBroadcastExec, - NetworkCoalesceExec, + NetworkCoalesceExec, SamplerExec, }; use crate::stage::{LocalStage, RemoteStage, Stage}; use crate::worker::WorkerConnectionPool; @@ -74,6 +74,7 @@ impl PhysicalExtensionCodec for DistributedCodec { num: proto.num as usize, plan: input, tasks: proto.tasks.len(), + metrics_set: Default::default(), })) } else { let mut worker_urls = Vec::with_capacity(proto.tasks.len()); @@ -90,6 +91,7 @@ impl PhysicalExtensionCodec for DistributedCodec { query_id: deserialize_uuid(proto.query_id.as_ref())?, num: proto.num as usize, workers: worker_urls, + 
runtime_stats: None, })) } } @@ -233,6 +235,9 @@ impl PhysicalExtensionCodec for DistributedCodec { .collect(), })) } + DistributedExecNode::Sampler(SamplerExecProto {}) => { + Ok(Arc::new(SamplerExec::new(require_one_child(inputs)?))) + } } } @@ -349,6 +354,14 @@ impl PhysicalExtensionCodec for DistributedCodec { node: Some(DistributedExecNode::ChildrenIsolatorUnion(inner)), }; + wrapper.encode(buf).map_err(|e| proto_error(format!("{e}"))) + } else if let Some(_node) = node.downcast_ref::() { + let inner = SamplerExecProto {}; + + let wrapper = DistributedExecProto { + node: Some(DistributedExecNode::Sampler(inner)), + }; + wrapper.encode(buf).map_err(|e| proto_error(format!("{e}"))) } else { Err(proto_error(format!("Unexpected plan {}", node.name()))) @@ -380,7 +393,7 @@ pub struct ExecutionTaskProto { #[derive(Clone, PartialEq, ::prost::Message)] pub struct DistributedExecProto { - #[prost(oneof = "DistributedExecNode", tags = "1, 2, 3, 4, 5, 6")] + #[prost(oneof = "DistributedExecNode", tags = "1, 2, 3, 4, 5, 6, 7")] pub node: Option, } @@ -397,6 +410,8 @@ pub enum DistributedExecNode { NetworkBroadcast(NetworkBroadcastExecProto), #[prost(message, tag = "6")] Broadcast(BroadcastExecProto), + #[prost(message, tag = "7")] + Sampler(SamplerExecProto), } /// Protobuf representation of the [NetworkShuffleExec] physical node. 
It serves as @@ -509,6 +524,9 @@ pub struct BroadcastExecProto { pub consumer_task_count: u64, } +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SamplerExecProto {} + fn new_network_broadcast_exec( partitioning: Partitioning, schema: SchemaRef, @@ -547,6 +565,7 @@ mod tests { query_id: Default::default(), num: 0, workers: vec![], + runtime_stats: None, }) } @@ -556,6 +575,7 @@ mod tests { num: 0, plan: empty_exec(), tasks: 1, + metrics_set: Default::default(), }) } diff --git a/src/stage.rs b/src/stage.rs index 0545d718..3ac39e7a 100644 --- a/src/stage.rs +++ b/src/stage.rs @@ -1,13 +1,15 @@ use crate::coordinator::{DistributedExec, MetricsStore}; use crate::execution_plans::{DistributedLeafExec, NetworkCoalesceExec}; use crate::metrics::DISTRIBUTED_DATAFUSION_TASK_ID_LABEL; -use datafusion::common::{HashMap, config_err}; +use datafusion::common::{HashMap, Statistics, config_err}; use datafusion::common::{exec_err, plan_err}; use datafusion::error::Result; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::metrics::{Label, Metric, MetricsSet}; -use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; +use datafusion::physical_plan::{ + ColumnStatistics, ExecutionPlan, ExecutionPlanProperties, displayable, +}; use itertools::Either; use std::collections::VecDeque; use std::sync::Arc; @@ -84,6 +86,8 @@ pub struct LocalStage { pub plan: Arc, /// The number of tasks the stage has. pub tasks: usize, + /// Metrics collected by the coordinator + pub metrics_set: MetricsSet, } impl LocalStage { @@ -107,6 +111,8 @@ pub struct RemoteStage { pub num: usize, /// The worker URLs to which queries should be issued. pub workers: Vec, + /// Statistics collected at runtime, if any. 
+ pub runtime_stats: Option>, } impl Stage { @@ -137,6 +143,63 @@ impl Stage { Self::Remote(_) => None, } } + + pub fn metrics(&self) -> MetricsSet { + match &self { + Self::Local(v) => v.metrics_set.clone(), + Self::Remote(_) => MetricsSet::new(), + } + } + + pub fn partition_statistics( + &self, + partition: Option, + partition_count: usize, + schema: SchemaRef, + ) -> Result> { + match self { + Stage::Local(local) => local.plan.partition_statistics(partition), + Stage::Remote(remote) => { + let Some(runtime_stats) = &remote.runtime_stats else { + return Ok(Arc::new(Statistics::new_unknown(&schema))); + }; + match partition { + None => Ok(Arc::clone(runtime_stats)), + Some(_) => Ok(Arc::new(multiply_stats( + runtime_stats, + 1.0 / partition_count as f32, + ))), + } + } + } + } +} + +fn multiply_stats(stats: &Statistics, f: f32) -> Statistics { + Statistics { + num_rows: multiply_precision(stats.num_rows, f), + total_byte_size: multiply_precision(stats.total_byte_size, f), + column_statistics: stats + .column_statistics + .iter() + .map(|col| ColumnStatistics { + null_count: multiply_precision(col.null_count, f), + max_value: Precision::Absent, + min_value: Precision::Absent, + sum_value: Precision::Absent, + distinct_count: multiply_precision(col.distinct_count, f), + byte_size: multiply_precision(col.byte_size, f), + }) + .collect(), + } +} + +fn multiply_precision(p: Precision, f: f32) -> Precision { + match p { + Precision::Exact(v) => Precision::Exact((v as f32 * f) as usize), + Precision::Inexact(v) => Precision::Inexact((v as f32 * f) as usize), + Precision::Absent => Precision::Absent, + } } #[derive(Debug, Clone, Copy, PartialEq)] @@ -161,7 +224,9 @@ use crate::metrics::proto::metric_proto_to_df; use crate::worker::generated::worker as pb; use crate::{DistributedMetricsFormat, NetworkShuffleExec, rewrite_distributed_plan_with_metrics}; use crate::{NetworkBoundary, NetworkBoundaryExt}; +use datafusion::arrow::datatypes::SchemaRef; use 
datafusion::common::DataFusionError; +use datafusion::common::stats::Precision; use datafusion::physical_expr::Partitioning; /// Be able to display a nice tree for stages. /// @@ -373,7 +438,7 @@ fn gather_stage_header_metrics(stage: &Stage, metrics_store: &MetricsStore) -> M stage_id: stage.num() as u64, task_number: 0, }; - let mut all_metrics = MetricsSet::new(); + let mut all_metrics = stage.metrics(); while let Some(metrics_set) = metrics_store.get(&task_key).and_then(|v| v.task_metrics) { for mut metric in metrics_set.metrics { metric.labels.push(pb::Label { @@ -573,6 +638,7 @@ pub fn display_plan_graphviz(plan: Arc) -> Result { num: max_num + 1, plan: plan.clone(), tasks: 1, + metrics_set: MetricsSet::new(), }); all_stages.insert(0, &head_stage); diff --git a/src/work_unit_feed/remote_work_unit_feed.rs b/src/work_unit_feed/remote_work_unit_feed.rs index f914f228..1526508c 100644 --- a/src/work_unit_feed/remote_work_unit_feed.rs +++ b/src/work_unit_feed/remote_work_unit_feed.rs @@ -38,8 +38,18 @@ pub(crate) struct RemoteWorkUnitFeedRegistry { impl RemoteWorkUnitFeedRegistry { /// Creates all the receivers and senders for a specific [WorkUnit] Feed id. One feed per /// partition is created. + /// + /// Calling this twice with the same `id` is a coordinator bug — duplicate declarations + /// mean two plan nodes share a UUID, which would cause "already consumed" when both + /// nodes call `feed()`. We skip rather than overwrite so the coordinator-side duplicate + /// detection in `task_specialized_plan` surfaces the real error first. pub(crate) fn add(&mut self, id: Uuid, partitions: usize) { for partition in 0..partitions { + // Skip if already registered; overwriting would silently drop the existing + // receiver and cause a confusing "already consumed" error at execution time. 
+ if self.receivers.contains_key(&(id, partition)) { + continue; + } let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); self.receivers.insert((id, partition), Mutex::new(Some(rx))); self.senders.insert((id, partition), tx); diff --git a/src/worker/generated/worker.rs b/src/worker/generated/worker.rs index fe7a0137..290261ed 100644 --- a/src/worker/generated/worker.rs +++ b/src/worker/generated/worker.rs @@ -24,7 +24,7 @@ pub mod coordinator_to_worker_msg { } #[derive(Clone, PartialEq, ::prost::Message)] pub struct WorkerToCoordinatorMsg { - #[prost(oneof = "worker_to_coordinator_msg::Inner", tags = "1")] + #[prost(oneof = "worker_to_coordinator_msg::Inner", tags = "1, 2, 3")] pub inner: ::core::option::Option, } /// Nested message and enum types in `WorkerToCoordinatorMsg`. @@ -37,6 +37,12 @@ pub mod worker_to_coordinator_msg { /// metrics\[i\] is the set of metrics for plan node i in pre-order traversal order. #[prost(message, tag = "1")] TaskMetrics(super::TaskMetrics), + /// Load information reported by a task. This information is used for dynamically + /// sizing the number of workers involved in a query. + #[prost(message, tag = "2")] + LoadInfo(super::LoadInfo), + #[prost(bool, tag = "3")] + LoadInfoEos(bool), } } #[derive(Clone, PartialEq, ::prost::Message)] @@ -52,6 +58,34 @@ pub struct TaskMetrics { #[prost(message, optional, tag = "2")] pub task_metrics: ::core::option::Option, } +/// Load information reported for a specific partition with information about this +/// amount of data flowing through the plan. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct LoadInfo { + /// The partition index to which this message belongs to. + #[prost(uint64, tag = "1")] + pub partition: u64, + /// The amount of rows ready to be returned. + #[prost(uint64, tag = "2")] + pub rows_ready: u64, + /// The estimated velocity at which rows will flow through the node. 
If all the rows were + /// already accumulated, they will be reported by `rows_ready`, and this field will be 0. + #[prost(uint64, tag = "3")] + pub rows_per_second: u64, + /// The amount of bytes ready to be returned per column. + #[prost(uint64, repeated, tag = "4")] + pub per_column_bytes_ready: ::prost::alloc::vec::Vec, + /// The estimated velocity at which data will flow through each column. If all the bytes were + /// already accumulated, they will be reported by `bytes_ready`, and this field will be 0. + #[prost(uint64, repeated, tag = "5")] + pub per_column_bytes_per_second: ::prost::alloc::vec::Vec, + /// Approximate ratio of NDV for each column. + #[prost(float, repeated, tag = "6")] + pub per_column_ndv_percentage: ::prost::alloc::vec::Vec, + /// Approximate ratio of null count for each column. + #[prost(float, repeated, tag = "7")] + pub per_column_null_percentage: ::prost::alloc::vec::Vec, +} #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GetWorkerInfoRequest {} #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] diff --git a/src/worker/impl_coordinator_channel.rs b/src/worker/impl_coordinator_channel.rs index d1c2097f..fcb3944a 100644 --- a/src/worker/impl_coordinator_channel.rs +++ b/src/worker/impl_coordinator_channel.rs @@ -1,4 +1,5 @@ use crate::common::deserialize_uuid; +use crate::execution_plans::SamplerExec; use crate::work_unit_feed::{RemoteWorkUnitFeedRegistry, set_work_unit_received_time}; use crate::worker::LocalWorkerContext; use crate::worker::generated::worker::coordinator_to_worker_msg::Inner; @@ -17,6 +18,7 @@ use datafusion::execution::SessionStateBuilder; use datafusion::prelude::SessionConfig; use datafusion_proto::physical_plan::AsExecutionPlan; use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt, TryStreamExt}; use std::sync::atomic::AtomicUsize; use std::sync::{Arc, OnceLock}; @@ -55,6 +57,7 @@ impl Worker { } let (metrics_tx, 
metrics_rx) = oneshot::channel(); + let mut load_info_rxs = vec![]; let task_data = || async { let headers = grpc_headers.into_headers(); @@ -98,6 +101,8 @@ impl Worker { for hook in self.hooks.on_plan.iter() { plan = hook(plan, session_state.config())?; } + load_info_rxs = + SamplerExec::kick_off_first_sampler(Arc::clone(&plan), Arc::clone(&task_ctx))?; // Initialize partition count to the number of partitions in the stage let total_partitions = plan.properties().partitioning.partition_count(); @@ -172,19 +177,34 @@ impl Worker { tokio::spawn(async move { task_data_entries.invalidate(&key).await }); }); + let load_info_stream = FuturesUnordered::from_iter(load_info_rxs) + .filter_map(async |load_info_or_channel_dropped| { + // This error can only happen if the pb::LoadInfo sender was dropped, which is fine. + let load_info = load_info_or_channel_dropped.ok()?; + Some(Ok(WorkerToCoordinatorMsg { + inner: Some(worker_to_coordinator_msg::Inner::LoadInfo(load_info)), + })) + }) + .chain(futures::stream::once(async move { + Ok(WorkerToCoordinatorMsg { + inner: Some(worker_to_coordinator_msg::Inner::LoadInfoEos(true)), + }) + })); + // Stream back the metrics once the task finishes executing. // The oneshot receiver resolves when impl_execute_task sends the collected // metrics after all partitions have finished or been dropped. 
let metrics_stream = metrics_rx.into_stream(); - let metrics_stream = metrics_stream.filter_map(|task_metrics| async move { - match task_metrics { - Ok(task_metrics) => Some(WorkerToCoordinatorMsg { - inner: Some(worker_to_coordinator_msg::Inner::TaskMetrics(task_metrics)), - }), - Err(_) => None, // channel dropped without sending any message - } + let metrics_stream = metrics_stream.filter_map(async |task_metrics_or_channel_dropped| { + let task_metrics = task_metrics_or_channel_dropped.ok()?; + Some(Ok(WorkerToCoordinatorMsg { + inner: Some(worker_to_coordinator_msg::Inner::TaskMetrics(task_metrics)), + })) }); - Ok(Response::new(metrics_stream.map(Ok).boxed())) + + Ok(Response::new( + futures::stream::select(load_info_stream, metrics_stream).boxed(), + )) } } diff --git a/src/worker/task_data.rs b/src/worker/task_data.rs index 28f5ca5d..97b2e806 100644 --- a/src/worker/task_data.rs +++ b/src/worker/task_data.rs @@ -1,6 +1,7 @@ use crate::MaxLatencyMetric; -use crate::common::{OnceLockResult, now_ns}; -use crate::distributed_planner::{ProducerHead, insert_producer_head}; +use crate::common::OnceLockResult; +use crate::common::now_ns; +use crate::distributed_planner::ProducerHead; use crate::worker::generated::worker as pb; use datafusion::common::{DataFusionError, Result}; use datafusion::execution::TaskContext; @@ -134,7 +135,7 @@ impl TaskData { let producer_head = ProducerHead::from_proto(producer_head, &self.base_plan.schema(), &self.task_ctx)?; - let plan = insert_producer_head(Arc::clone(&self.base_plan), producer_head)?; + let plan = producer_head.insert(Arc::clone(&self.base_plan))?; self.num_partitions_remaining.store( plan.output_partitioning().partition_count(), diff --git a/src/worker/worker.proto b/src/worker/worker.proto index bc1e3412..ef691b59 100644 --- a/src/worker/worker.proto +++ b/src/worker/worker.proto @@ -33,6 +33,12 @@ message WorkerToCoordinatorMsg { // ensuring metrics are never lost due to early stream termination. 
// metrics[i] is the set of metrics for plan node i in pre-order traversal order. TaskMetrics task_metrics = 1; + + // Load information reported by a task. This information is used for dynamically + // sizing the number of workers involved in a query. + LoadInfo load_info = 2; + + bool load_info_eos = 3; } } @@ -47,6 +53,27 @@ message TaskMetrics { MetricsSet task_metrics = 2; } +// Load information reported for a specific partition with information about the +// amount of data flowing through the plan. +message LoadInfo { + // The partition index to which this message belongs. + uint64 partition = 1; + // The number of rows ready to be returned. + uint64 rows_ready = 2; + // The estimated velocity at which rows will flow through the node. If all the rows were + // already accumulated, they will be reported by `rows_ready`, and this field will be 0. + uint64 rows_per_second = 3; + // The amount of bytes ready to be returned per column. + repeated uint64 per_column_bytes_ready = 4; + // The estimated velocity at which data will flow through each column. If all the bytes were + // already accumulated, they will be reported by `per_column_bytes_ready`, and this field will be 0. + repeated uint64 per_column_bytes_per_second = 5; + // Approximate ratio of NDV for each column. + repeated float per_column_ndv_percentage = 6; + // Approximate ratio of null count for each column. 
+ repeated float per_column_null_percentage = 7; +} + message GetWorkerInfoRequest {} message GetWorkerInfoResponse { diff --git a/tests/clickbench_correctness_test.rs b/tests/clickbench_correctness_test.rs index 9df2a8d3..acf3daa8 100644 --- a/tests/clickbench_correctness_test.rs +++ b/tests/clickbench_correctness_test.rs @@ -18,6 +18,7 @@ mod tests { use std::sync::Arc; use tokio::sync::OnceCell; + const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE"; const NUM_WORKERS: usize = 4; const PARTITIONS: usize = 3; const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1; @@ -289,12 +290,15 @@ mod tests { .options_mut() .execution .target_partitions = PARTITIONS; - let d_ctx = d_ctx + let mut d_ctx = d_ctx .with_distributed_file_scan_config_bytes_per_partition( FILE_SCAN_CONFIG_BYTES_PER_PARTITION, )? .with_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)? .with_distributed_broadcast_joins(true)?; + if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" { + d_ctx.set_distributed_dynamic_task_count(true)?; + } register_tables(&s_ctx, &data_dir).await?; register_tables(&d_ctx, &data_dir).await?; diff --git a/tests/metrics_collection.rs b/tests/metrics_collection.rs index ccb43a57..40d05532 100644 --- a/tests/metrics_collection.rs +++ b/tests/metrics_collection.rs @@ -6,6 +6,7 @@ mod tests { use datafusion::common::{Result, assert_contains}; use datafusion::execution::SessionState; use datafusion::physical_plan::display::DisplayableExecutionPlan; + use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::{ExecutionPlan, execute_stream}; use datafusion::prelude::SessionContext; use datafusion_distributed::test_utils::localhost::start_localhost_context; @@ -341,6 +342,37 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_metrics_collection_dynamic() -> Result<(), Box> { + let (mut d_ctx, _guard, _) = start_localhost_context(3, DefaultSessionBuilder).await; + 
d_ctx.set_distributed_dynamic_task_count(true)?; + + let query = + r#"SELECT count(*), "RainToday" FROM weather GROUP BY "RainToday" ORDER BY count(*)"#; + + let s_ctx = SessionContext::default(); + let (s_physical, mut d_physical) = execute(&s_ctx, &d_ctx, query).await?; + d_physical = rewrite_with_metrics(d_physical, DistributedMetricsFormat::Aggregated).await; + println!("{}", display_plan_ascii(s_physical.as_ref(), true)); + println!("{}", display_plan_ascii(d_physical.as_ref(), true)); + + assert_metrics_equal::( + ["output_rows", "output_bytes"], + &s_physical, + &d_physical, + 0, + ); + + assert_metrics_equal::( + ["output_rows", "output_bytes"], + &s_physical, + &d_physical, + 0, + ); + + Ok(()) + } + /// Looks for an [ExecutionPlan] that matches the provided type parameter `T1` in /// the left node and `T2` in the right node and compares its metrics. /// There might be more than one, so `index` determines which one is compared. diff --git a/tests/stateful_data_cleanup.rs b/tests/stateful_data_cleanup.rs index a3fe7bca..fae892d2 100644 --- a/tests/stateful_data_cleanup.rs +++ b/tests/stateful_data_cleanup.rs @@ -20,13 +20,18 @@ mod tests { const TPCH_DATA_PARTS: usize = 16; const CARDINALITY_TASK_COUNT_FACTOR: f64 = 1.0; - #[test_case(false; "metrics_disabled")] - #[test_case(true; "metrics_enabled")] + #[test_case((false, false); "metrics_disabled_static_planner")] + #[test_case((true, false); "metrics_enabled_static_planner")] + #[test_case((false, true); "metrics_disabled_dynamic_planner")] + #[test_case((true, true); "metrics_enabled_dynamic_planner")] #[tokio::test(flavor = "multi_thread")] - async fn no_pending_tasks_if_dynamic_query_completes(collect_metrics: bool) -> Result<()> { + async fn no_pending_tasks_if_dynamic_query_completes( + (collect_metrics, adaptive): (bool, bool), + ) -> Result<()> { let (mut d_ctx, _guard, workers) = start_localhost_context(NUM_WORKERS, DefaultSessionBuilder).await; 
d_ctx.set_distributed_metrics_collection(collect_metrics)?; + d_ctx.set_distributed_dynamic_task_count(adaptive)?; run_tpch_query(d_ctx, "q1").await?; @@ -35,10 +40,18 @@ mod tests { Ok(()) } + #[test_case((false, false); "metrics_disabled_static_planner")] + #[test_case((true, false); "metrics_enabled_static_planner")] + #[test_case((false, true); "metrics_disabled_dynamic_planner")] + #[test_case((true, true); "metrics_enabled_dynamic_planner")] #[tokio::test(flavor = "multi_thread")] - async fn no_pending_tasks_if_query_aborts() -> Result<()> { - let (d_ctx, _guard, workers) = + async fn no_pending_tasks_if_query_aborts( + (collect_metrics, adaptive): (bool, bool), + ) -> Result<()> { + let (mut d_ctx, _guard, workers) = start_localhost_context(NUM_WORKERS, DefaultSessionBuilder).await; + d_ctx.set_distributed_metrics_collection(collect_metrics)?; + d_ctx.set_distributed_dynamic_task_count(adaptive)?; let _ = timeout(Duration::from_millis(100), run_tpch_query(d_ctx, "q1")).await; diff --git a/tests/tpcds_correctness_test.rs b/tests/tpcds_correctness_test.rs index bb6011a7..e4baeba9 100644 --- a/tests/tpcds_correctness_test.rs +++ b/tests/tpcds_correctness_test.rs @@ -18,6 +18,7 @@ mod tests { use std::sync::Arc; use tokio::sync::OnceCell; + const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE"; const NUM_WORKERS: usize = 4; const PARTITIONS: usize = 3; const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1; @@ -575,12 +576,15 @@ mod tests { .options_mut() .execution .target_partitions = PARTITIONS; - let d_ctx = d_ctx + let mut d_ctx = d_ctx .with_distributed_file_scan_config_bytes_per_partition( FILE_SCAN_CONFIG_BYTES_PER_PARTITION, )? .with_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)? 
.with_distributed_broadcast_joins(true)?; + if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" { + d_ctx.set_distributed_dynamic_task_count(true)?; + } register_tables(&s_ctx, &data_dir).await?; register_tables(&d_ctx, &data_dir).await?; diff --git a/tests/tpch_correctness_test.rs b/tests/tpch_correctness_test.rs index 3b6bc5a4..3ae089dc 100644 --- a/tests/tpch_correctness_test.rs +++ b/tests/tpch_correctness_test.rs @@ -12,6 +12,7 @@ mod tests { use std::path::Path; use tokio::sync::OnceCell; + const ADAPTIVE_ENV_VAR: &str = "ADAPTIVE"; const NUM_WORKERS: usize = 4; const PARTITIONS: usize = 6; const FILE_SCAN_CONFIG_BYTES_PER_PARTITION: usize = 1; @@ -139,7 +140,10 @@ mod tests { // in a non-distributed manner. For each query, it asserts that the results are identical. async fn test_tpch_query(sql: String) -> Result<(), Box> { let d_ctx = start_in_memory_context(NUM_WORKERS, DefaultSessionBuilder).await; - let d_ctx = d_ctx.with_distributed_broadcast_joins(true)?; + let mut d_ctx = d_ctx.with_distributed_broadcast_joins(true)?; + if std::env::var(ADAPTIVE_ENV_VAR).unwrap_or_default() == "true" { + d_ctx.set_distributed_dynamic_task_count(true)?; + } let d_ctx = d_ctx .with_distributed_file_scan_config_bytes_per_partition(