From 17ead1272c66a606de16233e5af456abbf2bfaf2 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru
Date: Sun, 7 Jun 2026 11:37:20 +0200
Subject: [PATCH] ptx: use char counts for before-chunk sizing in get_output_chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The max_before_size assert compared against before.len() (byte length)
while max_before_size is measured in chars, panicking on multibyte input
like 'éé word'. The tail-chunk budget (max_tail_size) had the same
byte/char mismatch, shrinking the tail too much and dropping a word that
fits. Use char counts in both places, matching the after chunk.

Fixes #10893
---
 src/uu/ptx/src/ptx.rs     |  2 +-
 tests/by-util/test_ptx.rs | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs
index 28d5e880a4c..8b57cd7e59a 100644
--- a/src/uu/ptx/src/ptx.rs
+++ b/src/uu/ptx/src/ptx.rs
@@ -522,7 +522,7 @@ fn get_output_chunks(
 
     // max size of the tail chunk = max size of left half - space taken by before chunk - gap size.
     let max_tail_size = cmp::max(
-        max_before_size as isize - before.len() as isize - config.gap_size as isize,
+        max_before_size as isize - before.chars().count() as isize - config.gap_size as isize,
         0,
     ) as usize;
 
diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs
index 4752d924e0c..2cfa1def906 100644
--- a/tests/by-util/test_ptx.rs
+++ b/tests/by-util/test_ptx.rs
@@ -365,6 +365,33 @@ fn test_unicode_in_after_chunk_does_not_panic() {
         .stdout_contains("We've got +11");
 }
 
+#[test]
+fn test_unicode_in_before_chunk_does_not_panic() {
+    // Regression test for issue #10893: a panic in get_output_chunks() when the
+    // computed max_before_size used char counts but the assert compared against
+    // before.len() (byte length). A multibyte char in the before chunk could
+    // trigger: `assertion failed: max_before_size >= before.len()`.
+    new_ucmd!()
+        .args(&["-w", "10"])
+        .pipe_in("éé word\n")
+        .succeeds()
+        .no_stderr();
+}
+
+#[test]
+fn test_unicode_tail_chunk_sizing() {
+    // The tail chunk budget (max_tail_size) subtracts the size of the before
+    // chunk. It must use the before chunk's char count, not its byte length,
+    // otherwise a multibyte before chunk shrinks the tail too much and drops a
+    // word that fits. Here "cc" wraps into the tail before "aé bé KEY"; with the
+    // byte-based budget it was dropped.
+    new_ucmd!()
+        .args(&["-w", "20"])
+        .pipe_in("aé bé KEY cc dd ee ff gg\n")
+        .succeeds()
+        .stdout_contains("cc/ aé");
+}
+
 #[test]
 fn test_duplicate_input_files() {
     new_ucmd!()