From cf580845766f8d92ca8274748aeb0a2bc31bca30 Mon Sep 17 00:00:00 2001 From: Stefano Di Martino Date: Sun, 14 Jun 2026 20:22:07 +0200 Subject: [PATCH] Fix font fallback for compound/ZWJ-joined emoji (#861) Compound emoji that rely on automatic font fallback (i.e. that are not explicitly wrapped in an element such as `<tspan>` with an emoji font) were dropped entirely. This affected flags (regional-indicator pairs such as the UK flag) and ZWJ sequences (rainbow flag, family, ...). Emoji-only text and explicitly tagged emoji worked, which is why the existing tests never caught it. Root cause ---------- shape_text resolved missing glyphs by re-shaping the *whole* text with a fallback font and then merging the two glyph lists. The merge required both shapings to have the same number of glyphs (`glyphs.len() != fallback_glyphs.len() => break`) and copied glyphs one-by-one by index. Compound emoji ligate several code points into a single glyph in the emoji font, so the glyph counts differ and the whole fallback was abandoned, leaving the emoji unrendered. The per-index assumption was wrong for a second reason: default-ignorable code points (U+FE0F, U+200D) are turned into hidden space glyphs by the shaper, so the .notdef glyphs of one emoji are not even contiguous. Example: "Hi\u{1F3F3}\u{FE0F}\u{200D}\u{1F308}there" (the rainbow-flag sequence between "Hi" and "there") shaped with Noto Sans (primary) produced (glyph_id, text): (H), (i), (0,"\u{1F3F3}"), (3,"\u{FE0F}"), (3,"\u{200D}"), (0,"\u{1F308}"), (t), ... -> id 0 = .notdef (U+1F3F3 / U+1F308), id 3 = hidden space (U+FE0F / U+200D) Before: Hi[][]there (flag dropped, two .notdef boxes) After: Hi\u{1F3F3}\u{FE0F}\u{200D}\u{1F308}there (single ligated glyph from the emoji font) Fix --- Replace the index-based merge with a cluster-based one (merge_fallback_glyphs): the text is only cut at cluster boundaries shared by *both* shapings, and whole clusters are replaced when the primary font could not resolve them. 
This implements the existing `// TODO: Replace clusters and not glyphs` and is correct regardless of how many glyphs each font produces, for both LTR and RTL/BIDI runs (BIDI run boundaries are always shared cluster boundaries). Tests ----- Adds the usvg test `compound_emoji_font_fallback`, which fails before and passes after the fix. The full suite (1724 render tests) stays green except for one reference image (see below). Updated reference image: tests/text/direction/rtl.png ------------------------------------------------------ This is the only one of the 1724 render tests whose output changes (80 pixels differ). The test mixes Arabic (Noto Sans -> Amiri fallback) with the Latin word "SVG", i.e. a two-stage fallback where neither font covers everything. With the old index merge the fallback glyphs were spliced into the primary shaping's structure, carrying over advances position-by-position; the new cluster merge takes Amiri's own advances for the whole fallback cluster, which shifts the mixed run by a sub-pixel amount. The change is a refinement, not a regression: - the glyph shapes are identical and match Chrome's rendering of the same fonts (verified visually); - the isolated Arabic word renders byte-for-byte identically before and after; only the mixed Arabic+Latin advances differ slightly. Because tiny-skia rasterizes deterministically, the regenerated reference is stable across platforms/CI, so the strict pixel comparison keeps working. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../resvg/tests/tests/text/direction/rtl.png | Bin 933 -> 929 bytes crates/usvg/src/text/layout.rs | 109 ++++++++++++++---- crates/usvg/tests/parser.rs | 69 +++++++++++ 3 files changed, 156 insertions(+), 22 deletions(-) diff --git a/crates/resvg/tests/tests/text/direction/rtl.png b/crates/resvg/tests/tests/text/direction/rtl.png index 1d0416d2526b200c5b5eb65eb88884996157e7b0..ef52b109b26117ebb57a583af8e0b16840414806 100644 GIT binary patch delta 537 zcmZ3=zL0%_3Tt(MPl)S8L*a=IEE63x*sQFq7#J8P=BZ4aDpb!kT`362(Q^O{=Ci(* zej4caJal5>*4=wO-@lT79I^DXUv0$NRUJlq%`9z1{&tt#D`H!H_lfNVtL3{k&A%1< zCis;s9OJ{}7|M6_^ zy|Zt10U^JFt?MER;u^@@wQuYLIb<%i9mRNGj-+3S?bzQ0`aN#yqJ zsP(txR`EqVzJK}g=O5FzO6=EPcRV@&@cXdXCEs0tm8q_|o@bw=dwKgVZL`@zVP2bk zvsRhiFWJ#?`Q?O!r6tm<1K*!LYcuU-^wZS+YqB=ItnZ)df6>I-K0`qI`*HuHG9G4g$k#8kiK*0kIJwSeFfyo%PLXNoMm^Ls3^ft7U)0+^aoI UyQj3g1;#Rir>mdKI;Vst0MSbI0ssI2 delta 540 zcmZ3;zLb4}3Ts_}Pl)S8L*a=IEE63x*cccXtgNgi=BZ5V7p>1zJ`)BM&@2h^3uX{d zFtB&<4~U45Pe{lwD6XxmZ|I*earW#v^B1h&uwlpk1LrSXzyIL<2a{jA^$ZM5Ydu{Y zLn>~)y}>$3$x*=JqRs-g?5fl|v7i2l$LOdfY;nGJ-~JG{XQxrXajz?HX1tPFz4leW z?jx%%Dqj81%`K)Au^~Z*6HFZx0&xyDse(yXZy;OBKXI}kv-jizMt0_3-Ww+eGFsP% z@rONc+r8o5Iv=k+8Q-O@O*{TQ!nEeh@w%LF;cISx&&26u$$fjhp;!E=z%{qCQZIHL z*>lfe+b+A_8~PjKW~3S~w>oLI?N)uyj{Jn&Yu$;@{w6=m)i=}rZQ1iTHbVOAd+&U` zuO+wopWnKf@chZfojc^uB8pGyMmj?elxkFS6E{xzRp?)qakcNeeyn)}Cg_3&$Lc=pyVId+eJ zX;3kL*pZjMIHCh^Ly<=N?nPx@SxJ zS3RSAf9d%_`Kb|;A2C${!$aJK1rm6eVfXhMFc4X}#n#+h;5 break 'outer, }; - // Shape again, using a new font. + // Shape the whole text again, using the new font. let fallback_glyphs = shape_text_with_font( text, fallback_font.clone(), @@ -1349,27 +1349,10 @@ pub(crate) fn shape_text( ) .unwrap_or_default(); - let all_matched = fallback_glyphs.iter().all(|g| !g.is_missing()); - if all_matched { - // Replace all glyphs when all of them were matched. 
- glyphs = fallback_glyphs; - break 'outer; - } - - // We assume, that shaping with an any font will produce the same amount of glyphs. - // This is incorrect, but good enough for now. - if glyphs.len() != fallback_glyphs.len() { - break 'outer; - } - - // TODO: Replace clusters and not glyphs. This should be more accurate. - - // Copy new glyphs. - for i in 0..glyphs.len() { - if glyphs[i].is_missing() && !fallback_glyphs[i].is_missing() { - glyphs[i] = fallback_glyphs[i].clone(); - } - } + // Merge the newly shaped glyphs into the current ones, replacing + // every text cluster that is still missing and that the fallback + // font is able to resolve. + merge_fallback_glyphs(&mut glyphs, &fallback_glyphs, text); // Remember this font. used_fonts.push(fallback_font.id); @@ -1394,6 +1377,88 @@ pub(crate) fn shape_text( glyphs } +/// Merges fallback glyphs into the base glyphs. +/// +/// Both `base` and `fallback` are the result of shaping the same `text` with two +/// different fonts. Every text cluster that is still missing (`.notdef`) in +/// `base` is replaced with the corresponding glyphs from `fallback`, but only if +/// the fallback font is able to resolve that whole cluster. +/// +/// The two shapings can disagree on cluster boundaries. The most important case +/// are multi-codepoint emoji (flags and other ZWJ sequences): the primary font +/// produces one `.notdef` glyph per codepoint, while the emoji font ligates the +/// whole sequence into a single glyph. To merge them correctly we cut the text +/// only at boundaries shared by *both* shapings and replace whole clusters at a +/// time, instead of trying to align the two glyph lists one by one (which fails +/// as soon as they have a different length). +fn merge_fallback_glyphs(base: &mut Vec, fallback: &[Glyph], text: &str) { + if fallback.is_empty() || base.iter().all(|g| !g.is_missing()) { + return; + } + + // Byte positions at which a cluster starts. 
A position that is a cluster + // boundary in *both* shapings can be used to splice glyphs without ever + // splitting a ligature. + let base_bounds: HashSet = base.iter().map(|g| g.byte_idx.value()).collect(); + let mut bounds: Vec = fallback + .iter() + .map(|g| g.byte_idx.value()) + .filter(|b| base_bounds.contains(b)) + .collect(); + bounds.push(0); + bounds.push(text.len()); + bounds.sort_unstable(); + bounds.dedup(); + + // Returns the `[start, end)` shared-boundary segment that `byte` falls into. + let segment_of = |byte: usize| -> (usize, usize) { + let start = bounds + .iter() + .rev() + .copied() + .find(|&b| b <= byte) + .unwrap_or(0); + let end = bounds + .iter() + .copied() + .find(|&b| b > byte) + .unwrap_or(text.len()); + (start, end) + }; + + let mut result = Vec::with_capacity(base.len()); + let mut i = 0; + while i < base.len() { + let segment = segment_of(base[i].byte_idx.value()); + + // Collect the whole run of base glyphs belonging to this segment. The + // glyphs of a single cluster are always adjacent (in both LTR and RTL + // visual order), so this run is contiguous. + let run_start = i; + while i < base.len() && segment_of(base[i].byte_idx.value()) == segment { + i += 1; + } + + if base[run_start..i].iter().any(|g| g.is_missing()) { + let fallback_run: Vec = fallback + .iter() + .filter(|g| segment_of(g.byte_idx.value()) == segment) + .cloned() + .collect(); + + // Only replace the cluster if the fallback font resolved all of it. + if !fallback_run.is_empty() && fallback_run.iter().all(|g| !g.is_missing()) { + result.extend(fallback_run); + continue; + } + } + + result.extend_from_slice(&base[run_start..i]); + } + + *base = result; +} + /// Converts a text into a list of glyph IDs. /// /// This function will do the BIDI reordering and text shaping. 
diff --git a/crates/usvg/tests/parser.rs b/crates/usvg/tests/parser.rs index 905bbc0db..481e2d15e 100644 --- a/crates/usvg/tests/parser.rs +++ b/crates/usvg/tests/parser.rs @@ -601,3 +601,72 @@ fn flattened_text_should_inherit_absolute_transform() { path.abs_bounding_box() ); } + +#[test] +fn compound_emoji_font_fallback() { + // https://github.com/linebender/resvg/issues/861 + // + // A compound (ZWJ-joined) emoji that is not present in the primary font must + // be resolved through font fallback. This is tricky because the fallback + // font shapes the multi-codepoint sequence into a single ligated glyph, + // while the primary font produces one (`.notdef`) glyph per codepoint. The + // old merging logic bailed out whenever the two shapings had a different + // number of glyphs, which dropped the emoji entirely. + // + // U+1F3F3 U+FE0F U+200D U+1F308 is the "rainbow flag" emoji. + let svg = " + + Hi\u{1F3F3}\u{FE0F}\u{200D}\u{1F308}there + + "; + + // Load exactly two fonts so that the fallback target is deterministic: + // Latin text comes from Noto Sans, the emoji can only come from Twitter + // Color Emoji. + let fonts_dir = env!("CARGO_MANIFEST_DIR").to_string() + "/../resvg/tests/fonts"; + let mut opts = usvg::Options::default(); + opts.fontdb_mut() + .load_font_file(format!("{fonts_dir}/NotoSans-Regular.ttf")) + .unwrap(); + opts.fontdb_mut() + .load_font_file(format!("{fonts_dir}/TwitterColorEmoji.subset.ttf")) + .unwrap(); + opts.font_family = "Noto Sans".to_string(); + + let tree = usvg::Tree::from_str(svg, &opts).unwrap(); + + let usvg::Node::Text(text) = &tree.root().children()[0] else { + unreachable!() + }; + + let glyphs: Vec<_> = text + .layouted() + .iter() + .flat_map(|span| span.positioned_glyphs.iter()) + .collect(); + + // No glyph may be `.notdef` (glyph id 0): the emoji must be resolved, not + // dropped. 
+ assert!( + glyphs.iter().all(|g| g.id.0 != 0), + "text contains unresolved (.notdef) glyphs: {:?}", + glyphs + .iter() + .map(|g| (g.id.0, g.text.clone())) + .collect::>() + ); + + // The Latin text is shaped with the primary font; the whole emoji sequence + // must collapse into a single ligated glyph taken from the fallback font. + let primary_font = glyphs[0].font; + let fallback_glyphs: Vec<_> = glyphs.iter().filter(|g| g.font != primary_font).collect(); + assert_eq!( + fallback_glyphs.len(), + 1, + "expected the ZWJ emoji to be a single ligated glyph from the fallback font, got {:?}", + glyphs + .iter() + .map(|g| (g.id.0, g.text.clone())) + .collect::>() + ); +}