Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/voxcpm/utils/text_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,14 @@ def replace_blank(text: str):
out_str = []
for i, c in enumerate(text):
if c == " ":
if (text[i + 1].isascii() and text[i + 1] != " ") and (text[i - 1].isascii() and text[i - 1] != " "):
# Keep a blank only when it sits between two ASCII non-space
# characters. Guard the neighbour lookups so a leading space
# (i == 0) does not wrap around to text[-1] and a trailing
# space (i == len(text) - 1) does not raise IndexError. This
# mirrors the bounds check already used in split_paragraph().
prev_ok = i > 0 and text[i - 1].isascii() and text[i - 1] != " "
next_ok = i + 1 < len(text) and text[i + 1].isascii() and text[i + 1] != " "
if prev_ok and next_ok:
Comment on lines +120 to +122
out_str.append(c)
else:
out_str.append(c)
Expand Down
59 changes: 59 additions & 0 deletions tests/test_text_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

import importlib.util
import sys
import types
from pathlib import Path

# Parent of the directory that contains this file (parents[0] is the
# containing directory, parents[1] is one level above it).
ROOT = Path(__file__).resolve().parents[1]
# Location of the source file that defines the function under test.
TEXT_NORMALIZE_PATH = ROOT.joinpath("src", "voxcpm", "utils", "text_normalize.py")

# Register empty placeholder modules for the third-party imports so the module
# under test can be loaded without them. Only ``replace_blank`` is exercised
# here, and it relies on nothing beyond the standard library. A module that is
# already present in ``sys.modules`` is left untouched.
for _name in ("regex", "inflect"):
    if _name not in sys.modules:
        sys.modules[_name] = types.ModuleType(_name)

# ``wetext`` additionally needs a ``Normalizer`` attribute on the placeholder;
# ``object`` stands in for it.
_wetext_stub = types.ModuleType("wetext")
_wetext_stub.Normalizer = object
if "wetext" not in sys.modules:
    sys.modules["wetext"] = _wetext_stub
Comment on lines +11 to +18

# Load the module straight from its source path under the dotted name
# "voxcpm.utils.text_normalize".
spec = importlib.util.spec_from_file_location("voxcpm.utils.text_normalize", TEXT_NORMALIZE_PATH)
# spec_from_file_location() may return None, and a spec may carry no loader.
# Validate both *before* the spec is used, so a failure is reported here
# rather than as an opaque AttributeError inside module_from_spec().
assert spec is not None and spec.loader is not None
text_normalize = importlib.util.module_from_spec(spec)
# Execute the module body so its top-level definitions become attributes.
spec.loader.exec_module(text_normalize)
Comment on lines +20 to +23

# Short module-level alias for the function exercised by the tests below.
replace_blank = text_normalize.replace_blank


def test_replace_blank_handles_trailing_space():
    """A blank in the final position is dropped instead of raising."""
    # Regression guard: a trailing space has no right-hand neighbour. The
    # previous implementation read text[i + 1] unconditionally, which raised
    # IndexError for such input.
    expectations = [
        ("hello ", "hello"),
        ("\u4e2d\u6587 ", "\u4e2d\u6587"),
        ("a b ", "a b"),
    ]
    for raw, expected in expectations:
        assert replace_blank(raw) == expected


def test_replace_blank_handles_leading_space():
    """A blank in the first position is dropped."""
    # Regression guard: a leading space has no left-hand neighbour. The
    # previous implementation evaluated text[i - 1] with i == 0, which wraps
    # around to text[-1] (the last character) and could spuriously keep the
    # leading blank.
    expectations = [
        (" ab", "ab"),
        (" a", "a"),
    ]
    for raw, expected in expectations:
        assert replace_blank(raw) == expected


def test_replace_blank_keeps_space_between_ascii():
    """A blank between two ASCII non-space characters is preserved."""
    # Every input below has its only blank flanked by ASCII non-space
    # characters, so the text must come back unchanged.
    for unchanged in ("a b", "x 1", "hello world"):
        assert replace_blank(unchanged) == unchanged


def test_replace_blank_drops_space_around_cjk():
    """A blank with a non-ASCII neighbour is removed; ASCII-only gaps stay."""
    expectations = [
        # Blank between two CJK characters: dropped.
        ("\u4e2d \u6587", "\u4e2d\u6587"),
        # Blank after CJK is dropped; the later blank between ASCII words stays.
        ("\u4f60\u597d world ok", "\u4f60\u597dworld ok"),
    ]
    for raw, expected in expectations:
        assert replace_blank(raw) == expected


def test_replace_blank_empty_string():
    """An empty input produces an empty output."""
    normalized = replace_blank("")
    assert normalized == ""