diff --git a/src/voxcpm/utils/text_normalize.py b/src/voxcpm/utils/text_normalize.py
index 423a173b..f5328e1f 100644
--- a/src/voxcpm/utils/text_normalize.py
+++ b/src/voxcpm/utils/text_normalize.py
@@ -112,7 +112,18 @@ def replace_blank(text: str):
     out_str = []
     for i, c in enumerate(text):
         if c == " ":
-            if (text[i + 1].isascii() and text[i + 1] != " ") and (text[i - 1].isascii() and text[i - 1] != " "):
+            # Keep a space only when it sits between two ASCII word characters.
+            # Guard the neighbour lookups: a trailing space would make text[i + 1]
+            # raise IndexError, and a leading space would make text[i - 1] wrap
+            # around to the last character. Edge spaces are not between two
+            # words, so they are dropped.
+            if (
+                0 < i < len(text) - 1
+                and text[i + 1].isascii()
+                and text[i + 1] != " "
+                and text[i - 1].isascii()
+                and text[i - 1] != " "
+            ):
                 out_str.append(c)
         else:
             out_str.append(c)
diff --git a/tests/test_text_normalize.py b/tests/test_text_normalize.py
new file mode 100644
index 00000000..a047bb43
--- /dev/null
+++ b/tests/test_text_normalize.py
@@ -0,0 +1,18 @@
+from voxcpm.utils.text_normalize import replace_blank
+
+
+def test_replace_blank_keeps_interior_ascii_space():
+    assert replace_blank("a b") == "a b"
+
+
+def test_replace_blank_drops_edge_spaces():
+    # A space is only kept between two ASCII word characters. A trailing space
+    # used to raise IndexError (text[i + 1]) and a leading space was wrongly
+    # kept (text[i - 1] wrapping to the last character); both are now dropped.
+    assert replace_blank("hello ") == "hello"
+    assert replace_blank(" hello") == "hello"
+    assert replace_blank("a b ") == "a b"
+
+
+def test_replace_blank_drops_space_adjacent_to_non_ascii():
+    assert replace_blank("中 文") == "中文"