From 5bea219b6a883fa70fd7eab56f0f4a1cb973f89c Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 17 Jun 2026 22:54:07 +0800 Subject: [PATCH] [SPARK-57507][SQL] Fix UTF8String.reverse reading past the end of a truncated trailing UTF-8 sequence ### What changes were proposed in this pull request? `UTF8String.reverse()` reverses a string one UTF-8 character at a time, using `numBytesForFirstByte(getByte(i))` to determine the width of the character at byte position `i`. The per-character copy length was clamped to `numBytes` (the total length) instead of `numBytes - i` (the bytes that actually remain). When the last bytes of the string are a truncated multi-byte sequence (a leader byte whose declared width exceeds the remaining bytes), `copyMemory` read past the end of the string into adjacent memory. This changes the clamp to `numBytes - i`. ### Why are the changes needed? `UTF8String` can hold malformed UTF-8 (for example, bytes produced by binary coercion or truncated input). For such a string ending in an incomplete multi-byte sequence, `reverse()` performed an out-of-bounds read and produced a wrong result. Every other byte scan in the class already bounds by the remaining bytes; this brings `reverse()` in line. Well-formed UTF-8 is unaffected, since a complete sequence never exceeds the remaining bytes. ### Does this PR introduce _any_ user-facing change? Yes, it fixes incorrect results. The SQL `reverse()` function on a string value that contains malformed UTF-8 ending in a truncated multi-byte sequence no longer reads past the end of the value; only previously-incorrect results change. ### How was this patch tested? Added cases to `UTF8StringSuite#reverse()` for truncated trailing 2-, 3-, and 4-byte leaders, and for a complete multi-byte character followed by an orphan leader. Each uses a sliced backing array with a trailing sentinel byte so the previous over-read produces a deterministically wrong value; the cases fail on the old code and pass with the fix. --- .../apache/spark/unsafe/types/UTF8String.java | 2 +- .../spark/unsafe/types/UTF8StringSuite.java | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index ab57e308ee4d7..06ec4af7a5f27 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1157,7 +1157,7 @@ public UTF8String reverse() { int i = 0; // position in byte while (i < numBytes) { - int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes); + int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes - i); int targetOffset = Math.max(result.length - i - len, 0); copyMemory(this.base, this.offset + i, result, BYTE_ARRAY_OFFSET + targetOffset, len); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 26b96155377e8..a9f365173be64 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -325,6 +325,30 @@ public void reverse() { assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse()); assertEquals(fromString("者行孙"), fromString("孙行者").reverse()); assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse()); + // Malformed UTF-8: a truncated trailing multi-byte sequence must be reversed as orphan + // bytes without reading past the end of the string. The backing arrays carry an extra + // trailing byte so a regression that over-reads would produce a deterministically wrong + // result rather than reading uninitialized memory. + // 'A' followed by an incomplete 2-byte leader (0xCE). + byte[] truncated2 = new byte[]{0x41, (byte) 0xCE, 0x42}; + assertEquals( + fromBytes(new byte[]{(byte) 0xCE, 0x41}), + fromBytes(truncated2, 0, 2).reverse()); + // 'A' followed by an incomplete 3-byte leader (0xE4 0xB8). + byte[] truncated3 = new byte[]{0x41, (byte) 0xE4, (byte) 0xB8, 0x42}; + assertEquals( + fromBytes(new byte[]{(byte) 0xE4, (byte) 0xB8, 0x41}), + fromBytes(truncated3, 0, 3).reverse()); + // 'A' followed by an incomplete 4-byte leader (0xF0 0x90). + byte[] truncated4 = new byte[]{0x41, (byte) 0xF0, (byte) 0x90, 0x42}; + assertEquals( + fromBytes(new byte[]{(byte) 0xF0, (byte) 0x90, 0x41}), + fromBytes(truncated4, 0, 3).reverse()); + // A complete 3-byte character (U+4E16) followed by an incomplete 2-byte leader (0xCE). + byte[] truncatedMid = new byte[]{(byte) 0xE4, (byte) 0xB8, (byte) 0x96, (byte) 0xCE, 0x42}; + assertEquals( + fromBytes(new byte[]{(byte) 0xCE, (byte) 0xE4, (byte) 0xB8, (byte) 0x96}), + fromBytes(truncatedMid, 0, 4).reverse()); } @Test