apache · LuciferYang · Jun 17, 2026 · cloud-fan · Jun 17, 2026 · LuciferYang
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -1157,7 +1157,7 @@ public UTF8String reverse() {
 
     int i = 0; // position in byte
     while (i < numBytes) {
-      int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes);
+      int len = Math.min(numBytesForFirstByte(getByte(i)), numBytes - i);
       int targetOffset = Math.max(result.length - i - len, 0);
       copyMemory(this.base, this.offset + i, result,
         BYTE_ARRAY_OFFSET + targetOffset, len);

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -325,6 +325,30 @@ public void reverse() {
     assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
     assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
     assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
+    // Malformed UTF-8: a truncated trailing multi-byte sequence must be reversed as orphan
+    // bytes without reading past the end of the string. The backing arrays carry an extra
+    // trailing byte so a regression that over-reads would produce a deterministically wrong
+    // result rather than reading uninitialized memory.
+    // 'A' followed by an incomplete 2-byte leader (0xCE).
+    byte[] truncated2 = new byte[]{0x41, (byte) 0xCE, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xCE, 0x41}),
+      fromBytes(truncated2, 0, 2).reverse());
+    // 'A' followed by a truncated 3-byte sequence (leader 0xE4, continuation 0xB8).
+    byte[] truncated3 = new byte[]{0x41, (byte) 0xE4, (byte) 0xB8, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xE4, (byte) 0xB8, 0x41}),
+      fromBytes(truncated3, 0, 3).reverse());
+    // 'A' followed by a truncated 4-byte sequence (leader 0xF0, continuation 0x90).
+    byte[] truncated4 = new byte[]{0x41, (byte) 0xF0, (byte) 0x90, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xF0, (byte) 0x90, 0x41}),
+      fromBytes(truncated4, 0, 3).reverse());
+    // A complete 3-byte character (U+4E16) followed by an incomplete 2-byte leader (0xCE).
+    byte[] truncatedMid = new byte[]{(byte) 0xE4, (byte) 0xB8, (byte) 0x96, (byte) 0xCE, 0x42};
+    assertEquals(
+      fromBytes(new byte[]{(byte) 0xCE, (byte) 0xE4, (byte) 0xB8, (byte) 0x96}),
+      fromBytes(truncatedMid, 0, 4).reverse());
   }
 
   @Test