Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,8 @@ public int getChar(int charIndex) {

/**
* Returns the code point starting from the byte at position `byteIndex`.
* If byte index is invalid, throws exception.
* If byte index is invalid, throws exception. If the sequence is truncated (the leader byte
* declares more bytes than remain), the missing continuation bytes are treated as 0.
*/
public int codePointFrom(int byteIndex) {
Objects.checkIndex(byteIndex, numBytes);
Expand All @@ -733,18 +734,28 @@ public int codePointFrom(int byteIndex) {
case 1 ->
b & 0x7F;
case 2 ->
((b & 0x1F) << 6) | (getByte(byteIndex + 1) & 0x3F);
((b & 0x1F) << 6) | continuationByte(byteIndex + 1);
case 3 ->
((b & 0x0F) << 12) | ((getByte(byteIndex + 1) & 0x3F) << 6) |
(getByte(byteIndex + 2) & 0x3F);
((b & 0x0F) << 12) | (continuationByte(byteIndex + 1) << 6) |
continuationByte(byteIndex + 2);
case 4 ->
((b & 0x07) << 18) | ((getByte(byteIndex + 1) & 0x3F) << 12) |
((getByte(byteIndex + 2) & 0x3F) << 6) | (getByte(byteIndex + 3) & 0x3F);
((b & 0x07) << 18) | (continuationByte(byteIndex + 1) << 12) |
(continuationByte(byteIndex + 2) << 6) | continuationByte(byteIndex + 3);
default ->
throw new IllegalStateException("Error in UTF-8 code point");
};
}

/**
* Returns the low 6 bits of the UTF-8 continuation byte at `byteIndex`, or 0 when `byteIndex`
* is past the end of the string. The bounds check stops a truncated trailing multi-byte
* sequence (a leader byte whose declared width exceeds the bytes that remain) from reading
* past the end of the backing memory.
*/
private int continuationByte(int byteIndex) {
return byteIndex < numBytes ? getByte(byteIndex) & 0x3F : 0;
}

public boolean matchAt(final UTF8String s, int pos) {
if (s.numBytes + pos > numBytes || pos < 0) {
return false;
Expand Down Expand Up @@ -941,7 +952,10 @@ public int findInSet(UTF8String match) {
* @return a new UTF8String in the position of [start, end] of current UTF8String bytes.
*/
public UTF8String copyUTF8String(int start, int end) {
int len = end - start + 1;
// Clamp to the bytes that actually remain so an out-of-range `end` (for example, derived
// from a truncated trailing multi-byte sequence) can't copy past the end of the backing
// memory.
int len = Math.min(end - start + 1, numBytes - start);
byte[] newBytes = new byte[len];
copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
return UTF8String.fromBytes(newBytes);
Expand Down Expand Up @@ -1134,7 +1148,9 @@ public UTF8String trimRight(UTF8String trimString) {
stringCharPos[numChars - 1],
stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
if (trimString.find(searchChar, 0) >= 0) {
trimEnd -= stringCharLen[numChars - 1];
// Advance by the bytes the character actually occupies. A truncated trailing leader is
// shorter than the width its leader byte declares, so use the (clamped) search char.
trimEnd -= searchChar.numBytes;
} else {
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1204,6 +1204,61 @@ public void testCodePointFrom() {
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1));
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length()));
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1));

// Truncated trailing multi-byte sequence: the leader declares more bytes than remain.
// codePointFrom should decode only the bytes present (missing continuation bytes count as
// 0) and not read past the end. Each backing array has extra trailing bytes, so an
// over-read regression would show up in the result.
// 2-byte leader 0xCE with no continuation byte present.
assertEquals((0xCE & 0x1F) << 6,
fromBytes(new byte[] {(byte) 0xCE, 0x42}, 0, 1).codePointFrom(0));
// 3-byte leader 0xE4 with no continuation bytes present.
assertEquals((0xE4 & 0x0F) << 12,
fromBytes(new byte[] {(byte) 0xE4, 0x42, 0x43}, 0, 1).codePointFrom(0));
// 3-byte leader 0xE4 0xB8 with the final continuation byte missing.
assertEquals(((0xE4 & 0x0F) << 12) | ((0xB8 & 0x3F) << 6),
fromBytes(new byte[] {(byte) 0xE4, (byte) 0xB8, 0x42}, 0, 2).codePointFrom(0));
// 4-byte leader 0xF1 with no continuation bytes present.
assertEquals((0xF1 & 0x07) << 18,
fromBytes(new byte[] {(byte) 0xF1, 0x42, 0x43, 0x44}, 0, 1).codePointFrom(0));
// 4-byte leader 0xF1 with two continuation bytes present and only the last one missing,
// so just the final read crosses the end.
assertEquals(((0xF1 & 0x07) << 18) | ((0x9F & 0x3F) << 12) | ((0x8F & 0x3F) << 6),
fromBytes(new byte[] {(byte) 0xF1, (byte) 0x9F, (byte) 0x8F, 0x42}, 0, 3).codePointFrom(0));
}

@Test
public void copyUTF8StringClampsToRemainingBytes() {
// Here `end` runs one byte past the string, as it would for a truncated trailing sequence.
// copyUTF8String should clamp to the available bytes; the extra backing byte would show up
// in the result if it over-read.
byte[] backing = new byte[] {0x41, 0x42, 0x43};
UTF8String s = fromBytes(backing, 0, 2); // views "AB"
// `end` (2) is one past the last valid byte index (1); only the two real bytes are copied.
assertEquals(fromString("AB"), s.copyUTF8String(0, 2));
// Same with a non-zero start, so the clamp uses `numBytes - start`, not `numBytes`.
assertEquals(fromString("B"), s.copyUTF8String(1, 2));
// In-bounds copies are unaffected.
assertEquals(fromString("AB"), s.copyUTF8String(0, 1));
assertEquals(fromString("B"), s.copyUTF8String(1, 1));
}

@Test
public void trimTruncatedTrailingSequence() {
// trimLeft/trimRight build the search character through copyUTF8String, so an over-read would
// make it longer than the bytes that remain. The backing arrays carry an extra trailing byte
// to make any over-read deterministic.
// A lone truncated 2-byte leader (0xC2): the clamped search char is just the leader, which
// matches the 1-byte trim set and is trimmed away.
UTF8String lone = fromBytes(new byte[] {(byte) 0xC2, 0x42}, 0, 1);
UTF8String trim2 = fromBytes(new byte[] {(byte) 0xC2});
assertEquals(EMPTY_UTF8, lone.trimLeft(trim2));
assertEquals(EMPTY_UTF8, lone.trimRight(trim2));
// 'A' followed by a truncated 3-byte leader (0xE4). Trimming the leader from the right must
// keep 'A': the trailing character occupies only one real byte, so only that byte is removed.
UTF8String prefixed = fromBytes(new byte[] {0x41, (byte) 0xE4, 0x42}, 0, 2);
UTF8String trim3 = fromBytes(new byte[] {(byte) 0xE4});
assertEquals(fromString("A"), prefixed.trimRight(trim3));
}

@Test
Expand Down