From 7342fe614dc39ad1f8947f2a0916fde7d5938073 Mon Sep 17 00:00:00 2001 From: Kostub D Date: Fri, 12 Jun 2026 12:24:18 +0530 Subject: [PATCH 1/4] fix(REN-5): raise MTParseErrorInvalidCharacter for non-ASCII literal input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, non-ASCII characters (e.g. π, ×, ≤) typed directly into a LaTeX string were silently dropped by the builder — the rendered output was missing the character and the caller's error: out-param stayed nil. Adds MTParseErrorInvalidCharacter to the MTParseErrors enum and replaces the silent continue in MTMathListBuilder.m with a setError:/return nil call (scoped to ch > 0x7E) that matches the existing error-reporting model used by MTParseErrorInvalidCommand. ASCII specials (space, $, %, etc.) continue to be silently ignored as before. Closes REN-5. Co-Authored-By: Claude Sonnet 4.6 --- iosMath/lib/MTMathListBuilder.h | 2 ++ iosMath/lib/MTMathListBuilder.m | 10 +++++++++- iosMathTests/MTMathListBuilderTest.m | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/iosMath/lib/MTMathListBuilder.h b/iosMath/lib/MTMathListBuilder.h index 69a69e72..c65d9126 100644 --- a/iosMath/lib/MTMathListBuilder.h +++ b/iosMath/lib/MTMathListBuilder.h @@ -89,6 +89,8 @@ typedef NS_ENUM(NSUInteger, MTParseErrors) { MTParseErrorInvalidLimits, /// The LaTeX nesting depth exceeded the safe parsing limit. MTParseErrorNestingTooDeep, + /// A character in the string is not recognized (e.g. a non-ASCII literal). + MTParseErrorInvalidCharacter, }; @end diff --git a/iosMath/lib/MTMathListBuilder.m b/iosMath/lib/MTMathListBuilder.m index e87288c5..8a13de08 100644 --- a/iosMath/lib/MTMathListBuilder.m +++ b/iosMath/lib/MTMathListBuilder.m @@ -361,7 +361,15 @@ - (MTMathList*)buildInternal:(BOOL) oneCharOnly stopChar:(unichar) stop } else { atom = [MTMathAtomFactory atomForCharacter:ch]; if (!atom) { - // Not a recognized character + if (ch > 0x7E) { + // Non-ASCII literal characters are not supported — report an error instead of + // silently dropping the character. Callers should use the corresponding LaTeX + // command (e.g. \pi instead of π, \times instead of ×). + [self setError:MTParseErrorInvalidCharacter + message:[NSString stringWithFormat:@"Unknown character U+%04X ('%C') is not a recognized LaTeX input character. Use the corresponding LaTeX command instead.", ch, ch]]; + return nil; + } + // Other unrecognized characters (e.g. space, $ % #) are silently ignored. continue; } } diff --git a/iosMathTests/MTMathListBuilderTest.m b/iosMathTests/MTMathListBuilderTest.m index 4bfe8c15..7b1a9f29 100644 --- a/iosMathTests/MTMathListBuilderTest.m +++ b/iosMathTests/MTMathListBuilderTest.m @@ -1488,6 +1488,10 @@ - (void) testDisplayLines @[@"x^\\choose y", @(MTParseErrorInvalidCommand)], @[@"x^\\brack y", @(MTParseErrorInvalidCommand)], @[@"x^\\brace y", @(MTParseErrorInvalidCommand)], + // REN-5: non-ASCII literal characters should produce MTParseErrorInvalidCharacter + @[@"π", @(MTParseErrorInvalidCharacter)], // π (U+03C0) + @[@"3 × 4", @(MTParseErrorInvalidCharacter)], // 3 × 4 + @[@"x ≤ y", @(MTParseErrorInvalidCharacter)], // x ≤ y ]; }; From 79a5f1ef6940929526fefd0a0508c785915e82be Mon Sep 17 00:00:00 2001 From: Kostub D Date: Fri, 26 Jun 2026 00:39:30 +0530 Subject: [PATCH 2/4] fix(REN-5): full LaTeX semantics for special characters and tilde Extends the non-ASCII handling to cover all special characters, matching real LaTeX behaviour in math mode: - ~ renders as a space (LaTeX non-breaking space). - % # $ now raise MTParseErrorInvalidCharacter (a comment, a macro parameter, and a math-mode toggle respectively - none valid here) instead of being silently dropped. - Whitespace (space, tab, newline, CR) remains silently ignored. - Non-ASCII literals continue to raise MTParseErrorInvalidCharacter. Adds builder/error test cases for ~ and for % # $. Co-Authored-By: Claude Opus 4.8 --- iosMath/lib/MTMathListBuilder.h | 4 +++- iosMath/lib/MTMathListBuilder.m | 22 +++++++++++++--------- iosMathTests/MTMathListBuilderTest.m | 7 +++++++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/iosMath/lib/MTMathListBuilder.h b/iosMath/lib/MTMathListBuilder.h index c65d9126..15245b79 100644 --- a/iosMath/lib/MTMathListBuilder.h +++ b/iosMath/lib/MTMathListBuilder.h @@ -89,7 +89,9 @@ typedef NS_ENUM(NSUInteger, MTParseErrors) { MTParseErrorInvalidLimits, /// The LaTeX nesting depth exceeded the safe parsing limit. MTParseErrorNestingTooDeep, - /// A character in the string is not recognized (e.g. a non-ASCII literal). + /// A character in the string is not a valid LaTeX input character in math + /// mode (e.g. a non-ASCII literal like π, or a special character such as + /// %, #, $ that has no meaning here). MTParseErrorInvalidCharacter, }; diff --git a/iosMath/lib/MTMathListBuilder.m b/iosMath/lib/MTMathListBuilder.m index 8a13de08..62c46350 100644 --- a/iosMath/lib/MTMathListBuilder.m +++ b/iosMath/lib/MTMathListBuilder.m @@ -358,19 +358,23 @@ - (MTMathList*)buildInternal:(BOOL) oneCharOnly stopChar:(unichar) stop } else if (_spacesAllowed && ch == ' ') { // If spaces are allowed then spaces do not need escaping with a \ before being used. atom = [MTMathAtomFactory atomForLatexSymbolName:@" "]; + } else if (ch == '~') { + // Tilde is a non-breaking space in LaTeX; render it as an ordinary space. + atom = [MTMathAtomFactory atomForLatexSymbolName:@" "]; } else { atom = [MTMathAtomFactory atomForCharacter:ch]; if (!atom) { - if (ch > 0x7E) { - // Non-ASCII literal characters are not supported — report an error instead of - // silently dropping the character. Callers should use the corresponding LaTeX - // command (e.g. \pi instead of π, \times instead of ×). - [self setError:MTParseErrorInvalidCharacter - message:[NSString stringWithFormat:@"Unknown character U+%04X ('%C') is not a recognized LaTeX input character. Use the corresponding LaTeX command instead.", ch, ch]]; - return nil; + // Whitespace is insignificant in math mode and is silently ignored. + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') { + continue; } - // Other unrecognized characters (e.g. space, $ % #) are silently ignored. - continue; + // Any other unrecognized character is an error: a non-ASCII literal + // (e.g. π, ×, ≤) or a special character with no meaning in math mode + // (% is a comment, # a macro parameter, $ toggles math mode). Callers + // should use the corresponding LaTeX command (e.g. \pi, \%, \#). + [self setError:MTParseErrorInvalidCharacter + message:[NSString stringWithFormat:@"Unknown character U+%04X ('%C') is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", ch, ch]]; + return nil; } } NSAssert(atom != nil, @"Atom shouldn't be nil"); diff --git a/iosMathTests/MTMathListBuilderTest.m b/iosMathTests/MTMathListBuilderTest.m index 7b1a9f29..6eb434f9 100644 --- a/iosMathTests/MTMathListBuilderTest.m +++ b/iosMathTests/MTMathListBuilderTest.m @@ -69,6 +69,8 @@ - (void)tearDown @[ @"x \\ y", @[ @(kMTMathAtomVariable), @(kMTMathAtomOrdinary), @(kMTMathAtomVariable)], @"x\\ y"], // spacing @[ @"x \\quad y \\; z \\! q", @[ @(kMTMathAtomVariable), @(kMTMathAtomSpace), @(kMTMathAtomVariable),@(kMTMathAtomSpace), @(kMTMathAtomVariable),@(kMTMathAtomSpace), @(kMTMathAtomVariable)], @"x\\quad y\\; z\\! q"], + // tilde is a non-breaking space (renders as an ordinary space, same as a literal space) + @[ @"x~y", @[ @(kMTMathAtomVariable), @(kMTMathAtomOrdinary), @(kMTMathAtomVariable)], @"x\\ y"], ]; } @@ -1492,6 +1494,11 @@ - (void) testDisplayLines @[@"π", @(MTParseErrorInvalidCharacter)], // π (U+03C0) @[@"3 × 4", @(MTParseErrorInvalidCharacter)], // 3 × 4 @[@"x ≤ y", @(MTParseErrorInvalidCharacter)], // x ≤ y + // Special characters with no meaning in math mode are errors (match LaTeX: + // % is a comment, # is a macro parameter, $ toggles math mode - none valid here). + @[@"a % b", @(MTParseErrorInvalidCharacter)], + @[@"a # b", @(MTParseErrorInvalidCharacter)], + @[@"a $ b", @(MTParseErrorInvalidCharacter)], ]; }; From ad020ca4ce287337b65f1b11a4b40b0b76dee963 Mon Sep 17 00:00:00 2001 From: Kostub D Date: Sun, 28 Jun 2026 01:35:40 +0530 Subject: [PATCH 3/4] =?UTF-8?q?fix(REN-5):=20address=20review=20=E2=80=94?= =?UTF-8?q?=20surrogate-aware=20error,=20NUL=20ignore,=20doc=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Decode UTF-16 surrogate pairs in the MTParseErrorInvalidCharacter message so above-BMP literals (e.g. 𝑎) report the real scalar U+1D44E instead of a lone surrogate U+D835. - Silently ignore NUL (catcode 9), the one whitespace-like character TeX actually discards that we were missing. Keep form feed (\par) and vertical tab (catcode "other") as errors — they are not spaces in TeX. - Update stale atomForCharacter: comments that said these characters are "skipped"/"not supported"; the builder now decides (ignore vs. error). - Add tests: astral-literal error, surrogate-pair message decoding, and a whitespace/NUL silent-ignore regression guard. Co-Authored-By: Claude Opus 4.8 --- iosMath/lib/MTMathAtomFactory.m | 10 +++++--- iosMath/lib/MTMathListBuilder.m | 27 +++++++++++++++++--- iosMathTests/MTMathListBuilderTest.m | 37 ++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/iosMath/lib/MTMathAtomFactory.m b/iosMath/lib/MTMathAtomFactory.m index 727d86eb..f3945ed4 100644 --- a/iosMath/lib/MTMathAtomFactory.m +++ b/iosMath/lib/MTMathAtomFactory.m @@ -98,11 +98,15 @@ + (nullable MTMathAtom *)atomForCharacter:(unichar)ch { NSString *chStr = [NSString stringWithCharacters:&ch length:1]; if (ch < 0x21 || ch > 0x7E) { - // skip non ascii characters and spaces. Non-Latin text must be - // wrapped in \text*, \textbf{...}, etc. + // No atom for control characters, spaces, or non-ASCII literals. The + // builder decides what to do with these: whitespace is silently ignored, + // everything else raises MTParseErrorInvalidCharacter. Non-Latin text + // must be wrapped in \text*, \textbf{...}, etc. return nil; } else if (ch == '$' || ch == '%' || ch == '#' || ch == '&' || ch == '~' || ch == '\'') { - // These are latex control characters that have special meanings. We don't support them. + // LaTeX control characters with special meanings. They have no atom of + // their own; the builder handles them (& / ~ / ' are consumed before + // reaching here, while $ % # raise MTParseErrorInvalidCharacter). return nil; } else if (ch == '^' || ch == '_' || ch == '{' || ch == '}' || ch == '\\') { // more special characters for Latex. diff --git a/iosMath/lib/MTMathListBuilder.m b/iosMath/lib/MTMathListBuilder.m index 62c46350..38d77bf2 100644 --- a/iosMath/lib/MTMathListBuilder.m +++ b/iosMath/lib/MTMathListBuilder.m @@ -364,16 +364,37 @@ - (MTMathList*)buildInternal:(BOOL) oneCharOnly stopChar:(unichar) stop } else { atom = [MTMathAtomFactory atomForCharacter:ch]; if (!atom) { - // Whitespace is insignificant in math mode and is silently ignored. - if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') { + // Characters TeX silently discards: whitespace (catcode 10/5, + // ignored in math mode) and NUL (catcode 9). Note that other + // control characters are *not* spaces in TeX (form feed is \par, + // vertical tab is an ordinary "other" character), so they fall + // through to the error below, as they should. + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\0') { continue; } // Any other unrecognized character is an error: a non-ASCII literal // (e.g. π, ×, ≤) or a special character with no meaning in math mode // (% is a comment, # a macro parameter, $ toggles math mode). Callers // should use the corresponding LaTeX command (e.g. \pi, \%, \#). + // _chars is UTF-16, so decode a surrogate pair to report the real + // Unicode scalar (e.g. U+1D44E) instead of a lone surrogate. + UTF32Char codepoint = ch; + NSString* charStr; + if (CFStringIsSurrogateHighCharacter(ch) && [self hasCharacters]) { + unichar low = [self getNextCharacter]; + if (CFStringIsSurrogateLowCharacter(low)) { + unichar pair[2] = {ch, low}; + charStr = [NSString stringWithCharacters:pair length:2]; + codepoint = CFStringGetLongCharacterForSurrogatePair(ch, low); + } else { + [self unlookCharacter]; + charStr = [NSString stringWithCharacters:&ch length:1]; + } + } else { + charStr = [NSString stringWithCharacters:&ch length:1]; + } [self setError:MTParseErrorInvalidCharacter - message:[NSString stringWithFormat:@"Unknown character U+%04X ('%C') is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", ch, ch]]; + message:[NSString stringWithFormat:@"Unknown character U+%04X ('%@') is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", codepoint, charStr]]; return nil; } } diff --git a/iosMathTests/MTMathListBuilderTest.m b/iosMathTests/MTMathListBuilderTest.m index 6eb434f9..7892e824 100644 --- a/iosMathTests/MTMathListBuilderTest.m +++ b/iosMathTests/MTMathListBuilderTest.m @@ -1494,6 +1494,7 @@ - (void) testDisplayLines @[@"π", @(MTParseErrorInvalidCharacter)], // π (U+03C0) @[@"3 × 4", @(MTParseErrorInvalidCharacter)], // 3 × 4 @[@"x ≤ y", @(MTParseErrorInvalidCharacter)], // x ≤ y + @[@"x 𝑎 y", @(MTParseErrorInvalidCharacter)], // above-BMP literal (U+1D44E, surrogate pair) // Special characters with no meaning in math mode are errors (match LaTeX: // % is a comment, # is a macro parameter, $ toggles math mode - none valid here). @[@"a % b", @(MTParseErrorInvalidCharacter)], @@ -1519,6 +1520,42 @@ - (void) testErrors } } +// REN-5: an above-BMP literal is a UTF-16 surrogate pair; the error message must +// name the real Unicode scalar (U+1D44E), not a lone surrogate (U+D835). +- (void) testInvalidCharacterErrorMessageDecodesSurrogatePair +{ + NSError* error = nil; + MTMathList* list = [MTMathListBuilder buildFromString:@"𝑎" error:&error]; + XCTAssertNil(list); + XCTAssertNotNil(error); + XCTAssertEqual(error.code, MTParseErrorInvalidCharacter); + NSString* message = error.userInfo[NSLocalizedDescriptionKey]; + XCTAssertTrue([message containsString:@"U+1D44E"], + @"Expected real scalar U+1D44E in message, got: %@", message); + XCTAssertFalse([message containsString:@"U+D835"], + @"Message should not report a lone surrogate: %@", message); +} + +// REN-5: characters TeX silently discards (whitespace catcode 10/5 and NUL +// catcode 9) must continue to parse without error. Guards against the error +// path swallowing legitimate whitespace. +- (void) testIgnoredWhitespaceCharacters +{ + unichar nulChars[3] = { 'x', 0x0000, 'y' }; + NSString* withNul = [NSString stringWithCharacters:nulChars length:3]; + NSArray* inputs = @[ @"x\ty", @"x\ny", @"x\ry", withNul ]; + for (NSString* str in inputs) { + NSError* error = nil; + MTMathList* list = [MTMathListBuilder buildFromString:str error:&error]; + NSString* desc = [NSString stringWithFormat:@"whitespace input %@", str]; + XCTAssertNotNil(list, @"%@", desc); + XCTAssertNil(error, @"%@", desc); + XCTAssertEqual(list.atoms.count, 2u, @"%@", desc); + XCTAssertEqual([list.atoms[0] type], kMTMathAtomVariable, @"%@", desc); + XCTAssertEqual([list.atoms[1] type], kMTMathAtomVariable, @"%@", desc); + } +} + // REN-6: \over inside an explicit-brace script group must still parse correctly. - (void) testOverInScriptBraces { From d0e4b57a43d983dea15c953759ec645f1b87e7b1 Mon Sep 17 00:00:00 2001 From: Kostub D Date: Sun, 28 Jun 2026 16:01:03 +0530 Subject: [PATCH 4/4] fix(REN-5): simplify invalid-character error message The character is already known to be invalid, so the message doesn't need to render the glyph. Drop the surrogate-pair decoding (only needed for the glyph) and just report the UTF-16 code unit via %04X, which is plain integer formatting with no crash risk. An above-BMP character reports its leading surrogate, which is acceptable for an error string. Removes the now-obsolete surrogate-decoding message test; the astral-literal error-code case and the whitespace/NUL ignore test remain. Co-Authored-By: Claude Opus 4.8 --- iosMath/lib/MTMathListBuilder.m | 22 ++++------------------ iosMathTests/MTMathListBuilderTest.m | 16 ---------------- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/iosMath/lib/MTMathListBuilder.m b/iosMath/lib/MTMathListBuilder.m index 38d77bf2..fb51fc42 100644 --- a/iosMath/lib/MTMathListBuilder.m +++ b/iosMath/lib/MTMathListBuilder.m @@ -376,25 +376,11 @@ - (MTMathList*)buildInternal:(BOOL) oneCharOnly stopChar:(unichar) stop // (e.g. π, ×, ≤) or a special character with no meaning in math mode // (% is a comment, # a macro parameter, $ toggles math mode). Callers // should use the corresponding LaTeX command (e.g. \pi, \%, \#). - // _chars is UTF-16, so decode a surrogate pair to report the real - // Unicode scalar (e.g. U+1D44E) instead of a lone surrogate. - UTF32Char codepoint = ch; - NSString* charStr; - if (CFStringIsSurrogateHighCharacter(ch) && [self hasCharacters]) { - unichar low = [self getNextCharacter]; - if (CFStringIsSurrogateLowCharacter(low)) { - unichar pair[2] = {ch, low}; - charStr = [NSString stringWithCharacters:pair length:2]; - codepoint = CFStringGetLongCharacterForSurrogatePair(ch, low); - } else { - [self unlookCharacter]; - charStr = [NSString stringWithCharacters:&ch length:1]; - } - } else { - charStr = [NSString stringWithCharacters:&ch length:1]; - } + // ch is a single UTF-16 code unit; we just report its value (an + // above-BMP character reports its leading surrogate, which is fine + // for an error message). [self setError:MTParseErrorInvalidCharacter - message:[NSString stringWithFormat:@"Unknown character U+%04X ('%@') is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", codepoint, charStr]]; + message:[NSString stringWithFormat:@"Unknown character U+%04X is not a valid LaTeX input character in math mode. Use the corresponding LaTeX command instead.", ch]]; return nil; } } diff --git a/iosMathTests/MTMathListBuilderTest.m b/iosMathTests/MTMathListBuilderTest.m index 7892e824..24c2b957 100644 --- a/iosMathTests/MTMathListBuilderTest.m +++ b/iosMathTests/MTMathListBuilderTest.m @@ -1520,22 +1520,6 @@ - (void) testErrors } } -// REN-5: an above-BMP literal is a UTF-16 surrogate pair; the error message must -// name the real Unicode scalar (U+1D44E), not a lone surrogate (U+D835). -- (void) testInvalidCharacterErrorMessageDecodesSurrogatePair -{ - NSError* error = nil; - MTMathList* list = [MTMathListBuilder buildFromString:@"𝑎" error:&error]; - XCTAssertNil(list); - XCTAssertNotNil(error); - XCTAssertEqual(error.code, MTParseErrorInvalidCharacter); - NSString* message = error.userInfo[NSLocalizedDescriptionKey]; - XCTAssertTrue([message containsString:@"U+1D44E"], - @"Expected real scalar U+1D44E in message, got: %@", message); - XCTAssertFalse([message containsString:@"U+D835"], - @"Message should not report a lone surrogate: %@", message); -} - // REN-5: characters TeX silently discards (whitespace catcode 10/5 and NUL // catcode 9) must continue to parse without error. Guards against the error // path swallowing legitimate whitespace.