From d8b1eeb28a50710f5a87ab66845dbe625a421d75 Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Tue, 23 Jun 2026 20:50:12 -0400 Subject: [PATCH 1/2] WIP Signed-off-by: Juan Cruz Viotti --- .github/workflows/website-build.yml | 1 + .github/workflows/website-deploy.yml | 1 + CMakeLists.txt | 9 + config.cmake.in | 4 + src/core/langtag/CMakeLists.txt | 9 + .../langtag/include/sourcemeta/core/langtag.h | 47 ++ src/core/langtag/langtag.cc | 252 +++++++++++ test/langtag/CMakeLists.txt | 5 + test/langtag/langtag_test.cc | 404 ++++++++++++++++++ 9 files changed, 732 insertions(+) create mode 100644 src/core/langtag/CMakeLists.txt create mode 100644 src/core/langtag/include/sourcemeta/core/langtag.h create mode 100644 src/core/langtag/langtag.cc create mode 100644 test/langtag/CMakeLists.txt create mode 100644 test/langtag/langtag_test.cc diff --git a/.github/workflows/website-build.yml b/.github/workflows/website-build.yml index 29f60fd7d..884083204 100644 --- a/.github/workflows/website-build.yml +++ b/.github/workflows/website-build.yml @@ -34,6 +34,7 @@ jobs: -DSOURCEMETA_CORE_IDNA:BOOL=OFF -DSOURCEMETA_CORE_DNS:BOOL=OFF -DSOURCEMETA_CORE_EMAIL:BOOL=OFF + -DSOURCEMETA_CORE_LANGTAG:BOOL=OFF -DSOURCEMETA_CORE_URI:BOOL=OFF -DSOURCEMETA_CORE_URITEMPLATE:BOOL=OFF -DSOURCEMETA_CORE_JSON:BOOL=OFF diff --git a/.github/workflows/website-deploy.yml b/.github/workflows/website-deploy.yml index 51d7bb264..013b71d0d 100644 --- a/.github/workflows/website-deploy.yml +++ b/.github/workflows/website-deploy.yml @@ -44,6 +44,7 @@ jobs: -DSOURCEMETA_CORE_IDNA:BOOL=OFF -DSOURCEMETA_CORE_DNS:BOOL=OFF -DSOURCEMETA_CORE_EMAIL:BOOL=OFF + -DSOURCEMETA_CORE_LANGTAG:BOOL=OFF -DSOURCEMETA_CORE_URI:BOOL=OFF -DSOURCEMETA_CORE_URITEMPLATE:BOOL=OFF -DSOURCEMETA_CORE_JSON:BOOL=OFF diff --git a/CMakeLists.txt b/CMakeLists.txt index da1806b61..22f9a9a78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ option(SOURCEMETA_CORE_IP "Build the Sourcemeta Core IP library" ON) 
option(SOURCEMETA_CORE_IDNA "Build the Sourcemeta Core IDNA library" ON) option(SOURCEMETA_CORE_DNS "Build the Sourcemeta Core DNS library" ON) option(SOURCEMETA_CORE_EMAIL "Build the Sourcemeta Core Email library" ON) +option(SOURCEMETA_CORE_LANGTAG "Build the Sourcemeta Core LangTag library" ON) option(SOURCEMETA_CORE_URI "Build the Sourcemeta Core URI library" ON) option(SOURCEMETA_CORE_URITEMPLATE "Build the Sourcemeta Core URI Template library" ON) option(SOURCEMETA_CORE_JSON "Build the Sourcemeta Core JSON library" ON) @@ -150,6 +151,10 @@ if(SOURCEMETA_CORE_EMAIL) add_subdirectory(src/core/email) endif() +if(SOURCEMETA_CORE_LANGTAG) + add_subdirectory(src/core/langtag) +endif() + if(SOURCEMETA_CORE_URI) add_subdirectory(src/core/uri) endif() @@ -313,6 +318,10 @@ if(SOURCEMETA_CORE_TESTS) add_subdirectory(test/email) endif() + if(SOURCEMETA_CORE_LANGTAG) + add_subdirectory(test/langtag) + endif() + if(SOURCEMETA_CORE_URI) add_subdirectory(test/uri) endif() diff --git a/config.cmake.in b/config.cmake.in index cd58347e2..5867d812e 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -18,6 +18,7 @@ if(NOT SOURCEMETA_CORE_COMPONENTS) list(APPEND SOURCEMETA_CORE_COMPONENTS idna) list(APPEND SOURCEMETA_CORE_COMPONENTS dns) list(APPEND SOURCEMETA_CORE_COMPONENTS email) + list(APPEND SOURCEMETA_CORE_COMPONENTS langtag) list(APPEND SOURCEMETA_CORE_COMPONENTS uri) list(APPEND SOURCEMETA_CORE_COMPONENTS uritemplate) list(APPEND SOURCEMETA_CORE_COMPONENTS json) @@ -94,6 +95,9 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_text.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_email.cmake") + elseif(component STREQUAL "langtag") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_text.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_langtag.cmake") elseif(component STREQUAL "uri") 
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_io.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") diff --git a/src/core/langtag/CMakeLists.txt b/src/core/langtag/CMakeLists.txt new file mode 100644 index 000000000..9ec88c581 --- /dev/null +++ b/src/core/langtag/CMakeLists.txt @@ -0,0 +1,9 @@ +sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME langtag + SOURCES langtag.cc) + +if(SOURCEMETA_CORE_INSTALL) + sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME langtag) +endif() + +target_link_libraries(sourcemeta_core_langtag + PRIVATE sourcemeta::core::text) diff --git a/src/core/langtag/include/sourcemeta/core/langtag.h b/src/core/langtag/include/sourcemeta/core/langtag.h new file mode 100644 index 000000000..7019df93e --- /dev/null +++ b/src/core/langtag/include/sourcemeta/core/langtag.h @@ -0,0 +1,47 @@ +#ifndef SOURCEMETA_CORE_LANGTAG_H_ +#define SOURCEMETA_CORE_LANGTAG_H_ + +#ifndef SOURCEMETA_CORE_LANGTAG_EXPORT +#include +#endif + +#include // std::string_view + +/// @defgroup langtag LangTag +/// @brief BCP 47 language tag validation utilities. +/// +/// This functionality is included as follows: +/// +/// ```cpp +/// #include +/// ``` + +namespace sourcemeta::core { + +/// @ingroup langtag +/// Check whether the given string is a well-formed language tag per RFC 5646 +/// (BCP 47). In addition to the grammar, the two duplication errors that the +/// specification forbids without consulting any registry are rejected: a +/// repeated variant subtag (RFC 5646 Section 2.2.5) and more than one extension +/// for the same singleton (RFC 5646 Section 2.2.6). Validity against the IANA +/// Language Subtag Registry is not checked, so a structurally well-formed tag +/// whose subtags are not registered is still accepted. Comparison is +/// case-insensitive, as language tags are. 
For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/langtag.h>
+///
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::is_langtag("en"));
+/// assert(sourcemeta::core::is_langtag("zh-Hant-HK"));
+/// assert(sourcemeta::core::is_langtag("x-private"));
+/// assert(!sourcemeta::core::is_langtag("en-"));
+/// assert(!sourcemeta::core::is_langtag("de-1996-1996"));
+/// ```
+SOURCEMETA_CORE_LANGTAG_EXPORT
+auto is_langtag(const std::string_view value) -> bool;
+
+} // namespace sourcemeta::core
+
+#endif
diff --git a/src/core/langtag/langtag.cc b/src/core/langtag/langtag.cc
new file mode 100644
index 000000000..ed00282e4
--- /dev/null
+++ b/src/core/langtag/langtag.cc
@@ -0,0 +1,252 @@
+#include <sourcemeta/core/langtag.h>
+#include <sourcemeta/core/text.h>
+
+#include <array>       // std::array
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint64_t
+#include <string_view> // std::string_view
+
+namespace {
+
+// The irregular grandfathered tags (RFC 5646 Section 2.2.8) do not fit the
+// langtag grammar and must be matched literally. The regular grandfathered
+// tags are intentionally omitted, as they are already accepted by the langtag
+// grammar.
+constexpr std::array<std::string_view, 17> irregular_grandfathered{
+    {"en-GB-oed", "i-ami", "i-bnn", "i-default", "i-enochian", "i-hak",
+     "i-klingon", "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay",
+     "i-tsu", "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE"}};
+
+// The subtag starting at the given position, up to the next hyphen or the end.
+auto subtag_at(const std::string_view value, const std::size_t position)
+    -> std::string_view {
+  auto end{position};
+  while (end < value.size() && value[end] != '-') {
+    end += 1;
+  }
+  return value.substr(position, end - position);
+}
+
+// Advance past a subtag of the given length, and the separating hyphen if any.
+auto advance(const std::string_view value, std::size_t &position,
+             const std::size_t length) -> void {
+  position += length;
+  if (position < value.size()) {
+    position += 1;
+  }
+}
+
+// singleton = DIGIT / a-w / y-z, mapped to the range [0, 35]
+auto singleton_index(const char character) -> std::size_t {
+  const auto lowered{sourcemeta::core::to_lowercase(character)};
+  if (sourcemeta::core::is_digit(lowered)) {
+    return static_cast<std::size_t>(lowered - '0');
+  }
+  return static_cast<std::size_t>(lowered - 'a') + 10;
+}
+
+// language = 2*3ALPHA ["-" extlang] / 4ALPHA / 5*8ALPHA
+auto parse_language(const std::string_view value, std::size_t &position)
+    -> bool {
+  const auto primary{subtag_at(value, position)};
+  if (!sourcemeta::core::is_alpha(primary)) {
+    return false;
+  }
+  if (primary.size() >= 2 && primary.size() <= 3) {
+    advance(value, position, primary.size());
+    // extlang = 3ALPHA *2("-" 3ALPHA), so up to three subtags of three letters
+    for (std::size_t count{0}; count < 3; count += 1) {
+      const auto extlang{subtag_at(value, position)};
+      if (extlang.size() != 3 || !sourcemeta::core::is_alpha(extlang)) {
+        break;
+      }
+      advance(value, position, 3);
+    }
+    return true;
+  }
+  if (primary.size() == 4 || (primary.size() >= 5 && primary.size() <= 8)) {
+    advance(value, position, primary.size());
+    return true;
+  }
+  return false;
+}
+
+// script = 4ALPHA
+auto parse_script(const std::string_view value, std::size_t &position) -> void {
+  const auto script{subtag_at(value, position)};
+  if (script.size() == 4 && sourcemeta::core::is_alpha(script)) {
+    advance(value, position, 4);
+  }
+}
+
+// region = 2ALPHA / 3DIGIT
+auto parse_region(const std::string_view value, std::size_t &position) -> void {
+  const auto region{subtag_at(value, position)};
+  if ((region.size() == 2 && sourcemeta::core::is_alpha(region)) ||
+      (region.size() == 3 && sourcemeta::core::is_digit(region))) {
+    advance(value, position, region.size());
+  }
+}
+
+// Whether the candidate variant already appears among
the subtags in +// [begin, end). The variant region is contiguous, so it is re-scanned rather +// than stored. +auto seen_variant(const std::string_view value, const std::size_t begin, + const std::size_t end, const std::string_view candidate) + -> bool { + std::size_t cursor{begin}; + while (cursor < end) { + auto stop{cursor}; + while (stop < end && value[stop] != '-') { + stop += 1; + } + if (sourcemeta::core::equals_ignore_case( + value.substr(cursor, stop - cursor), candidate)) { + return true; + } + cursor = stop + 1; + } + return false; +} + +// variant = 5*8alphanum / (DIGIT 3alphanum), with no repeats +auto parse_variants(const std::string_view value, std::size_t &position) + -> bool { + const auto begin{position}; + while (true) { + const auto variant{subtag_at(value, position)}; + const bool matches{(variant.size() >= 5 && variant.size() <= 8 && + sourcemeta::core::is_alphanum(variant)) || + (variant.size() == 4 && + sourcemeta::core::is_digit(variant.front()) && + sourcemeta::core::is_alphanum(variant))}; + if (!matches) { + break; + } + if (seen_variant(value, begin, position, variant)) { + return false; + } + advance(value, position, variant.size()); + } + return true; +} + +// Consume one or more alphanumeric subtags whose length is in +// [minimum_length, 8], requiring at least one. +auto consume_subtags(const std::string_view value, std::size_t &position, + const std::size_t minimum_length) -> bool { + std::size_t count{0}; + while (true) { + const auto subtag{subtag_at(value, position)}; + if (subtag.size() < minimum_length || subtag.size() > 8 || + !sourcemeta::core::is_alphanum(subtag)) { + break; + } + advance(value, position, subtag.size()); + count += 1; + } + return count > 0; +} + +// extension = singleton 1*("-" (2*8alphanum)), with each singleton at most +// once. The singleton "x" is excluded as it introduces the private use. 
+auto parse_extensions(const std::string_view value, std::size_t &position) + -> bool { + std::uint64_t seen{0}; + while (true) { + const auto singleton{subtag_at(value, position)}; + if (singleton.size() != 1) { + break; + } + const auto character{singleton.front()}; + if (!sourcemeta::core::is_alphanum(character) || + sourcemeta::core::to_lowercase(character) == 'x') { + break; + } + const auto bit{std::uint64_t{1} << singleton_index(character)}; + if ((seen & bit) != 0) { + return false; + } + seen |= bit; + advance(value, position, 1); + if (!consume_subtags(value, position, 2)) { + return false; + } + } + return true; +} + +// privateuse = "x" 1*("-" (1*8alphanum)) +auto parse_privateuse(const std::string_view value, std::size_t &position) + -> bool { + const auto singleton{subtag_at(value, position)}; + if (singleton.size() != 1 || + sourcemeta::core::to_lowercase(singleton.front()) != 'x') { + return true; + } + advance(value, position, 1); + return consume_subtags(value, position, 1); +} + +// Language-Tag = langtag (RFC 5646 Section 2.1) +auto valid_langtag(const std::string_view value) -> bool { + std::size_t position{0}; + if (!parse_language(value, position)) { + return false; + } + parse_script(value, position); + parse_region(value, position); + if (!parse_variants(value, position)) { + return false; + } + if (!parse_extensions(value, position)) { + return false; + } + if (!parse_privateuse(value, position)) { + return false; + } + return position >= value.size(); +} + +// Language-Tag = privateuse (RFC 5646 Section 2.1) +auto valid_privateuse(const std::string_view value) -> bool { + std::size_t position{0}; + const auto singleton{subtag_at(value, position)}; + if (singleton.size() != 1 || + sourcemeta::core::to_lowercase(singleton.front()) != 'x') { + return false; + } + advance(value, position, 1); + return consume_subtags(value, position, 1) && position >= value.size(); +} + +auto is_irregular_grandfathered(const std::string_view value) -> bool { 
+ for (const auto entry : irregular_grandfathered) { + if (sourcemeta::core::equals_ignore_case(value, entry)) { + return true; + } + } + return false; +} + +} // namespace + +namespace sourcemeta::core { + +auto is_langtag(const std::string_view value) -> bool { + if (value.empty() || value.front() == '-' || value.back() == '-') { + return false; + } + // Language-Tag = langtag / privateuse / grandfathered. The regular + // grandfathered tags are covered by the langtag alternative, so only the + // irregular ones need a literal fallback. + if (valid_langtag(value)) { + return true; + } + if (valid_privateuse(value)) { + return true; + } + return is_irregular_grandfathered(value); +} + +} // namespace sourcemeta::core diff --git a/test/langtag/CMakeLists.txt b/test/langtag/CMakeLists.txt new file mode 100644 index 000000000..4dfd8cb41 --- /dev/null +++ b/test/langtag/CMakeLists.txt @@ -0,0 +1,5 @@ +sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME langtag + SOURCES langtag_test.cc) + +target_link_libraries(sourcemeta_core_langtag_unit + PRIVATE sourcemeta::core::langtag) diff --git a/test/langtag/langtag_test.cc b/test/langtag/langtag_test.cc new file mode 100644 index 000000000..07e6d5125 --- /dev/null +++ b/test/langtag/langtag_test.cc @@ -0,0 +1,404 @@ +#include + +#include + +// RFC 5646 Appendix A: simple language subtag +TEST(LangTag, rfc_simple_language_subtag) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de")); + EXPECT_TRUE(sourcemeta::core::is_langtag("fr")); + EXPECT_TRUE(sourcemeta::core::is_langtag("ja")); +} + +// RFC 5646 Appendix A: language subtag plus script subtag +TEST(LangTag, rfc_language_plus_script) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-Hant")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-Hans")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sr-Cyrl")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sr-Latn")); +} + +// RFC 5646 Appendix A: extended language subtags +TEST(LangTag, rfc_extended_language_subtags) { + 
EXPECT_TRUE(sourcemeta::core::is_langtag("zh-cmn-Hans-CN")); + EXPECT_TRUE(sourcemeta::core::is_langtag("cmn-Hans-CN")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-yue-HK")); + EXPECT_TRUE(sourcemeta::core::is_langtag("yue-HK")); +} + +// RFC 5646 Appendix A: language-script-region +TEST(LangTag, rfc_language_script_region) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-Hans-CN")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sr-Latn-RS")); +} + +// RFC 5646 Appendix A: language-variant +TEST(LangTag, rfc_language_variant) { + EXPECT_TRUE(sourcemeta::core::is_langtag("sl-rozaj")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sl-rozaj-biske")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sl-nedis")); +} + +// RFC 5646 Appendix A: language-region-variant +TEST(LangTag, rfc_language_region_variant) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-CH-1901")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sl-IT-nedis")); +} + +// RFC 5646 Appendix A: language-script-region-variant +TEST(LangTag, rfc_language_script_region_variant) { + EXPECT_TRUE(sourcemeta::core::is_langtag("hy-Latn-IT-arevela")); +} + +// RFC 5646 Appendix A: language-region +TEST(LangTag, rfc_language_region) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-DE")); + EXPECT_TRUE(sourcemeta::core::is_langtag("en-US")); + EXPECT_TRUE(sourcemeta::core::is_langtag("es-419")); +} + +// RFC 5646 Appendix A: private use subtags +TEST(LangTag, rfc_private_use_subtags) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-CH-x-phonebk")); + EXPECT_TRUE(sourcemeta::core::is_langtag("az-Arab-x-AYB")); +} + +// RFC 5646 Appendix A: tags using the private use registry values +TEST(LangTag, rfc_private_use_registry_values) { + EXPECT_TRUE(sourcemeta::core::is_langtag("qaa-Qaaa-QM-x-southern")); + EXPECT_TRUE(sourcemeta::core::is_langtag("de-Qaaa")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sr-Latn-QM")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sr-Qaaa-RS")); +} + +// RFC 5646 Appendix A: tags using extensions 
+TEST(LangTag, rfc_extensions) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-US-u-islamcal")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-CN-a-myext-x-private")); + EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-myext-b-another")); +} + +// RFC 5646 Appendix A: private use only +TEST(LangTag, rfc_private_use_only) { + EXPECT_TRUE(sourcemeta::core::is_langtag("x-whatever")); +} + +// RFC 5646 Appendix A: invalid, two region subtags +TEST(LangTag, rfc_invalid_two_region_subtags) { + EXPECT_FALSE(sourcemeta::core::is_langtag("de-419-DE")); +} + +// RFC 5646 Appendix A: invalid, single-character primary language +TEST(LangTag, rfc_invalid_single_character_primary) { + EXPECT_FALSE(sourcemeta::core::is_langtag("a-DE")); +} + +// RFC 5646 Appendix A: invalid, two extensions with the same singleton +TEST(LangTag, rfc_invalid_duplicate_singleton) { + EXPECT_FALSE(sourcemeta::core::is_langtag("ar-a-aaa-b-bbb-a-ccc")); +} + +// RFC 5646 Appendix A: invalid, singleton with no following subtag +TEST(LangTag, rfc_invalid_extension_without_subtag) { + EXPECT_FALSE(sourcemeta::core::is_langtag("tlh-a-b-foo")); +} + +// RFC 5646 Section 2.2.1: primary language of two letters +TEST(LangTag, language_two_letters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en")); +} + +// RFC 5646 Section 2.2.1: primary language of three letters +TEST(LangTag, language_three_letters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("mas")); +} + +// RFC 5646 Section 2.2.1: four-letter reserved primary language +TEST(LangTag, language_four_letters_reserved) { + EXPECT_TRUE(sourcemeta::core::is_langtag("aaaa")); +} + +// RFC 5646 Section 2.2.1: registered primary language of five to eight letters +TEST(LangTag, language_five_to_eight_letters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("abcde")); + EXPECT_TRUE(sourcemeta::core::is_langtag("abcdefgh")); +} + +// RFC 5646 Section 2.2.1: a single-letter primary language is not allowed +TEST(LangTag, language_one_letter_rejected) { + 
EXPECT_FALSE(sourcemeta::core::is_langtag("a")); +} + +// RFC 5646 Section 2.2.1: a primary language longer than eight letters is not +// allowed +TEST(LangTag, language_nine_letters_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("abcdefghi")); +} + +// RFC 5646 Section 2.2.1: the primary language is letters only +TEST(LangTag, language_with_digit_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("e2")); + EXPECT_FALSE(sourcemeta::core::is_langtag("12")); +} + +// RFC 5646 Section 2.2.2: a single extended language subtag +TEST(LangTag, single_extlang) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-yue")); +} + +// RFC 5646 Section 2.2.2: up to three extended language subtags +TEST(LangTag, three_extlangs) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-aaa-bbb-ccc")); +} + +// RFC 5646 Section 2.2.2: more than three extended language subtags is not +// allowed +TEST(LangTag, four_extlangs_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("zh-aaa-bbb-ccc-ddd")); +} + +// RFC 5646 Section 2.2.2: extended language subtags only follow a two or three +// letter primary language +TEST(LangTag, extlang_only_after_short_primary) { + // A four-letter or longer primary language cannot take an extended language + EXPECT_FALSE(sourcemeta::core::is_langtag("aaaa-bbb")); + EXPECT_FALSE(sourcemeta::core::is_langtag("abcde-fgh")); +} + +// RFC 5646 Section 2.2.3: a script subtag after an extended language +TEST(LangTag, script_after_extlang) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-yue-Hant-CN")); +} + +// RFC 5646 Section 2.2.3: at most one script subtag +TEST(LangTag, two_scripts_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("zh-Hant-Latn")); +} + +// RFC 5646 Section 2.2.4: an alphabetic region subtag +TEST(LangTag, region_alpha) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-GB")); +} + +// RFC 5646 Section 2.2.4: a numeric region subtag +TEST(LangTag, region_numeric) { + EXPECT_TRUE(sourcemeta::core::is_langtag("es-005")); +} + +// 
RFC 5646 Section 2.2.4: a numeric region subtag has exactly three digits +TEST(LangTag, two_digit_region_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-12")); +} + +// RFC 5646 Section 2.2.4: at most one region subtag +TEST(LangTag, two_regions_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-US-GB")); +} + +// RFC 5646 Section 2.2.5: a variant of five to eight characters +TEST(LangTag, variant_five_letters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-nedis")); +} + +// RFC 5646 Section 2.2.5: a variant of eight characters +TEST(LangTag, variant_eight_characters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-abcdefgh")); +} + +// RFC 5646 Section 2.2.5: a four-character variant beginning with a digit +TEST(LangTag, digit_led_four_character_variant) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-1994")); + EXPECT_TRUE(sourcemeta::core::is_langtag("de-1234")); +} + +// RFC 5646 Section 2.2.5: a four-character subtag must begin with a digit +TEST(LangTag, four_character_non_digit_led_rejected) { + // A four-character subtag is only a variant when it begins with a digit + EXPECT_FALSE(sourcemeta::core::is_langtag("de-ab12")); +} + +// RFC 5646 Section 2.2.5: several variant subtags in a row +TEST(LangTag, multiple_variants) { + EXPECT_TRUE(sourcemeta::core::is_langtag("sl-rozaj-biske-1994")); +} + +// RFC 5646 Section 2.2.5: the same variant must not appear twice +TEST(LangTag, duplicate_variant_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("de-1994-1994")); + EXPECT_FALSE(sourcemeta::core::is_langtag("sl-rozaj-rozaj")); +} + +// RFC 5646 Section 2.2.5: variant duplication is detected case insensitively +TEST(LangTag, duplicate_variant_case_insensitive_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("sl-rozaj-ROZAJ")); +} + +// RFC 5646 Section 2.2.6: a digit singleton introduces an extension +TEST(LangTag, digit_singleton_extension) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-0-bbb")); +} + +// RFC 5646 Section 
2.2.6: an extension may carry several subtags +TEST(LangTag, extension_multiple_subtags) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-bbb-ccc-ddd")); +} + +// RFC 5646 Section 2.2.6: several extensions with different singletons +TEST(LangTag, multiple_extensions) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-bbb-c-ddd")); +} + +// RFC 5646 Section 2.2.6: a singleton must be followed by a subtag +TEST(LangTag, bare_extension_singleton_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a")); +} + +// RFC 5646 Section 2.2.6: a singleton cannot be followed by another singleton +TEST(LangTag, adjacent_singletons_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-b")); +} + +// RFC 5646 Section 2.2.6: an extension subtag is at most eight characters +TEST(LangTag, extension_subtag_too_long_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-123456789")); +} + +// RFC 5646 Section 2.2.6: an extension subtag is at least two characters +TEST(LangTag, extension_subtag_single_character_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-b-ccc")); +} + +// RFC 5646 Section 2.2.6: a singleton must not appear more than once +TEST(LangTag, duplicate_singleton_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-bbb-a-ccc")); +} + +// RFC 5646 Section 2.2.6: singleton duplication is detected case insensitively +TEST(LangTag, duplicate_singleton_case_insensitive_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-bbb-A-ccc")); +} + +// RFC 5646 Section 2.2.7: a private use tag with a single subtag +TEST(LangTag, private_use_only_single_subtag) { + EXPECT_TRUE(sourcemeta::core::is_langtag("x-foo")); +} + +// RFC 5646 Section 2.2.7: a private use tag with several subtags +TEST(LangTag, private_use_only_multiple_subtags) { + EXPECT_TRUE(sourcemeta::core::is_langtag("x-a-b-c")); +} + +// RFC 5646 Section 2.2.7: a private use subtag of a single character +TEST(LangTag, private_use_single_character_subtag) { + 
EXPECT_TRUE(sourcemeta::core::is_langtag("x-a")); +} + +// RFC 5646 Section 2.2.7: the private use singleton is case insensitive +TEST(LangTag, private_use_uppercase_singleton) { + EXPECT_TRUE(sourcemeta::core::is_langtag("X-foo")); +} + +// RFC 5646 Section 2.2.7: a private use sequence after an extension +TEST(LangTag, private_use_after_extension) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-bb-x-cc")); +} + +// RFC 5646 Section 2.2.7: a private use singleton must be followed by a subtag +TEST(LangTag, bare_private_use_singleton_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("x")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en-x")); +} + +// RFC 5646 Section 2.2.7: a private use subtag is at most eight characters +TEST(LangTag, private_use_subtag_too_long_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("x-123456789")); +} + +// RFC 5646 Section 2.2.8: the irregular grandfathered tags +TEST(LangTag, grandfathered_irregular) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-GB-oed")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-ami")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-bnn")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-default")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-enochian")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-hak")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-klingon")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-lux")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-mingo")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-navajo")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-pwn")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-tao")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-tay")); + EXPECT_TRUE(sourcemeta::core::is_langtag("i-tsu")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sgn-BE-FR")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sgn-BE-NL")); + EXPECT_TRUE(sourcemeta::core::is_langtag("sgn-CH-DE")); +} + +// RFC 5646 Section 2.2.8: irregular grandfathered tags are case 
insensitive +TEST(LangTag, grandfathered_irregular_case_insensitive) { + EXPECT_TRUE(sourcemeta::core::is_langtag("I-AMI")); + EXPECT_TRUE(sourcemeta::core::is_langtag("EN-gb-OED")); +} + +// RFC 5646 Section 2.2.8: the regular grandfathered tags match the langtag +// grammar +TEST(LangTag, grandfathered_regular_via_grammar) { + EXPECT_TRUE(sourcemeta::core::is_langtag("art-lojban")); + EXPECT_TRUE(sourcemeta::core::is_langtag("cel-gaulish")); + EXPECT_TRUE(sourcemeta::core::is_langtag("no-bok")); + EXPECT_TRUE(sourcemeta::core::is_langtag("no-nyn")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-guoyu")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-hakka")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-min")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-min-nan")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-xiang")); +} + +// RFC 5646 Section 2.1.1: language tags are not case sensitive +TEST(LangTag, case_insensitive) { + EXPECT_TRUE(sourcemeta::core::is_langtag("EN-us")); + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-hant-hk")); + EXPECT_TRUE(sourcemeta::core::is_langtag("DE-ch-1901")); +} + +// RFC 5646 Section 2.2.9: well-formedness does not require registered subtags +TEST(LangTag, well_formed_but_not_valid) { + // Unregistered subtags are still well-formed + EXPECT_TRUE(sourcemeta::core::is_langtag("zz-Zzzz")); + EXPECT_TRUE(sourcemeta::core::is_langtag("qaa")); +} + +// RFC 5646 Section 2.1: the empty string is not a language tag +TEST(LangTag, empty) { EXPECT_FALSE(sourcemeta::core::is_langtag("")); } + +// RFC 5646 Section 2.1: a lone separator is not a language tag +TEST(LangTag, only_hyphen) { EXPECT_FALSE(sourcemeta::core::is_langtag("-")); } + +// RFC 5646 Section 2.1: a tag must not begin with a separator +TEST(LangTag, leading_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_langtag("-en")); +} + +// RFC 5646 Section 2.1: a tag must not end with a separator +TEST(LangTag, trailing_hyphen) { + 
EXPECT_FALSE(sourcemeta::core::is_langtag("en-")); +} + +// RFC 5646 Section 2.1: subtags must not be empty +TEST(LangTag, double_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en--US")); +} + +// RFC 5646 Section 2.1: subtags are alphanumeric only +TEST(LangTag, non_alphanumeric_character) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en_US")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en-US!")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en.US")); +} + +// RFC 5646 Section 2.1: whitespace is not part of the grammar +TEST(LangTag, whitespace) { + EXPECT_FALSE(sourcemeta::core::is_langtag(" en")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en ")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en US")); +} From 6eb9721b0860cf1d563da272137ab5fc4f10269c Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Tue, 23 Jun 2026 21:35:01 -0400 Subject: [PATCH 2/2] More Signed-off-by: Juan Cruz Viotti --- test/langtag/langtag_test.cc | 64 ++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/test/langtag/langtag_test.cc b/test/langtag/langtag_test.cc index 07e6d5125..d88f72c0b 100644 --- a/test/langtag/langtag_test.cc +++ b/test/langtag/langtag_test.cc @@ -117,12 +117,23 @@ TEST(LangTag, language_four_letters_reserved) { EXPECT_TRUE(sourcemeta::core::is_langtag("aaaa")); } +// RFC 5646 Section 2.2.1: a four-letter primary language takes no extended +// language but still admits a script and region +TEST(LangTag, language_four_letters_with_script_region) { + EXPECT_TRUE(sourcemeta::core::is_langtag("aaaa-Latn-US")); +} + // RFC 5646 Section 2.2.1: registered primary language of five to eight letters TEST(LangTag, language_five_to_eight_letters) { EXPECT_TRUE(sourcemeta::core::is_langtag("abcde")); EXPECT_TRUE(sourcemeta::core::is_langtag("abcdefgh")); } +// RFC 5646 Section 2.2.1: a long primary language still admits a region +TEST(LangTag, language_five_to_eight_letters_with_region) { + 
EXPECT_TRUE(sourcemeta::core::is_langtag("abcde-US")); +} + // RFC 5646 Section 2.2.1: a single-letter primary language is not allowed TEST(LangTag, language_one_letter_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("a")); @@ -169,6 +180,12 @@ TEST(LangTag, script_after_extlang) { EXPECT_TRUE(sourcemeta::core::is_langtag("zh-yue-Hant-CN")); } +// RFC 5646 Section 2.1: extlang, script, region, variant, extension and +// private use all present in a single tag +TEST(LangTag, full_langtag) { + EXPECT_TRUE(sourcemeta::core::is_langtag("zh-yue-Hant-CN-1994-a-bbb-x-foo")); +} + // RFC 5646 Section 2.2.3: at most one script subtag TEST(LangTag, two_scripts_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("zh-Hant-Latn")); @@ -210,6 +227,12 @@ TEST(LangTag, digit_led_four_character_variant) { EXPECT_TRUE(sourcemeta::core::is_langtag("de-1234")); } +// RFC 5646 Section 2.2.5: a digit-led four-character variant may mix letters +TEST(LangTag, digit_led_four_character_variant_with_letters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("de-1abc")); + EXPECT_TRUE(sourcemeta::core::is_langtag("de-1a2b")); +} + // RFC 5646 Section 2.2.5: a four-character subtag must begin with a digit TEST(LangTag, four_character_non_digit_led_rejected) { // A four-character subtag is only a variant when it begins with a digit @@ -247,6 +270,14 @@ TEST(LangTag, multiple_extensions) { EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-bbb-c-ddd")); } +// RFC 5646 Section 2.2.6: every letter except "x" is a valid singleton, +// including the letters bordering the reserved "x" +TEST(LangTag, extension_singleton_letter_boundaries) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-w-aa")); + EXPECT_TRUE(sourcemeta::core::is_langtag("en-y-aa")); + EXPECT_TRUE(sourcemeta::core::is_langtag("en-z-aa")); +} + // RFC 5646 Section 2.2.6: a singleton must be followed by a subtag TEST(LangTag, bare_extension_singleton_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("en-a")); @@ -262,6 +293,11 @@ 
TEST(LangTag, extension_subtag_too_long_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-123456789")); } +// RFC 5646 Section 2.2.6: an eight-character extension subtag is allowed +TEST(LangTag, extension_subtag_eight_characters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-abcd1234")); +} + // RFC 5646 Section 2.2.6: an extension subtag is at least two characters TEST(LangTag, extension_subtag_single_character_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-b-ccc")); @@ -277,6 +313,11 @@ TEST(LangTag, duplicate_singleton_case_insensitive_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("en-a-bbb-A-ccc")); } +// RFC 5646 Section 2.2.6: a digit singleton must not appear more than once +TEST(LangTag, duplicate_digit_singleton_rejected) { + EXPECT_FALSE(sourcemeta::core::is_langtag("en-0-aaa-0-bbb")); +} + // RFC 5646 Section 2.2.7: a private use tag with a single subtag TEST(LangTag, private_use_only_single_subtag) { EXPECT_TRUE(sourcemeta::core::is_langtag("x-foo")); @@ -292,11 +333,23 @@ TEST(LangTag, private_use_single_character_subtag) { EXPECT_TRUE(sourcemeta::core::is_langtag("x-a")); } +// RFC 5646 Section 2.2.7: a single-character private use subtag is allowed +// after a language, where an extension subtag could never be that short +TEST(LangTag, private_use_single_character_subtag_after_language) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-x-c")); +} + // RFC 5646 Section 2.2.7: the private use singleton is case insensitive TEST(LangTag, private_use_uppercase_singleton) { EXPECT_TRUE(sourcemeta::core::is_langtag("X-foo")); } +// RFC 5646 Section 2.2.7: an uppercase "X" introduces private use after a +// language and is never treated as an extension singleton +TEST(LangTag, private_use_uppercase_singleton_after_language) { + EXPECT_TRUE(sourcemeta::core::is_langtag("en-X-foo")); +} + // RFC 5646 Section 2.2.7: a private use sequence after an extension TEST(LangTag, private_use_after_extension) { 
EXPECT_TRUE(sourcemeta::core::is_langtag("en-a-bb-x-cc")); @@ -313,6 +366,11 @@ TEST(LangTag, private_use_subtag_too_long_rejected) { EXPECT_FALSE(sourcemeta::core::is_langtag("x-123456789")); } +// RFC 5646 Section 2.2.7: an eight-character private use subtag is allowed +TEST(LangTag, private_use_subtag_eight_characters) { + EXPECT_TRUE(sourcemeta::core::is_langtag("x-abcd1234")); +} + // RFC 5646 Section 2.2.8: the irregular grandfathered tags TEST(LangTag, grandfathered_irregular) { EXPECT_TRUE(sourcemeta::core::is_langtag("en-GB-oed")); @@ -396,6 +454,12 @@ TEST(LangTag, non_alphanumeric_character) { EXPECT_FALSE(sourcemeta::core::is_langtag("en.US")); } +// RFC 5646 Section 2.1: subtags are limited to ASCII letters and digits +TEST(LangTag, non_ascii_character) { + EXPECT_FALSE(sourcemeta::core::is_langtag("én")); + EXPECT_FALSE(sourcemeta::core::is_langtag("en-café")); +} + // RFC 5646 Section 2.1: whitespace is not part of the grammar TEST(LangTag, whitespace) { EXPECT_FALSE(sourcemeta::core::is_langtag(" en"));