diff --git a/config.cmake.in b/config.cmake.in index 16d87fed7..cd58347e2 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -84,12 +84,14 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_punycode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_idna.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_text.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake") elseif(component STREQUAL "email") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_punycode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_idna.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_text.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_email.cmake") elseif(component STREQUAL "uri") @@ -184,6 +186,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_jose.cmake") elseif(component STREQUAL "semver") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_preprocessor.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_text.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_semver.cmake") elseif(component STREQUAL "gzip") find_dependency(LibDeflate CONFIG) diff --git a/src/core/dns/CMakeLists.txt b/src/core/dns/CMakeLists.txt index 1154d4114..3368ace46 100644 --- a/src/core/dns/CMakeLists.txt +++ b/src/core/dns/CMakeLists.txt @@ -8,4 +8,5 @@ endif() target_link_libraries(sourcemeta_core_dns PRIVATE sourcemeta::core::unicode PRIVATE sourcemeta::core::punycode - PRIVATE sourcemeta::core::idna) + PRIVATE sourcemeta::core::idna + PRIVATE sourcemeta::core::text) diff --git a/src/core/dns/hostname.cc b/src/core/dns/hostname.cc index 4a980f7e2..31aa097c4 100644 --- a/src/core/dns/hostname.cc +++ b/src/core/dns/hostname.cc @@ -1,19 +1,12 @@ #include #include +#include #include // std::string #include // std::string_view namespace sourcemeta::core { -// RFC 952 §B: let-dig = ALPHA / DIGIT -// RFC 1123 §2.1: first character of a label is letter or digit -static constexpr auto is_let_dig(const char character) -> bool { - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z') || - (character >= '0' && character <= '9'); -} - auto is_hostname(const std::string_view value) -> bool { // RFC 952 §B: requires at least one if (value.empty()) { @@ -46,7 +39,8 @@ auto is_hostname(const std::string_view value) -> bool { continue; } - if (is_let_dig(character)) { + // RFC 952 §B: let-dig = ALPHA / DIGIT + if (is_alphanum(character)) { last_was_hyphen = false; position += 1; label_has_content = true; diff --git a/src/core/email/CMakeLists.txt b/src/core/email/CMakeLists.txt index 22828bcf3..dcbc083e9 100644 --- a/src/core/email/CMakeLists.txt +++ b/src/core/email/CMakeLists.txt @@ -11,3 +11,5 @@ target_link_libraries(sourcemeta_core_email PRIVATE sourcemeta::core::ip) target_link_libraries(sourcemeta_core_email PRIVATE sourcemeta::core::unicode) +target_link_libraries(sourcemeta_core_email + PRIVATE sourcemeta::core::text) diff --git a/src/core/email/helpers.h b/src/core/email/helpers.h index dd93f5bf4..0a37ae526 100644 --- a/src/core/email/helpers.h +++ b/src/core/email/helpers.h @@ -2,6 +2,7 @@ #define SOURCEMETA_CORE_EMAIL_HELPERS_H_ #include +#include #include // std::uint8_t, std::uint16_t #include // std::string_view @@ -34,9 +35,7 @@ inline constexpr auto is_atext(const char character) -> bool { case '~': return true; default: - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z') || - (character >= '0' && character <= '9'); + return sourcemeta::core::is_alphanum(character); } } @@ -47,13 +46,6 @@ inline constexpr auto is_qtext_smtp(const unsigned char character) -> bool { (character >= 93 && character <= 126); } -// RFC 5321 §4.1.2: Let-dig = ALPHA / DIGIT -inline constexpr auto is_let_dig(const char character) -> bool { - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z') || - (character >= '0' && character <= '9'); -} - // RFC 5321 §4.1.3: dcontent = %d33-90 / %d94-126 inline constexpr auto is_dcontent(const unsigned char character) -> bool { return (character >= 33 && character <= 90) || @@ -63,13 +55,13 @@ inline constexpr auto is_dcontent(const unsigned char character) -> bool { // RFC 5321 §4.1.2: Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig // RFC 5321 §4.1.3: Standardized-tag = Ldh-str inline constexpr auto is_ldh_str(const std::string_view value) -> bool { - if (value.empty() || !is_let_dig(value.back())) { + if (value.empty() || !sourcemeta::core::is_alphanum(value.back())) { return false; } for (std::string_view::size_type position{0}; position + 1 < value.size(); position += 1) { const auto character{value[position]}; - if (!is_let_dig(character) && character != '-') { + if (!sourcemeta::core::is_alphanum(character) && character != '-') { return false; } } diff --git a/src/core/semver/CMakeLists.txt b/src/core/semver/CMakeLists.txt index 3141a6695..edf268a24 100644 --- a/src/core/semver/CMakeLists.txt +++ b/src/core/semver/CMakeLists.txt @@ -3,6 +3,8 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME semver SOURCES semver.cc) target_link_libraries(sourcemeta_core_semver PUBLIC sourcemeta::core::preprocessor) +target_link_libraries(sourcemeta_core_semver + PRIVATE sourcemeta::core::text) if(SOURCEMETA_CORE_INSTALL) sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME semver) diff --git a/src/core/semver/semver.cc b/src/core/semver/semver.cc index 9e30ae9c2..d40013e6c 100644 --- a/src/core/semver/semver.cc +++ b/src/core/semver/semver.cc @@ -1,4 +1,5 @@ #include +#include #include // std::array #include // std::to_chars @@ -8,17 +9,8 @@ namespace { -auto is_digit(const char character) -> bool { - return character >= '0' && character <= '9'; -} - -auto is_letter(const char character) -> bool { - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z'); -} - auto is_identifier_character(const char character) -> bool { - return is_digit(character) || is_letter(character) || character == '-'; + return sourcemeta::core::is_alphanum(character) || character == '-'; } constexpr auto UINT64_MAX_VALUE = std::numeric_limits::max(); @@ -30,17 +22,19 @@ enum class NumericParseResult : std::uint8_t { success, invalid, overflow }; auto parse_numeric_identifier(const std::string_view input, std::size_t &position, std::uint64_t &result) -> NumericParseResult { - if (position >= input.size() || !is_digit(input[position])) { + if (position >= input.size() || + !sourcemeta::core::is_digit(input[position])) { return NumericParseResult::invalid; } if (input[position] == '0' && position + 1 < input.size() && - is_digit(input[position + 1])) { + sourcemeta::core::is_digit(input[position + 1])) { return NumericParseResult::invalid; } std::uint64_t value = 0; - while (position < input.size() && is_digit(input[position])) { + while (position < input.size() && + sourcemeta::core::is_digit(input[position])) { const auto digit = static_cast(input[position] - '0'); if (value > UINT64_MAX_DIV_10 || (value == UINT64_MAX_DIV_10 && digit > UINT64_MAX_MOD_10)) { @@ -67,7 +61,7 @@ auto validate_pre_release_identifier(const std::string_view identifier) return false; } - if (!is_digit(character)) { + if (!sourcemeta::core::is_digit(character)) { has_non_digit = true; } } @@ -130,7 +124,7 @@ auto classify_identifier(const std::string_view identifier) noexcept -> IdentifierInfo { std::uint64_t value = 0; for (const auto character : identifier) { - if (!is_digit(character)) { + if (!sourcemeta::core::is_digit(character)) { return {.is_numeric = false, .overflowed = false, .numeric_value = 0}; } diff --git a/src/core/uri/grammar.h b/src/core/uri/grammar.h index a329cc955..1c36df336 100644 --- a/src/core/uri/grammar.h +++ b/src/core/uri/grammar.h @@ -1,7 +1,7 @@ #ifndef SOURCEMETA_CORE_URI_GRAMMAR_H_ #define SOURCEMETA_CORE_URI_GRAMMAR_H_ -#include // std::isalnum, std::isalpha, std::isdigit +#include namespace sourcemeta::core { @@ -50,7 +50,7 @@ constexpr char URI_PERCENT = '%'; // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" // See https://www.rfc-editor.org/rfc/rfc3986#section-2.3 inline auto uri_is_unreserved(const char character) -> bool { - if (std::isalnum(static_cast(character))) { + if (is_alphanum(character)) { return true; } @@ -89,7 +89,7 @@ inline auto uri_is_sub_delim(const char character) -> bool { // Scheme characters: ALPHA / DIGIT / "+" / "-" / "." // See https://www.rfc-editor.org/rfc/rfc3986#section-3.1 inline auto uri_is_scheme_char(const char character) -> bool { - if (std::isalnum(static_cast(character))) { + if (is_alphanum(character)) { return true; } diff --git a/src/lang/text/include/sourcemeta/core/text.h b/src/lang/text/include/sourcemeta/core/text.h index a5dab4ff8..e137a0297 100644 --- a/src/lang/text/include/sourcemeta/core/text.h +++ b/src/lang/text/include/sourcemeta/core/text.h @@ -169,6 +169,69 @@ inline auto is_lowercase(const String &value) noexcept -> bool { SOURCEMETA_CORE_TEXT_EXPORT auto is_lowercase(const std::filesystem::path &value) noexcept -> bool; +/// @ingroup text +/// +/// Return whether a character is an ASCII letter (A-Z or a-z). For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_alpha('a')); +/// assert(sourcemeta::core::is_alpha('Z')); +/// assert(!sourcemeta::core::is_alpha('5')); +/// ``` +template + requires std::same_as || + std::same_as || + std::same_as || + std::same_as +inline constexpr auto is_alpha(const Character character) noexcept -> bool { + return (character >= 'a' && character <= 'z') || + (character >= 'A' && character <= 'Z'); +} + +/// @ingroup text +/// +/// Return whether a character is an ASCII digit (0-9). For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_digit('5')); +/// assert(!sourcemeta::core::is_digit('a')); +/// ``` +template + requires std::same_as || + std::same_as || + std::same_as || + std::same_as +inline constexpr auto is_digit(const Character character) noexcept -> bool { + return character >= '0' && character <= '9'; +} + +/// @ingroup text +/// +/// Return whether a character is an ASCII letter or digit. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_alphanum('a')); +/// assert(sourcemeta::core::is_alphanum('5')); +/// assert(!sourcemeta::core::is_alphanum('-')); +/// ``` +template + requires std::same_as || + std::same_as || + std::same_as || + std::same_as +inline constexpr auto is_alphanum(const Character character) noexcept -> bool { + return is_alpha(character) || is_digit(character); +} + /// @ingroup text /// /// Truncate a string in place to at most `maximum_length` bytes, appending diff --git a/test/text/CMakeLists.txt b/test/text/CMakeLists.txt index c8deaf3dc..05ce054ce 100644 --- a/test/text/CMakeLists.txt +++ b/test/text/CMakeLists.txt @@ -1,6 +1,7 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME text SOURCES text_to_title_case_test.cc text_to_lowercase_test.cc text_is_lowercase_test.cc + text_is_alpha_test.cc text_is_digit_test.cc text_is_alphanum_test.cc text_truncate_test.cc text_remove_suffix_ignore_case_test.cc text_equals_ignore_case_test.cc text_trim_test.cc text_take_until_test.cc text_split_once_test.cc diff --git a/test/text/text_is_alpha_test.cc b/test/text/text_is_alpha_test.cc new file mode 100644 index 000000000..e578702a5 --- /dev/null +++ b/test/text/text_is_alpha_test.cc @@ -0,0 +1,34 @@ +#include + +#include + +TEST(Text_is_alpha, lowercase_letters) { + EXPECT_TRUE(sourcemeta::core::is_alpha('a')); + EXPECT_TRUE(sourcemeta::core::is_alpha('m')); + EXPECT_TRUE(sourcemeta::core::is_alpha('z')); +} + +TEST(Text_is_alpha, uppercase_letters) { + EXPECT_TRUE(sourcemeta::core::is_alpha('A')); + EXPECT_TRUE(sourcemeta::core::is_alpha('M')); + EXPECT_TRUE(sourcemeta::core::is_alpha('Z')); +} + +TEST(Text_is_alpha, digits) { + EXPECT_FALSE(sourcemeta::core::is_alpha('0')); + EXPECT_FALSE(sourcemeta::core::is_alpha('9')); +} + +TEST(Text_is_alpha, punctuation) { + EXPECT_FALSE(sourcemeta::core::is_alpha('-')); + EXPECT_FALSE(sourcemeta::core::is_alpha('_')); + EXPECT_FALSE(sourcemeta::core::is_alpha(' ')); +} + +TEST(Text_is_alpha, ascii_boundaries) { + // The ASCII characters immediately outside the two letter ranges + EXPECT_FALSE(sourcemeta::core::is_alpha('@')); + EXPECT_FALSE(sourcemeta::core::is_alpha('[')); + EXPECT_FALSE(sourcemeta::core::is_alpha('`')); + EXPECT_FALSE(sourcemeta::core::is_alpha('{')); +} diff --git a/test/text/text_is_alphanum_test.cc b/test/text/text_is_alphanum_test.cc new file mode 100644 index 000000000..9b6edb3f1 --- /dev/null +++ b/test/text/text_is_alphanum_test.cc @@ -0,0 +1,20 @@ +#include + +#include + +TEST(Text_is_alphanum, letters) { + EXPECT_TRUE(sourcemeta::core::is_alphanum('a')); + EXPECT_TRUE(sourcemeta::core::is_alphanum('Z')); +} + +TEST(Text_is_alphanum, digits) { + EXPECT_TRUE(sourcemeta::core::is_alphanum('0')); + EXPECT_TRUE(sourcemeta::core::is_alphanum('9')); +} + +TEST(Text_is_alphanum, punctuation) { + EXPECT_FALSE(sourcemeta::core::is_alphanum('-')); + EXPECT_FALSE(sourcemeta::core::is_alphanum('_')); + EXPECT_FALSE(sourcemeta::core::is_alphanum('.')); + EXPECT_FALSE(sourcemeta::core::is_alphanum(' ')); +} diff --git a/test/text/text_is_digit_test.cc b/test/text/text_is_digit_test.cc new file mode 100644 index 000000000..c2ebdb50e --- /dev/null +++ b/test/text/text_is_digit_test.cc @@ -0,0 +1,25 @@ +#include + +#include + +TEST(Text_is_digit, digits) { + EXPECT_TRUE(sourcemeta::core::is_digit('0')); + EXPECT_TRUE(sourcemeta::core::is_digit('5')); + EXPECT_TRUE(sourcemeta::core::is_digit('9')); +} + +TEST(Text_is_digit, letters) { + EXPECT_FALSE(sourcemeta::core::is_digit('a')); + EXPECT_FALSE(sourcemeta::core::is_digit('Z')); +} + +TEST(Text_is_digit, punctuation) { + EXPECT_FALSE(sourcemeta::core::is_digit('-')); + EXPECT_FALSE(sourcemeta::core::is_digit(' ')); +} + +TEST(Text_is_digit, ascii_boundaries) { + // The ASCII characters immediately outside the digit range + EXPECT_FALSE(sourcemeta::core::is_digit('/')); + EXPECT_FALSE(sourcemeta::core::is_digit(':')); +}