diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 1715ff5ff7..b7ccdecbfc 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -732,14 +732,6 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false; function iconv ($input_encoding ::: string, $output_encoding ::: string, $input_str ::: string) ::: string | false; -function mb_check_encoding ($str ::: string, $encoding ::: string = "1251") ::: bool; -function mb_strlen ($str ::: string, $encoding ::: string = "1251") ::: int; -function mb_strpos ($haystack ::: string, $needle ::: string, $offset ::: int = 0, $encoding ::: string = "1251") ::: int | false; -function mb_stripos ($haystack ::: string, $needle ::: string, $offset ::: int = 0, $encoding ::: string = "1251") ::: int | false; -function mb_strtolower ($str ::: string, $encoding ::: string = "1251") ::: string; -function mb_strtoupper ($str ::: string, $encoding ::: string = "1251") ::: string; -function mb_substr ($str ::: string, $start ::: int, $length ::: mixed = PHP_INT_MAX, $encoding ::: string = "1251") ::: string; - define('PHP_ROUND_HALF_UP', 123423141); define('PHP_ROUND_HALF_DOWN', 123423144); define('PHP_ROUND_HALF_EVEN', 123423145); @@ -1624,3 +1616,63 @@ class DateTimeImmutable implements DateTimeInterface { } function getenv(string $varname = '', bool $local_only = false): mixed; + +function mb_check_encoding(array|string $value, ?string $encoding = null): bool; +function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_strlen(string $string, ?string $encoding = null): int; +function mb_strpos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_stripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strtolower(string $string, ?string $encoding = null): string; 
+function mb_strtoupper(string $string, ?string $encoding = null): string; +function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; +function mb_chr(int $codepoint, ?string $encoding = null): string|false; +function mb_convert_case(string $string, int $mode, ?string $encoding = null): string; +function mb_convert_kana(string $string, string $mode = "KV", ?string $encoding = null): string; +function mb_convert_variables(string $to_encoding, array|string $from_encoding, mixed &$vars): string|false; // ??? (change variable bytes + kwargs) +function mb_decode_mimeheader(string $string): string; +function mb_decode_numericentity(string $string, array $map, ?string $encoding = null): string; +function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false; +function mb_detect_order(array|string|null $encoding = null): mixed; // return array|bool +function mb_encode_mimeheader(string $string, ?string $charset = null, ?string $transfer_encoding = null, string $newline = "\r\n", int $indent = 0): string; +function mb_encode_numericentity(string $string, array $map, ?string $encoding = null, bool $hex = false): string; +function mb_encoding_aliases(string $encoding): array; +function mb_ereg_match(string $pattern, string $string, ?string $options = null): bool; +function mb_ereg_replace_callback(string $pattern, callable $callback, string $string, ?string $options = null): string|false|null; +function mb_ereg_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; +function mb_ereg_search_getpos(): int; +function mb_ereg_search_getregs(): array|false; +function mb_ereg_search_init(string $string, ?string $pattern = null, ?string $options = null): bool; +function mb_ereg_search_pos(?string $pattern = null, ?string $options = null): array|false; +function mb_ereg_search_regs(?string $pattern = null, ?string $options = null): 
array|false; +function mb_ereg_search_setpos(int $offset): bool; +function mb_ereg_search(?string $pattern = null, ?string $options = null): bool; +function mb_ereg(string $pattern, string $string, array &$matches = null): bool; +function mb_eregi_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; +function mb_eregi(string $pattern, string $string, array &$matches = null): bool; +function mb_get_info(string $type = "all"): array|string|int|false; +function mb_http_input(?string $type = null): array|string|false; +function mb_http_output(?string $encoding = null): string|false; +function mb_internal_encoding(?string $encoding = null): string|false; +function mb_language(?string $language = null): string|false; +function mb_list_encodings(): array; +function mb_ord(string $string, ?string $encoding = null): int|false; +function mb_output_handler(string $string, int $status): string; +function mb_parse_str(string $string, array &$result): bool; +function mb_preferred_mime_name(string $encoding): string|false; +function mb_regex_encoding(?string $encoding = null): string|false; +function mb_regex_set_options(?string $options = null): string; +function mb_scrub(string $string, ?string $encoding = null): string; +function mb_send_mail(string $to, string $subject, string $message, array|string $additional_headers = [], ?string $additional_params = null): bool; +function mb_split(string $pattern, string $string, int $limit = -1): array|false; +function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; +function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; +function mb_strimwidth(string $string, int $start, int $width, string $trim_marker = "", ?string $encoding = null): string; +function mb_stristr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strrchr(string $haystack, 
string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strrichr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strrpos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strwidth(string $string, ?string $encoding = null): int; +function mb_substitute_character(string|int|null $substitute_character = null): string|int|false; +function mb_substr_count(string $haystack, string $needle, ?string $encoding = null): int; \ No newline at end of file diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 44a6734f28..cac16d83af 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -1,5 +1,7 @@ option(DOWNLOAD_MISSING_LIBRARIES "download and build missing libraries if needed" OFF) +option(MBFL "download and build libmbfl" OFF) cmake_print_variables(DOWNLOAD_MISSING_LIBRARIES) +cmake_print_variables(MBFL) function(handle_missing_library LIB_NAME) message(STATUS "------${LIB_NAME}---------") if(DOWNLOAD_MISSING_LIBRARIES) @@ -9,6 +11,33 @@ function(handle_missing_library LIB_NAME) endif() endfunction() +find_library(KPHP_TIMELIB kphp-timelib) +if(KPHP_TIMELIB) + add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) +else() + handle_missing_library("kphp-timelib") + FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) + message(STATUS "---------------------") + FetchContent_MakeAvailable(kphp-timelib) + include_directories(${kphp-timelib_SOURCE_DIR}/include) + add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") + add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) +endif() + 
+if(MBFL) + message(STATUS "MBFL=On, libmbfl will be downloaded and built") + add_compile_options(-DMBFL) + FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) + FetchContent_MakeAvailable(libmbfl) + include_directories(${libmbfl_SOURCE_DIR}/include) + add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") + add_link_options(-L${libmbfl_SOURCE_DIR}/objs) +endif() + +# '-Wno-redundant-move' flag works for C++/ObjC++ but not for C, +# so build C libraries above +add_compile_options(-Wno-redundant-move) + find_package(fmt QUIET) if(NOT fmt_FOUND) handle_missing_library("fmtlib") @@ -41,19 +70,6 @@ if(KPHP_TESTS) endif() endif() -find_library(KPHP_TIMELIB kphp-timelib) -if(KPHP_TIMELIB) - add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) -else() - handle_missing_library("kphp-timelib") - FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) - message(STATUS "---------------------") - FetchContent_MakeAvailable(kphp-timelib) - include_directories(${kphp-timelib_SOURCE_DIR}/include) - add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") - add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) -endif() - if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( diff --git a/cmake/init-compilation-flags.cmake b/cmake/init-compilation-flags.cmake index 5d62ceabe6..5ed14874bb 100644 --- a/cmake/init-compilation-flags.cmake +++ b/cmake/init-compilation-flags.cmake @@ -76,6 +76,7 @@ if (PDO_DRIVER_PGSQL) add_definitions(-DPDO_DRIVER_PGSQL) add_compile_definitions(PDO_DRIVER_PGSQL_VERSION=${PostgreSQL_VERSION}) endif() + cmake_print_variables(PDO_DRIVER_PGSQL) option(KPHP_TESTS "Build the tests" ON) @@ -118,7 +119,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") endif() add_compile_options(-Werror -Wall -Wextra -Wunused-function -Wfloat-conversion -Wno-sign-compare - -Wuninitialized -Wno-redundant-move -Wno-missing-field-initializers) + -Wuninitialized 
-Wno-missing-field-initializers) if(NOT APPLE) check_cxx_compiler_flag(-gz=zlib DEBUG_COMPRESSION_IS_FOUND) diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index 3733401f06..5a9ab2ede7 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -283,6 +283,9 @@ void CompilerSettings::init() { remove_extra_spaces(extra_cxx_flags.value_); std::stringstream ss; + #ifdef MBFL + ss << " -DMBFL "; + #endif ss << extra_cxx_flags.get(); ss << " -iquote" << kphp_src_path.get() << " -iquote " << kphp_src_path.get() << "objs/generated/auto/runtime"; @@ -331,6 +334,11 @@ void CompilerSettings::init() { ld_flags.value_ += " -L /usr/local/lib"; #endif +#ifdef LIBMBFL_LIB_DIR + external_static_libs.emplace_back("libmbfl"); + ld_flags.value_ += " -L" LIBMBFL_LIB_DIR; +#endif + #if defined(__APPLE__) && defined(__arm64__) // for development under M1, manual installation of libucontext is needed // see the docs: https://vkcom.github.io/kphp/kphp-internals/developing-and-extending-kphp/compiling-kphp-from-sources.html diff --git a/runtime/interface.cpp b/runtime/interface.cpp index 3155d73b5d..93522425e8 100644 --- a/runtime/interface.cpp +++ b/runtime/interface.cpp @@ -2379,8 +2379,10 @@ static void free_runtime_libs() { free_kphp_backtrace(); free_migration_php8(); - free_detect_incorrect_encoding_names(); + #ifndef MBFL + free_detect_incorrect_encoding_names(); + #endif vk::singleton::get().reset_buffers(); #ifdef PDO_DRIVER_MYSQL database_drivers::free_mysql_lib(); diff --git a/runtime/mbstring.h b/runtime/mbstring.h deleted file mode 100644 index 9685f4be76..0000000000 --- a/runtime/mbstring.h +++ /dev/null @@ -1,30 +0,0 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2020 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt - -#pragma once - -#include - -#include "runtime/kphp_core.h" -#include "runtime/string_functions.h" - -bool mb_UTF8_check(const char *s); - -bool 
f$mb_check_encoding(const string &str, const string &encoding = CP1251); - -int64_t f$mb_strlen(const string &str, const string &encoding = CP1251); - -string f$mb_strtolower(const string &str, const string &encoding = CP1251); - -string f$mb_strtoupper(const string &str, const string &encoding = CP1251); - -Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; - -Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; - -string f$mb_substr(const string &str, int64_t start, const mixed &length = std::numeric_limits::max(), const string &encoding = CP1251); - -void f$set_detect_incorrect_encoding_names_warning(bool show); - -void free_detect_incorrect_encoding_names(); diff --git a/runtime/mbstring.cpp b/runtime/mbstring/mbstring.cpp similarity index 75% rename from runtime/mbstring.cpp rename to runtime/mbstring/mbstring.cpp index 8fa5a03be6..0e11898e04 100644 --- a/runtime/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -1,8 +1,155 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2020 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt +#include "mbstring.h" -#include "runtime/mbstring.h" +bool mb_UTF8_check(const char *s) { + do { +#define CHECK(condition) if (!(condition)) {return false;} + unsigned int a = (unsigned char)(*s++); + if ((a & 0x80) == 0) { + if (a == 0) { + return true; + } + continue; + } + + CHECK ((a & 0x40) != 0); + + unsigned int b = (unsigned char)(*s++); + CHECK((b & 0xc0) == 0x80); + if ((a & 0x20) == 0) { + CHECK((a & 0x1e) > 0); + continue; + } + + unsigned int c = (unsigned char)(*s++); + CHECK((c & 0xc0) == 0x80); + if ((a & 0x10) == 0) { + int x = (((a & 0x0f) << 6) | (b & 0x20)); + CHECK(x != 0 && x != 0x360);//surrogates + continue; + } + + unsigned int d = (unsigned char)(*s++); + CHECK((d & 0xc0) == 0x80); + if ((a & 0x08) == 0) { + int t = (((a 
& 0x07) << 6) | (b & 0x30)); + CHECK(0 < t && t < 0x110);//end of unicode + continue; + } + + return false; +#undef CHECK + } while (true); + + php_assert (0); +} + +#ifdef MBFL +extern "C" { + #include +} + +mbfl_string *convert_encoding(const char *str, const char *to, const char *from) { + + int len = strlen(str); + enum mbfl_no_encoding from_encoding, to_encoding; + mbfl_buffer_converter *convd = NULL; + mbfl_string _string, result, *ret; + + /* from internal to mbfl */ + from_encoding = mbfl_name2no_encoding(from); + to_encoding = mbfl_name2no_encoding(to); + + /* init buffer mbfl strings */ + mbfl_string_init(&_string); + mbfl_string_init(&result); + _string.no_encoding = from_encoding; + _string.len = len; + _string.val = (unsigned char*)str; + + /* converting */ + convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0); + ret = mbfl_buffer_converter_feed_result(convd, &_string, &result); + mbfl_buffer_converter_delete(convd); + + /* fix converting with multibyte encodings */ + if (len % 2 != 0 && ret->len % 2 == 0 && len < ret->len) { + ret->len++; /* NOTE(review): the next line writes one byte past the length reported by the converter - confirm the buffer has room */ + ret->val[ret->len-1] = 63; /* 63 is '?' in ASCII */ + } + + return ret; /* NOTE(review): ret comes from feed_result(convd, &_string, &result); if that call returns its out-parameter, this is the address of the local `result` and dangles after return - confirm against libmbfl */ +} + +bool check_encoding(const char *value, const char *encoding) { + + /* init buffer mbfl strings */ + mbfl_string _string; + mbfl_string_init(&_string); + _string.val = (unsigned char*)value; + _string.len = strlen((char*)value); + + /* from internal to mbfl */ + const mbfl_encoding *enc = mbfl_name2encoding(encoding); + + /* get all supported encodings */ + const mbfl_encoding **encs = mbfl_get_supported_encodings(); + int len = sizeof(**encs); /* NOTE(review): sizeof(**encs) is the byte size of one mbfl_encoding object, not the number of entries in encs, yet len is passed on as the element count - confirm */ + + /* identify encoding of input string */ + /* Warning! 
String can be represented in different encodings, so a check is needed */ + const mbfl_encoding *i_enc = mbfl_identify_encoding2(&_string, encs, len, 1); + + /* perform conversion: to the identified encoding and back again */ /* NOTE(review): enc and i_enc are dereferenced without a null check - confirm neither lookup can return NULL for unknown or unidentifiable input */ + const char *i_enc_str = (const char*)convert_encoding(value, i_enc->name, enc->name)->val; + const char *enc_str = (const char*)convert_encoding(i_enc_str, enc->name, i_enc->name)->val; + + /* check equality */ + /* Warning! strcmp does not work here, because the encodings differ */ + bool res = true; + for (int i = 0; i < strlen(enc_str); i++) /* NOTE(review): only the first strlen(enc_str) bytes are compared and strlen is re-evaluated on every iteration - confirm this is intended */ + if (enc_str[i] != value[i]) { + res = false; + break; + } + + free((void*)i_enc_str); + free((void*)enc_str); + return res; +} + +// TODO: check for array as value +mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding) { + + if (str.is_string() && from_encoding.is_string()) { + const string &s = str.to_string(); + const string &from = from_encoding.to_string(); + + const char *c_string = s.c_str(); + const char *c_to_encoding = to_encoding.c_str(); + const char *c_from_encoding = from.c_str(); + + /* perform conversion */ + mbfl_string *ret = convert_encoding(c_string, c_to_encoding, c_from_encoding); + string res = string((const char*)ret->val, ret->len); /* NOTE(review): ret->val is copied but not freed here, unlike the buffers freed in check_encoding - possible leak, confirm ownership */ + + /* check if string represents in from_encoding, magic number 63 - '?' 
in ASCII */ + if (!check_encoding(c_string, c_from_encoding)) res = string(strlen(c_string), (char)63); + + return res; + } + return 0; +} + +// TODO: check for optional value +bool f$mb_check_encoding(const mixed &value, const Optional &encoding) { + const string &val = value.to_string(); + const string &enc = encoding.val(); + const char *c_value = val.c_str(); + const char *c_encoding = enc.c_str(); + return check_encoding(c_value, c_encoding); +} + +#else #include "common/unicode/unicode-utils.h" #include "common/unicode/utf8-utils.h" @@ -87,49 +234,6 @@ static int64_t mb_UTF8_get_offset(const char *s, int64_t pos) { return res; } -bool mb_UTF8_check(const char *s) { - do { -#define CHECK(condition) if (!(condition)) {return false;} - unsigned int a = (unsigned char)(*s++); - if ((a & 0x80) == 0) { - if (a == 0) { - return true; - } - continue; - } - - CHECK ((a & 0x40) != 0); - - unsigned int b = (unsigned char)(*s++); - CHECK((b & 0xc0) == 0x80); - if ((a & 0x20) == 0) { - CHECK((a & 0x1e) > 0); - continue; - } - - unsigned int c = (unsigned char)(*s++); - CHECK((c & 0xc0) == 0x80); - if ((a & 0x10) == 0) { - int x = (((a & 0x0f) << 6) | (b & 0x20)); - CHECK(x != 0 && x != 0x360);//surrogates - continue; - } - - unsigned int d = (unsigned char)(*s++); - CHECK((d & 0xc0) == 0x80); - if ((a & 0x08) == 0) { - int t = (((a & 0x07) << 6) | (b & 0x30)); - CHECK(0 < t && t < 0x110);//end of unicode - continue; - } - - return false; -#undef CHECK - } while (true); - - php_assert (0); -} - bool f$mb_check_encoding(const string &str, const string &encoding) { int encoding_num = mb_detect_encoding(encoding); if (encoding_num < 0) { @@ -388,3 +492,5 @@ string f$mb_substr(const string &str, int64_t start, const mixed &length_var, co return {str.c_str() + UTF8_start, static_cast(UTF8_length)}; } + +#endif \ No newline at end of file diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h new file mode 100644 index 0000000000..2647d32163 --- /dev/null +++ 
b/runtime/mbstring/mbstring.h @@ -0,0 +1,744 @@ +#pragma once + +#include "runtime/kphp_core.h" +#include "common/type_traits/function_traits.h" +#include "common/vector-product.h" + +#include "runtime/kphp_core.h" +#include "runtime/math_functions.h" +#include "runtime/string_functions.h" + +bool mb_UTF8_check(const char *s); + +#ifdef MBFL + +/** + * Check if strings are valid for the specified encoding + * Checks if the specified byte stream is valid for the specified encoding. If value is of type array, all keys and values are validated recursively. + * It is useful to prevent so-called "Invalid Encoding Attack". + * @param array|string value The byte stream + * @param ?string encoding (default = null) The expected encoding + * @return bool Returns true on success or false on failure + */ +bool f$mb_check_encoding(const mixed &value, const Optional &encoding); + +/** + * Returns a string containing the character specified by the Unicode code point value, encoded in the specified encoding + * @param int codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. + * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure. + */ +Optional f$mb_chr(const int64_t codepoint, const Optional &encoding); + +/** + * Perform case folding on a string + * @param string str The string being converted + * @param int mode The mode of the conversion. It can be one of MB_CASE_UPPER, MB_CASE_LOWER, MB_CASE_TITLE, MB_CASE_FOLD, + * MB_CASE_UPPER_SIMPLE, MB_CASE_LOWER_SIMPLE, MB_CASE_TITLE_SIMPLE, MB_CASE_FOLD_SIMPLE + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. 
+ * @return string A case folded version of string converted in the way specified by mode + */ +string f$mb_convert_case(const string &str, const int64_t mode, const Optional &encoding); + +/** + * Convert from one character encoding to another + * @param array|string str The string or array to be converted + * @param string to_encoding The desired encoding of the result + * @param array|string|null from_encoding (default = null) The current encoding used to interpret string. + * Multiple encodings may be specified as an array or comma separated list, + * in which case the correct encoding will be guessed using the same algorithm as mb_detect_encoding(). + * If from_encoding is null or not specified, the mbstring.internal_encoding setting will be used if set, otherwise the default_charset setting. + * @return array|string|false The encoded string + */ +mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); + +/** + * Convert "kana" one from another ("zen-kaku", "han-kaku" and more) + * @param string str The string being converted + * @param string mode The conversion option (default = "KV") + * r - Convert "zen-kaku" alphabets to "han-kaku" + * R - Convert "han-kaku" alphabets to "zen-kaku" + * n - Convert "zen-kaku" numbers to "han-kaku" + * N - Convert "han-kaku" numbers to "zen-kaku" + * a - Convert "zen-kaku" alphabets and numbers to "han-kaku" + * A - Convert "han-kaku" alphabets and numbers to "zen-kaku" + * (Characters included in "a", "A" options are U+0021 - U+007E excluding U+0022, U+0027, U+005C, U+007E) + * s - Convert "zen-kaku" space to "han-kaku" (U+3000 -> U+0020) + * S - Convert "han-kaku" space to "zen-kaku" (U+0020 -> U+3000) + * k - Convert "zen-kaku kata-kana" to "han-kaku kata-kana" + * K - Convert "han-kaku kata-kana" to "zen-kaku kata-kana" + * h - Convert "zen-kaku hira-gana" to "han-kaku kata-kana" + * H - Convert "han-kaku kata-kana" to "zen-kaku hira-gana" + * c - Convert "zen-kaku 
kata-kana" to "zen-kaku hira-gana" + * C - Convert "zen-kaku hira-gana" to "zen-kaku kata-kana" + * V - Collapse voiced sound notation and convert them into a character. Use with "K","H" + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used. + * @return string The converted string + */ +string f$mb_convert_kana(const string &str, const string &mode, const Optional &encoding); + +/** + * Convert character code in variable(s) + * @param string to_encoding The encoding that the string is being converted to + * @param array|string from_encoding is specified as an array or comma separated string, it tries to detect encoding from from-coding. + * When from_encoding is omitted, detect_order is used. + * @param mixed &vars References to the variable being converted. String, Array are accepted. mb_convert_variables() assumes + * all parameters have the same encoding. + * @return string|false The character encoding before conversion for success, or false for failure + */ +Optional f$mb_convert_variables(const string &to_encoding, const mixed &from_encoding, const mixed &vars); // ??? + +/** + * Decode string in MIME header field + * @param string str The string being decoded + * @return string The decoded string in internal character encoding + */ +string f$mb_decode_mimeheader(const string &string); + +/** + * Decode HTML numeric string reference to character + * @param string str The string being decoded + * @param array map An array that specifies the code area to convert + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used. 
+ * @return string The converted string + */ +string f$mb_decode_numericentity(const string &str, const array &map, const Optional &encoding); + +/** + * Detect character encoding + * Detects the most likely character encoding for the string str from an ordered list of candidates. Automatic detection of the intended character encoding + * can never be entirely reliable; without some additional information, it is similar to decoding an encrypted string without the key. It is always preferable + * to use an indication of character encoding stored or transmitted with the data, such as a "Content-Type" HTTP header. This function is most useful with + * multibyte encodings, where not all sequences of bytes form a valid string. If the input string contains such a sequence, that encoding will be rejected, + * and the next encoding checked. + * @param string str The string being inspected + * @param array|string|null encodings (default = null) A list of character encodings to try, in order. The list may be specified as an array of strings, + * or a single string separated by commas. If encodings is omitted or null, the current detect_order (set with the mbstring.detect_order configuration option, + * or mb_detect_order() function) will be used. + * @param bool strict (default = false) Controls the behaviour when string is not valid in any of the listed encodings. + * If strict is set to false, the closest matching encoding will be returned; if strict is set to true, false will be returned. + * @return string|false The detected character encoding, or false if the string is not valid in any of the listed encodings. If strict is set to false, + * the closest matching encoding is returned instead of false; if strict is set to true, false is returned. The default value for strict can be set + * with the mbstring.strict_detection configuration option. 
+ */ +Optional f$mb_detect_encoding(const string &str, const mixed &encodings, const bool strict = false); + +/** + * Set/Get character encoding detection order + * @param array|string|null encoding (default = null) encoding is an array or comma separated list of character encoding. See supported encodings. + * If encoding is omitted or null, it returns the current character encoding detection order as array. This setting affects + * mb_detect_encoding() and mb_send_mail(). + * @return array|bool When setting the encoding detection order, true is returned on success or false on failure. + * When getting the encoding detection order, an ordered array of the encodings is returned. + */ +mixed f$mb_detect_order(const mixed &encoding); + +/** + * Encode string for MIME header + * @param string str The string being encoded. Its encoding should be same as mb_internal_encoding() + * @param ?string charset (default = null) Specifies the name of the character set in which string is represented in. + * The default value is determined by the current NLS setting (mbstring.language) + * @param ?string transfer_encoding (default = null) Specifies the scheme of MIME encoding. + * It should be either "B" (Base64) or "Q" (Quoted-Printable). Falls back to "B" if not given. + * @param string newline (default = "\r\n") Specifies the EOL (end-of-line) marker with which mb_encode_mimeheader() performs line-folding + * (a » RFC term, the act of breaking a line longer than a certain length into multiple lines. The length is currently hard-coded to 74 characters). + * Falls back to "\r\n" (CRLF) if not given. 
+ * @param int indent (default = 0) Indentation of the first line (number of characters in the header before string) + * @return string A converted version of the string represented in ASCII + */ +string f$mb_encode_mimeheader(const string &str, const Optional &charset, const Optional &transfer_encoding, const string &newline, const int64_t indent); + +/** + * Encode character to HTML numeric string reference + * Converts specified character codes in the string str from character code to HTML numeric character reference + * @param string str The string being encoded + * @param array map An array that specifies the code area to convert + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value will be used + * @param bool hex (default = false) Whether the returned entity reference should be in hexadecimal notation (otherwise it is in decimal notation) + * @return string The converted string + */ +string f$mb_encode_numericentity(const string &str, const array &map, const Optional &encoding, const bool hex = false); + +/** + * Get aliases of a known encoding type + * @param string encoding The encoding type being checked, for aliases + * @return array Returns a numerically indexed array of encoding aliases + */ +array f$mb_encoding_aliases(const string &encoding); + +/** + * Regular expression match for multibyte string + * @param string pattern The regular expression pattern + * @param string str The string being evaluated + * @param ?string options (default = null) The search option. 
See mb_regex_set_options() for explanation + * @return bool Returns true if string matches the regular expression pattern, false if not + */ +bool f$mb_ereg_match(const string &pattern, const string &str, const Optional &options); + +/** + * Perform a regular expression search and replace with multibyte support using a callback + * Scans string for matches to pattern, then replaces the matched text with the output of callback function. + * The behavior of this function is almost identical to mb_ereg_replace(), except for the fact that instead of replacement parameter, + * one should specify a callback. + * @param string pattern The regular expression pattern. Multibyte characters may be used in pattern. + * @param callable callback A callback that will be called and passed an array of matched elements in the subject string. + * The callback should return the replacement string. You'll often need the callback function for a mb_ereg_replace_callback() in just one place. + * In this case you can use an anonymous function to declare the callback within the call to mb_ereg_replace_callback(). + * By doing it this way you have all information for the call in one place and do not clutter the function namespace with a callback + * function's name not used anywhere else. + * @param string str The string being checked + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return string|false|null The resultant string on success, or false on error. If string is not valid for the current encoding, null is returned + */ +// Optional f$mb_ereg_replace_callback(const string &pattern, const CallableT &callback, const string &str, const Optional options); // callback + +/** + * Replace regular expression with multibyte support + * Scans string for matches to pattern, then replaces the matched text with replacement + * @param string pattern The regular expression pattern. 
Multibyte characters may be used in pattern + * @param string replacement The replacement text + * @param string str The string being checked + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return string|false|null The resultant string on success, or false on error. If string is not valid for the current encoding, null is returned + */ +Optional f$mb_ereg_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); + +/** + * Returns start point for next regular expression match + * @return int mb_ereg_search_getpos() returns the point to start regular expression match for mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). + * The position is represented by bytes from the head of string. + */ +int64_t f$mb_ereg_search_getpos(void); + +/** + * Retrieve the result from the last multibyte regular expression match + * @return array|false An array including the sub-string of matched part by last mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). + * If there are some matches, the first element will have the matched sub-string, the second element will have the first part grouped with brackets, + * the third element will have the second part grouped with brackets, and so on. It returns false on error. + */ +mixed f$mb_ereg_search_getregs(void); + +/** + * Setup string and regular expression for a multibyte regular expression match + * mb_ereg_search_init() sets string and pattern for a multibyte regular expression. + * These values are used for mb_ereg_search(), mb_ereg_search_pos(),and mb_ereg_search_regs(). + * @param string str The search string + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. 
See mb_regex_set_options() for explanation + * @return bool Returns true on success or false on failure + */ +bool f$mb_ereg_search_init(const string &str, const Optional &pattern, const Optional &options); + +/** + * Returns position and length of a matched part of the multibyte regular expression for a predefined multibyte string + * The string for match is specified by mb_ereg_search_init(). If it is not specified, the previous one will be used + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return array|false An array containing two elements. The first element is the offset, in bytes, where the match begins relative to the start of + * the search string, and the second element is the length in bytes of the match. If an error occurs, false is returned. + */ +mixed f$mb_ereg_search_pos(const Optional &pattern, const Optional &options); + +/** + * Returns the matched part of a multibyte regular expression + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return array|false mb_ereg_search_regs() executes the multibyte regular expression match, and if there are some matched part, + * it returns an array including substring of matched part as first element, the first grouped part with brackets as second element, + * the second grouped part as third element, and so on. It returns false on error. + */ +mixed f$mb_ereg_search_regs(const Optional &pattern, const Optional &options); + +/** + * Set start point of next regular expression match + * mb_ereg_search_setpos() sets the starting point of a match for mb_ereg_search(). + * @param int offset The position to set.
If it is negative, it counts from the end of the string + * @return bool Returns true on success or false on failure + */ +bool f$mb_ereg_search_setpos(const int64_t offset); + +/** + * Multibyte regular expression match for predefined multibyte string + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return bool mb_ereg_search() returns true if the multibyte string matches with the regular expression, or false otherwise. The string for matching + * is set by mb_ereg_search_init(). If pattern is not specified, the previous one is used. + */ +bool f$mb_ereg_search(const Optional &pattern, const Optional &options); + +/** + * Regular expression match with multibyte support + * @param string pattern The search pattern + * @param string str The search string + * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the + * third argument matches, the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. + * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, + * and so on. $matches[0] will contain a copy of the complete string matched. + * @return bool Returns whether pattern matches string + */ +bool f$mb_ereg(const string &pattern, const string &str, const array &matches); + +/** + * Replace regular expression with multibyte support ignoring case + * Scans string for matches to pattern, then replaces the matched text with replacement + * @param string pattern The regular expression pattern. Multibyte characters may be used. The case will be ignored + * @param string replacement The replacement text + * @param string str The searched string + * @param ?string options (default = null) The search option. 
See mb_regex_set_options() for explanation + * @return string|false|null The resultant string or false on error. If string is not valid for the current encoding, null is returned + */ +Optional f$mb_eregi_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); + +/** + * Regular expression match ignoring case with multibyte support + * @param string pattern The regular expression pattern + * @param string str The string being searched + * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the third argument matches, + * the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. + * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, + * and so on. $matches[0] will contain a copy of the complete string matched. + * @return bool Returns whether pattern matches string + */ +bool f$mb_eregi(const string &pattern, const string &str, const array &matches); + +/** + * Get internal settings of mbstring + * @param string type (default = "all") If type is not specified or is specified as "all", "internal_encoding", "http_input", "http_output", + * "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", "mail_body_encoding", "illegal_chars", "encoding_translation", "language", + * "detect_order", "substitute_character" and "strict_detection" will be returned. + * If type is specified as "internal_encoding", "http_input", "http_output", "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", + * "mail_body_encoding", "illegal_chars", "encoding_translation", "language", "detect_order", "substitute_character" or "strict_detection" + * the specified setting parameter will be returned. 
+ * @return array|string|int|false An array of type information if type is not specified, otherwise a specific type, or false on failure + */ +mixed f$mb_get_info(const string &type); + +/** + * Detect HTTP input character encoding + * @param ?string type (default = null) Input string specifies the input type. "G" for GET, "P" for POST, "C" for COOKIE, "S" for string, + * "L" for list, and "I" for the whole list (will return array). If type is omitted, it returns the last input type processed. + * @return array|string|false The character encoding name, as per the type, or an array of character encoding names, if type is "I". + * If mb_http_input() does not process specified HTTP input, it returns false. + */ +mixed f$mb_http_input(const Optional &type); + +/** + * Set/Get the HTTP output character encoding. Output after this function is called will be converted from the set internal encoding to encoding + * @param ?string encoding (default = null) If encoding is set, mb_http_output() sets the HTTP output character encoding to encoding. + * If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. + * @return string|bool If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. Otherwise, + * Returns true on success or false on failure. + */ +mixed f$mb_http_output(const Optional &encoding); + +/** + * Set/Get internal character encoding + * @param ?string encoding (default = null) encoding is the character encoding name used for the HTTP input character encoding conversion, + * HTTP output character encoding conversion, and the default character encoding for string functions defined by the mbstring module. + * You should notice that the internal encoding is totally different from the one for multibyte regex. + * @return string|bool If encoding is set, then Returns true on success or false on failure. + * In this case, the character encoding for multibyte regex is NOT changed. 
+ * If encoding is omitted, then the current character encoding name is returned. + */ +mixed f$mb_internal_encoding(const Optional &encoding); + +/** + * Set/Get the current language + * @param ?string language (default = null) Used for encoding e-mail messages. The valid languages are listed in the following table. + * mb_send_mail() uses this setting to encode e-mail. + * +---------------------------+-------------+------------------+-----------+ + * | Language | Charset | Encoding | Alias | + * +---------------------------+-------------+------------------+-----------+ + * | German/de | ISO-8859-15 | Quoted-Printable | Deutsch | + * | English/en | ISO-8859-1 | Quoted-Printable | | + * | Armenian/hy | ArmSCII-8 | Quoted-Printable | | + * | Japanese/ja | ISO-2022-JP | BASE64 | | + * | Korean/ko | ISO-2022-KR | BASE64 | | + * | neutral | UTF-8 | BASE64 | | + * | Russian/ru | KOI8-R | Quoted-Printable | | + * | Turkish/tr | ISO-8859-9 | Quoted-Printable | | + * | Ukrainian/ua | KOI8-U | Quoted-Printable | | + * | uni | UTF-8 | BASE64 | universal | + * | Simplified Chinese/zh-cn | HZ | BASE64 | | + * | Traditional Chinese/zh-tw | BIG-5 | BASE64 | | + * +---------------------------+-------------+------------------+-----------+ + * @return string|bool If language is set and language is valid, it returns true. Otherwise, it returns false. When language is omitted or null, + * it returns the language name as a string + */ +mixed f$mb_language(const Optional &language); + +/** + * Returns an array of all supported encodings + * @return array Returns a numerically indexed array + */ +array f$mb_list_encodings(void); + +/** + * Returns the Unicode code point value of the given character. This function complements mb_chr(). + * @param string str A string + * @param string? encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. 
+ * @return int|false The Unicode code point for the first character of string or false on failure. + */ +Optional f$mb_ord(const string &str, const Optional &encoding); + +/** + * mb_output_handler() is ob_start() callback function. mb_output_handler() converts characters in the output buffer from internal + * character encoding to HTTP output character encoding. + * @param string str The contents of the output buffer + * @param int status The status of the output buffer + * @return string The converted string + */ +string f$mb_output_handler(const string &str, const int64_t status); + +/** + * Parses GET/POST/COOKIE data and sets global variables. Since PHP does not provide raw POST/COOKIE data, it can only be used for GET data for now. + * It parses URL encoded data, detects encoding, converts coding to internal encoding and set values to the result array or global variables. + * @param string str The URL encoded data + * @param array result An array containing decoded and character encoded converted values + * @return bool Returns true on success or false on failure + */ +bool f$mb_parse_str(const string &str, const array &result); // result = map + +/** + * Get a MIME charset string for a specific encoding. + * @param string encoding The encoding being checked + * @return string|false The MIME charset string for character encoding encoding, or false if no charset is preferred for the given encoding + */ +Optional f$mb_preferred_mime_name(const string &encoding); + +/** + * Set/Get character encoding for a multibyte regex + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return string|bool If encoding is set, then Returns true on success or false on failure. In this case, the internal character encoding is NOT changed. 
+ * If encoding is omitted, then the current character encoding name for a multibyte regex is returned + */ +mixed f$mb_regex_encoding(const Optional &encoding); + +/** + * Sets the default options described by options for multibyte regex functions + * @param ?string options (default = null) The options to set. This is a string where each character is an option. + * To set a mode, the mode character must be the last one set, however there can only be set one mode but multiple options + * + * Regex options: + * +--------+----------------------------------+ + * | Option | Meaning | + * +--------+----------------------------------+ + * | i | Ambiguity match on | + * | x | Enables extended pattern form | + * | m | '.' matches with newlines | + * | s | '^' -> '\A', '$' -> '\Z' | + * | p | Same as both the m and s options | + * | l | Finds longest matches | + * | n | Ignores empty matches | + * | e | eval() resulting code | + * +--------+----------------------------------+ + * + * Regex syntax modes: + * +------+----------------------------+ + * | Mode | Meaning | + * +------+----------------------------+ + * | j | Java (Sun java.util.regex) | + * | u | GNU regex | + * | g | grep | + * | c | Emacs | + * | r | Ruby | + * | z | Perl | + * | b | POSIX Basic regex | + * | d | POSIX Extended regex | + * +------+----------------------------+ + * + * @return string The previous options. If options is omitted or null, it returns the string that describes the current options + */ +string f$mb_regex_set_options(const Optional &options); + +/** + * This function is currently not documented; only its argument list is available. + * @param string str + * @param ?string encoding (default = null) + * @return string + */ +string f$mb_scrub(const string &str, const Optional &encoding); + +/** + * Sends email. Headers and messages are converted and encoded according to the mb_language() setting. 
+ * It's a wrapper function for mail(), so see also mail() for detail + * @param string to The mail addresses being sent to. Multiple recipients may be specified by putting a comma between each address in to. + * This parameter is not automatically encoded + * @param string subject The subject of the mail + * @param string message The message of the mail + * @param array|string additional_headers (default = []) String or array to be inserted at the end of the email header. + * This is typically used to add extra headers (From, Cc, and Bcc). Multiple extra headers should be separated with a CRLF (\r\n). + * Validate parameter not to be injected unwanted headers by attackers. If an array is passed, its keys are the header names and its + * values are the respective header values + * Note: + * If messages are not received, try using a LF (\n) only. Some Unix mail transfer agents (most notably » qmail) replace LF by CRLF automatically + * (which leads to doubling CR if CRLF is used). This should be a last resort, as it does not comply with » RFC 2822. + * @param ?string additional_params (default = null) additional_params is a MTA command line parameter. It is useful when setting the correct Return-Path header + * when using sendmail. This parameter is escaped by escapeshellcmd() internally to prevent command execution. escapeshellcmd() prevents command execution, + * but allows to add additional parameters. For security reason, this parameter should be validated. Since escapeshellcmd() is applied automatically, + * some characters that are allowed as email addresses by internet RFCs cannot be used. Programs that are required to use these characters mail() cannot be used. + * The user that the webserver runs as should be added as a trusted user to the sendmail configuration to prevent a 'X-Warning' header from being added to + * the message when the envelope sender (-f) is set using this method. 
For sendmail users, this file is /etc/mail/trusted-users + * @return bool Returns true on success or false on failure + */ +bool f$mb_send_mail(const string &to, const string &subject, const string &message, const mixed &additional_headers, const Optional &additional_params); + +/** + * Split a multibyte string using regular expression pattern and returns the result as an array + * @param string pattern The regular expression pattern + * @param string str The string being split + * @param int limit (default = -1) If optional parameter limit is specified, it will be split in limit elements as maximum + * @return array|false The result as an array, or false on failure + */ +mixed f$mb_split(const string &pattern, const string &str, const int64_t limit = -1); + +/** + * This function will return an array of strings, it is a version of str_split() with support for encodings of variable character size as well + * as fixed-size encodings of 1,2 or 4 byte characters. If the length parameter is specified, the string is broken down into chunks of the specified + * length in characters (not bytes). The encoding parameter can be optionally specified and it is good practice to do so + * @param string str The string to split into characters or chunks + * @param int length (default = 1) If specified, each element of the returned array will be composed of multiple characters instead of a single character + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value + * will be used. A string specifying one of the supported encodings + * @return array mb_str_split() returns an array of strings + */ +array f$mb_str_split(const string &str, const int64_t length, const Optional &encoding); + +/** + * mb_strcut() extracts a substring from a string similarly to mb_substr(), but operates on bytes instead of characters. 
+ * If the cut position happens to be between two bytes of a multi-byte character, the cut is performed starting from the first byte of that character. + * This is also the difference to the substr() function, which would simply cut the string between the bytes and thus result in a malformed byte sequence + * @param string str The string being cut + * @param int start If start is non-negative, the returned string will start at the start'th byte position in string, counting from zero. + * For instance, in the string 'abcdef', the byte at position 0 is 'a', the byte at position 2 is 'c', and so forth. + * If start is negative, the returned string will start at the start'th byte counting back from the end of string. + * However, if the magnitude of a negative start is greater than the length of the string, the returned portion will start from the beginning of string + * @param ?int length (default = null) Length in bytes. If omitted or NULL is passed, extract all bytes to the end of the string. + * If length is negative, the returned string will end at the length'th byte counting back from the end of string. + * However, if the magnitude of a negative length is greater than the number of characters after the start position, an empty string will be returned + * @param ?string encoding The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value will be used + * @return string mb_strcut() returns the portion of string specified by the start and length parameters + */ +string f$mb_strcut(const string &str, const int64_t start, const Optional &length, const Optional &encoding); + +/** + * Truncates string string to specified width, where halfwidth characters count as 1, and fullwidth characters count as 2. + * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths + * @param string str The string being decoded + * @param int start The start position offset. 
Number of characters from the beginning of string (first character is 0), + * or if start is negative, number of characters from the end of the string + * @param int width The width of the desired trim. Negative widths count from the end of the string + * @param string trim_marker (default = "") A string that is added to the end of string when string is truncated + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return string The truncated string. If trim_marker is set, trim_marker replaces the last chars to match the width + */ +string f$mb_strimwidth(const string &str, const int64_t start, const int64_t width, const string &trim_marker, const Optional &encoding); + +/** + * mb_stripos() returns the numeric position of the first occurrence of needle in the haystack string. Unlike mb_strpos(), + * mb_stripos() is case-insensitive. If needle is not found, it returns false + * @param string haystack The string from which to get the position of the first occurrence of needle + * @param string needle The string to find in haystack + * @param int offset (default = 0) The position in haystack to start searching. A negative offset counts from the end of the string + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return int|false Return the numeric position of the first occurrence of needle in the haystack string, or false if needle is not found + */ +Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_stristr() finds the first occurrence of needle in haystack and returns the portion of haystack. + * Unlike mb_strstr(), mb_stristr() is case-insensitive.
If needle is not found, it returns false + * @param string haystack The string from which to get the first occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle (default = false) Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). + * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * Gets the length of a string + * @param string str The string being checked for length + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int Returns the number of characters in string string having character encoding encoding. A multi-byte character is counted as 1 + */ +int64_t f$mb_strlen(const string &str, const Optional &encoding); + +/** + * Finds position of the first occurrence of a string in a string. Performs a multi-byte safe strpos() operation based on number of characters. + * The first character's position is 0, the second character position is 1, and so on + * @param string haystack The string being checked + * @param string needle The string to find in haystack. In contrast with strpos(), numeric values are not applied as the ordinal value of a character + * @param int offset (default = 0) The search offset. If it is not specified, 0 is used. 
A negative offset counts from the end of the string + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return int|false Returns the numeric position of the first occurrence of needle in the haystack string. If needle is not found, it returns false + */ +Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_strrchr() finds the last occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false + * @param string haystack The string from which to get the last occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the last occurrence of needle. + * If set to false, it returns all of haystack from the last occurrence of needle to the end + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * mb_strrichr() finds the last occurrence of needle in haystack and returns the portion of haystack. Unlike mb_strrchr(), mb_strrichr() is case-insensitive. + * If needle is not found, it returns false + * @param string haystack The string from which to get the last occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the last occurrence of needle.
+ * If set to false, it returns all of haystack from the last occurrence of needle to the end + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * mb_strripos() performs multi-byte safe strripos() operation based on number of characters. needle position is counted from the beginning of haystack. + * First character's position is 0. Second character position is 1. Unlike mb_strrpos(), mb_strripos() is case-insensitive + * @param string haystack The string from which to get the position of the last occurrence of needle + * @param string needle The string to find in haystack + * @param int offset The position in haystack to start searching + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return int|false Return the numeric position of the last occurrence of needle in the haystack string, or false if needle is not found + */ +Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * Performs a multibyte safe strrpos() operation based on the number of characters. needle position is counted from the beginning of haystack. + * First character's position is 0. Second character position is 1 + * @param string haystack The string being checked, for the last occurrence of needle + * @param string needle The string to find in haystack + * @param int offset (default = 0) May be specified to begin searching an arbitrary number of characters into the string. Negative values will stop searching at an arbitrary point prior to the end of the string + * @param ?string encoding The encoding parameter is the character encoding.
If it is omitted or null, the internal character encoding value will be used + * @return int|false Returns the numeric position of the last occurrence of needle in the haystack string. If needle is not found, it returns false + */ +Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_strstr() finds the first occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false + * @param string haystack The string from which to get the first occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). + * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * Returns string with all alphabetic characters converted to lowercase + * @param string str The string being lowercased + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return string string with all alphabetic characters converted to lowercase + */ +string f$mb_strtolower(const string &str, const Optional &encoding); + +/** + * Returns string with all alphabetic characters converted to uppercase. + * @param string str The string being uppercased + * @param ?string encoding (default = null) The encoding parameter is the character encoding. 
+ * If it is omitted or null, the internal character encoding value will be used + * @return string string with all alphabetic characters converted to uppercase + */ +string f$mb_strtoupper(const string &str, const Optional &encoding); + +/** + * Returns the width of string string, where halfwidth characters count as 1, and fullwidth characters count as 2. + * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths. The fullwidth characters are: + * U+1100-U+115F, U+11A3-U+11A7, U+11FA-U+11FF, U+2329-U+232A, U+2E80-U+2E99, U+2E9B-U+2EF3, U+2F00-U+2FD5, U+2FF0-U+2FFB, U+3000-U+303E, U+3041-U+3096, + * U+3099-U+30FF, U+3105-U+312D, U+3131-U+318E, U+3190-U+31BA, U+31C0-U+31E3, U+31F0-U+321E, U+3220-U+3247, U+3250-U+32FE, U+3300-U+4DBF, U+4E00-U+A48C, + * U+A490-U+A4C6, U+A960-U+A97C, U+AC00-U+D7A3, U+D7B0-U+D7C6, U+D7CB-U+D7FB, U+F900-U+FAFF, U+FE10-U+FE19, U+FE30-U+FE52, U+FE54-U+FE66, U+FE68-U+FE6B, + * U+FF01-U+FF60, U+FFE0-U+FFE6, U+1B000-U+1B001, U+1F200-U+1F202, U+1F210-U+1F23A, U+1F240-U+1F248, U+1F250-U+1F251, U+20000-U+2FFFD, U+30000-U+3FFFD. + * All other characters are halfwidth characters + * @param string str The string being decoded + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int The width of string string + */ +int64_t f$mb_strwidth(const string &str, const Optional &encoding); + +/** + * Specifies a substitution character when input character encoding is invalid or character code does not exist in output character encoding. + * Invalid characters may be substituted "none" (no output), string or int value (Unicode character code value). 
+ * This setting affects mb_convert_encoding(), mb_convert_variables(), mb_output_handler(), and mb_send_mail() + * @param string|int|null substitute_character (default = null) Specify the Unicode value as an int, or as one of the following strings: + * "none": no output + * "long": Output character code value (Example: U+3000, JIS+7E7E) + * "entity": Output character entity (Example: Ȁ) + * @return string|int|bool If substitute_character is set, it returns true for success, otherwise returns false. + * If substitute_character is not set, it returns the current setting + */ +mixed f$mb_substitute_character(const mixed &substitute_character); + +/** + * Counts the number of times the needle substring occurs in the haystack string + * @param string haystack The string being checked + * @param string needle The string being found + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int The number of times the needle substring occurs in the haystack string + */ +int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding); + +/** + * Performs a multi-byte safe substr() operation based on number of characters. Position is counted from the beginning of string. + * First character's position is 0. Second character position is 1, and so on + * @param string str The string to extract the substring from + * @param int start If start is non-negative, the returned string will start at the start'th position in string, counting from zero. + * For instance, in the string 'abcdef', the character at position 0 is 'a', the character at position 2 is 'c', and so forth. + * If start is negative, the returned string will start at the start'th character from the end of string + * @param ?int length (default = null) Maximum number of characters to use from string. 
+ * If omitted or NULL is passed, extract all characters to the end of the string + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return string mb_substr() returns the portion of string specified by the start and length parameters + */ +string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding); + +#else + +#include + +#include "runtime/kphp_core.h" +#include "runtime/string_functions.h" + +bool f$mb_check_encoding(const string &str, const string &encoding = CP1251); + +int64_t f$mb_strlen(const string &str, const string &encoding = CP1251); + +string f$mb_strtolower(const string &str, const string &encoding = CP1251); + +string f$mb_strtoupper(const string &str, const string &encoding = CP1251); + +Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; + +Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; + +string f$mb_substr(const string &str, int64_t start, const mixed &length = std::numeric_limits::max(), const string &encoding = CP1251); + +void f$set_detect_incorrect_encoding_names_warning(bool show); + +void free_detect_incorrect_encoding_names(); + +#endif \ No newline at end of file diff --git a/runtime/regexp.h b/runtime/regexp.h index 8c20fe98ad..5eb579b447 100644 --- a/runtime/regexp.h +++ b/runtime/regexp.h @@ -9,7 +9,7 @@ #include "common/mixin/not_copyable.h" #include "runtime/kphp_core.h" -#include "runtime/mbstring.h" +#include "runtime/mbstring/mbstring.h" namespace re2 { class RE2; diff --git a/runtime/runtime.cmake b/runtime/runtime.cmake index 544e979941..3df11e3680 100644 --- a/runtime/runtime.cmake +++ b/runtime/runtime.cmake @@ -49,7 +49,11 @@ prepend(KPHP_RUNTIME_PDO_PGSQL_SOURCES pdo/pgsql/ 
pgsql_pdo_emulated_statement.cpp) endif() +prepend(KPHP_RUNTIME_MBSTRING_SOURCES mbstring/ + mbstring.cpp) + prepend(KPHP_RUNTIME_SOURCES ${BASE_DIR}/runtime/ + ${KPHP_RUNTIME_MBSTRING_SOURCES} ${KPHP_RUNTIME_DATETIME_SOURCES} ${KPHP_RUNTIME_MEMORY_RESOURCE_SOURCES} ${KPHP_RUNTIME_MSGPACK_SOURCES} @@ -82,7 +86,6 @@ prepend(KPHP_RUNTIME_SOURCES ${BASE_DIR}/runtime/ kphp-backtrace.cpp mail.cpp math_functions.cpp - mbstring.cpp memcache.cpp memory_usage.cpp migration_php8.cpp @@ -140,6 +143,10 @@ vk_add_library(kphp_runtime OBJECT ${KPHP_RUNTIME_ALL_SOURCES}) target_include_directories(kphp_runtime PUBLIC ${BASE_DIR} /opt/curl7600/include) add_dependencies(kphp_runtime kphp-timelib) +if (MBFL) + add_dependencies(kphp_runtime libmbfl) +endif() + prepare_cross_platform_libs(RUNTIME_LIBS yaml-cpp re2 zstd h3) # todo: linking between static libs is no-op, is this redundant? do we need to add mysqlclient here? set(RUNTIME_LIBS vk::kphp_runtime vk::kphp_server vk::popular_common vk::unicode vk::common_src vk::binlog_src vk::net_src ${RUNTIME_LIBS} OpenSSL::Crypto m z pthread) @@ -158,6 +165,10 @@ if (PDO_DRIVER_PGSQL) list(APPEND RUNTIME_LINK_TEST_LIBS PostgreSQL::PostgreSQL) endif() +if (MBFL) + list(APPEND RUNTIME_LINK_TEST_LIBS libmbfl) +endif() + file(GLOB_RECURSE KPHP_RUNTIME_ALL_HEADERS RELATIVE ${BASE_DIR} CONFIGURE_DEPENDS diff --git a/server/server-stats.cpp b/server/server-stats.cpp index 6406e09ed7..a45fcf1f4f 100644 --- a/server/server-stats.cpp +++ b/server/server-stats.cpp @@ -143,8 +143,6 @@ struct EnumTable : std::array(E::Key::types_count)> { template struct Percentiles { T p50{}; - T p75{}; - T p90{}; T p95{}; T p99{}; T max{}; @@ -154,8 +152,6 @@ struct Percentiles { void update_percentiles(I first, I last, const Mapper &mapper = {}) noexcept { const auto size = last - first; set_percentile<50>(p50, first, size, mapper); - set_percentile<75>(p75, first, size, mapper); - set_percentile<90>(p90, first, size, mapper); set_percentile<95>(p95, first, size, 
mapper); set_percentile<99>(p99, first, size, mapper); set_percentile<100>(max, first, size, mapper); @@ -702,8 +698,6 @@ template void write_to(stats_t *stats, const char *prefix, const char *suffix, const AggregatedSamples &samples, const Mapper &mapper = {}) { if (stats->need_aggregated_stats()) { stats->add_gauge_stat(mapper(samples.percentiles.p50), prefix, suffix, ".p50"); - stats->add_gauge_stat(mapper(samples.percentiles.p75), prefix, suffix, ".p75"); - stats->add_gauge_stat(mapper(samples.percentiles.p90), prefix, suffix, ".p90"); stats->add_gauge_stat(mapper(samples.percentiles.p95), prefix, suffix, ".p95"); stats->add_gauge_stat(mapper(samples.percentiles.p99), prefix, suffix, ".p99"); stats->add_gauge_stat(mapper(samples.percentiles.max), prefix, suffix, ".max"); @@ -714,8 +708,6 @@ template void write_to(stats_t *stats, const char *prefix, const char *suffix, const WorkerSamples &samples, const Mapper &mapper = {}) { if (stats->need_aggregated_stats()) { stats->add_gauge_stat(mapper(samples.percentiles.p50), prefix, suffix, ".p50"); - stats->add_gauge_stat(mapper(samples.percentiles.p75), prefix, suffix, ".p75"); - stats->add_gauge_stat(mapper(samples.percentiles.p90), prefix, suffix, ".p90"); stats->add_gauge_stat(mapper(samples.percentiles.p95), prefix, suffix, ".p95"); stats->add_gauge_stat(mapper(samples.percentiles.p99), prefix, suffix, ".p99"); stats->add_gauge_stat(mapper(samples.percentiles.max), prefix, suffix, ".max"); diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp new file mode 100644 index 0000000000..2a0a484302 --- /dev/null +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -0,0 +1,21 @@ +#include +#include "runtime/mbstring/mbstring.h" + +#ifdef MBFL +/* TODO: make fun strings for tests */ + +TEST(mbstring_test, test_mb_check_encoding) { + ASSERT_TRUE(f$mb_check_encoding(string("sdf"), string("Windows-1251"))); + ASSERT_TRUE(f$mb_check_encoding(string("ыва"), string("Windows-1251"))); + 
ASSERT_TRUE(f$mb_check_encoding(string("İnanç Esasları"), string("UTF-8"))); + ASSERT_TRUE(f$mb_check_encoding(string("İnanç Esasları"), string("Windows-1251"))); + ASSERT_FALSE(f$mb_check_encoding(string("İnanç Esasları"), string("ASCII"))); +} + +TEST(mbstring_test, test_mb_convert_encoding) { + ASSERT_STREQ(f$mb_convert_encoding(string("Hello"), string("UTF-8"), string("EUC-KR")).to_string().c_str(), "Hello"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыавыа"), string("UTF-8"), string("Windows-1251")).to_string().c_str(), "ыавыа"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыва"), string("UTF-8"), string("ASCII")).to_string().c_str(), "??????"); +} + +#endif \ No newline at end of file diff --git a/tests/cpp/runtime/runtime-tests.cmake b/tests/cpp/runtime/runtime-tests.cmake index 88d4255228..aea2f10c3d 100644 --- a/tests/cpp/runtime/runtime-tests.cmake +++ b/tests/cpp/runtime/runtime-tests.cmake @@ -1,26 +1,27 @@ prepend(RUNTIME_TESTS_SOURCES ${BASE_DIR}/tests/cpp/runtime/ - _runtime-tests-env.cpp - allocator-malloc-replacement-test.cpp - array-test.cpp - common-php-functions-test.cpp - confdata-functions-test.cpp - confdata-key-maker-test.cpp - confdata-predefined-wildcards-test.cpp - flex-test.cpp - inter-process-mutex-test.cpp - inter-process-resource-test.cpp - json-writer-test.cpp - number-string-comparison.cpp - kphp-type-traits-test.cpp - msgpack-test.cpp - memory_resource/details/memory_chunk_list-test.cpp - memory_resource/details/memory_chunk_tree-test.cpp - memory_resource/details/memory_ordered_chunk_list-test.cpp - memory_resource/extra-memory-pool-test.cpp - memory_resource/unsynchronized_pool_resource-test.cpp - string-list-test.cpp - string-test.cpp - zstd-test.cpp) + _runtime-tests-env.cpp + allocator-malloc-replacement-test.cpp + array-test.cpp + common-php-functions-test.cpp + confdata-functions-test.cpp + confdata-key-maker-test.cpp + confdata-predefined-wildcards-test.cpp + flex-test.cpp + inter-process-mutex-test.cpp + 
inter-process-resource-test.cpp + json-writer-test.cpp + number-string-comparison.cpp + kphp-type-traits-test.cpp + msgpack-test.cpp + memory_resource/details/memory_chunk_list-test.cpp + memory_resource/details/memory_chunk_tree-test.cpp + memory_resource/details/memory_ordered_chunk_list-test.cpp + memory_resource/extra-memory-pool-test.cpp + memory_resource/unsynchronized_pool_resource-test.cpp + string-list-test.cpp + string-test.cpp + zstd-test.cpp + mbstring-test.cpp) allow_deprecated_declarations_for_apple(${BASE_DIR}/tests/cpp/runtime/inter-process-mutex-test.cpp) vk_add_unittest(runtime "${RUNTIME_LIBS};${RUNTIME_LINK_TEST_LIBS}" ${RUNTIME_TESTS_SOURCES})