用于EagleEye3.0 规则集漏报和误报测试的示例项目,项目收集于github和gitee
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2702 lines
109 KiB

/* Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
/*
Bug#16403708 SUBOPTIMAL CODE IN MY_STRNXFRM_SIMPLE()
Bug#68476 Suboptimal code in my_strnxfrm_simple()
Below we test some alternative implementations for my_strnxfrm_simple.
In order to do benchmarking, configure in optimized mode, and
generate a separate executable for this file:
cmake -DMERGE_UNITTESTS=0
You may want to tweak some constants below:
- experiment with num_iterations
run './strings_strnxfrm-t --disable-tap-output'
to see timing reports for your platform.
Benchmarking with gcc and clang indicates that:
There is insignificant difference between my_strnxfrm_simple and strnxfrm_new
when src != dst
my_strnxfrm_simple() is significantly faster than strnxfrm_new
when src == dst, especially for long strings.
Loop unrolling gives significant speedup for large strings.
*/
#include <gtest/gtest.h>
#include <inttypes.h>
#include <sys/types.h>
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "my_inttypes.h"
#include "my_sys.h"
#include "template_utils.h"
#include "unittest/gunit/benchmark.h"
#include "unittest/gunit/strnxfrm.h"
using std::make_pair;
using std::max;
using std::pair;
using std::string;
using std::to_string;
using std::unordered_map;
namespace strnxfrm_unittest {
namespace {
/**
  Dump a byte array to stderr as two-digit hex values, eight per row.

  @param arr  bytes to print
  @param len  number of bytes in arr
*/
void print_array(const uchar *arr, size_t len) {
  size_t pos = 0;
  while (pos < len) {
    fprintf(stderr, " %02x", arr[pos]);
    ++pos;
    // End the row after every eighth byte and after the final byte.
    const bool row_full = (pos % 8) == 0;
    const bool last_byte = (pos == len);
    if (row_full || last_byte) fprintf(stderr, "\n");
  }
  // A trailing blank line separates consecutive dumps.
  fprintf(stderr, "\n");
}
// A function to compare two arrays and print them out in its entirety
// (for easier context) if they are not equal.
void expect_arrays_equal(const uchar *expected, const uchar *got, size_t len) {
int num_err = 0;
for (size_t i = 0; i < len && num_err < 5; ++i) {
EXPECT_EQ(expected[i], got[i]);
if (expected[i] != got[i]) ++num_err;
}
if (num_err) {
fprintf(stderr, "Expected:\n");
for (size_t i = 0; i < len; ++i) {
fprintf(stderr, " %c%02x", expected[i] != got[i] ? '*' : ' ',
expected[i]);
if ((i % 8) == 7 || i == len - 1) fprintf(stderr, "\n");
}
fprintf(stderr, "\nGot:\n");
for (size_t i = 0; i < len; ++i) {
fprintf(stderr, " %c%02x", expected[i] != got[i] ? '*' : ' ', got[i]);
if ((i % 8) == 7 || i == len - 1) fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
}
}
/**
  Look up a collation by name, using a charset loader initialized through
  my_charset_loader_init_mysys().

  @param name  collation name, e.g. "utf8mb4_0900_ai_ci"
  @return the result of my_collation_get_by_name() for that name
*/
CHARSET_INFO *init_collation(const char *name) {
  MY_CHARSET_LOADER mysys_loader;
  my_charset_loader_init_mysys(&mysys_loader);
  CHARSET_INFO *const collation =
      my_collation_get_by_name(&mysys_loader, name, MYF(0));
  return collation;
}
/**
  Compare two NUL-terminated strings by transforming each with my_strnxfrm()
  and comparing the resulting byte sequences.

  @param cs  collation used for the transformation
  @param a   first string
  @param b   second string
  @return the memcmp() result over the common prefix of the two transformed
          strings if that is nonzero; otherwise 0 when both have the same
          length, -1 when a's is shorter and 1 when it is longer
*/
int compare_through_strxfrm(CHARSET_INFO *cs, const char *a, const char *b) {
  uchar abuf[256], bbuf[256];
  // Keep the lengths as size_t: they are byte counts that feed memcmp() and
  // print_array(), so narrowing them to int would only add conversions.
  const size_t alen = my_strnxfrm(cs, abuf, sizeof(abuf),
                                  pointer_cast<const uchar *>(a), strlen(a));
  const size_t blen = my_strnxfrm(cs, bbuf, sizeof(bbuf),
                                  pointer_cast<const uchar *>(b), strlen(b));
  if (false)  // Enable this for debugging.
  {
    fprintf(stderr, "\n\nstrxfrm for '%s':\n", a);
    print_array(abuf, alen);
    fprintf(stderr, "strxfrm for '%s':\n", b);
    print_array(bbuf, blen);
  }
  const int cmp = memcmp(abuf, bbuf, std::min(alen, blen));
  if (cmp != 0) return cmp;
  // Equal on the common prefix: order by length.
  if (alen == blen) return 0;
  return (alen < blen) ? -1 : 1;
}
} // namespace
// Number of times each StrnxfrmTest case repeats its call.
#if defined(DBUG_OFF)
// Set this so that each test case takes a few seconds.
// And set it back to a small value before pushing!!
// const size_t num_iterations= 20000000ULL;
const size_t num_iterations = 2ULL;
#else
// There is no point in benchmarking anything in debug mode.
const size_t num_iterations = 1ULL;
#endif
/**
  Fixture for the strnxfrm timing tests, parameterized on the buffer length.
  Before each test it provides a source and a destination buffer of that
  length, both filled with 0x20 bytes.
*/
class StrnxfrmTest : public ::testing::TestWithParam<size_t> {
 protected:
  // Size both buffers from the test parameter and fill them with 0x20.
  void SetUp() override {
    m_length = GetParam();
    m_src.assign(m_length, 0x20);
    m_dst.assign(m_length, 0x20);
  }
  std::vector<uchar> m_src;  // Source buffer.
  std::vector<uchar> m_dst;  // Destination buffer.
  size_t m_length = 0;       // Length of both buffers; set in SetUp().
};
// Buffer lengths that the StrnxfrmTest fixture is instantiated with; each
// value becomes m_length for one run of every TEST_P below.
size_t test_values[] = {1, 10, 100, 1000};
// Instantiate all StrnxfrmTest cases once per entry in test_values.
INSTANTIATE_TEST_CASE_P(Strnxfrm, StrnxfrmTest,
::testing::ValuesIn(test_values));
// Calls strnxfrm_orig() num_iterations times with distinct source (m_src)
// and destination (m_dst) buffers.
TEST_P(StrnxfrmTest, OriginalSrcDst) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const dst = m_dst.data();
  uchar *const src = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_orig(collation, dst, m_length, m_length, src, m_length, 192);
  }
}
// Calls strnxfrm_orig_unrolled() num_iterations times with distinct source
// (m_src) and destination (m_dst) buffers.
TEST_P(StrnxfrmTest, OriginalUnrolledSrcDst) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const dst = m_dst.data();
  uchar *const src = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_orig_unrolled(collation, dst, m_length, m_length, src, m_length,
                           192);
  }
}
// Calls strnxfrm_new() num_iterations times with distinct source (m_src)
// and destination (m_dst) buffers.
TEST_P(StrnxfrmTest, ModifiedSrcDst) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const dst = m_dst.data();
  uchar *const src = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_new(collation, dst, m_length, m_length, src, m_length, 192);
  }
}
// Calls strnxfrm_new_unrolled() num_iterations times with distinct source
// (m_src) and destination (m_dst) buffers.
TEST_P(StrnxfrmTest, ModifiedUnrolledSrcDst) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const dst = m_dst.data();
  uchar *const src = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_new_unrolled(collation, dst, m_length, m_length, src, m_length,
                          192);
  }
}
// Calls strnxfrm_orig() num_iterations times in place: m_src is passed as
// both the destination and the source buffer.
TEST_P(StrnxfrmTest, OriginalSrcSrc) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const buffer = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_orig(collation, buffer, m_length, m_length, buffer, m_length,
                  192);
  }
}
// Calls strnxfrm_orig_unrolled() num_iterations times in place: m_src is
// passed as both the destination and the source buffer.
TEST_P(StrnxfrmTest, OriginalUnrolledSrcSrc) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const buffer = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_orig_unrolled(collation, buffer, m_length, m_length, buffer,
                           m_length, 192);
  }
}
// Calls strnxfrm_new() num_iterations times in place: m_src is passed as
// both the destination and the source buffer.
TEST_P(StrnxfrmTest, ModifiedSrcSrc) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const buffer = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_new(collation, buffer, m_length, m_length, buffer, m_length, 192);
  }
}
// Calls strnxfrm_new_unrolled() num_iterations times in place: m_src is
// passed as both the destination and the source buffer.
TEST_P(StrnxfrmTest, ModifiedUnrolledSrcSrc) {
  CHARSET_INFO *const collation = init_collation("latin1_swedish_ci");
  uchar *const buffer = m_src.data();
  for (size_t pass = 0; pass != num_iterations; ++pass) {
    strnxfrm_new_unrolled(collation, buffer, m_length, m_length, buffer,
                          m_length, 192);
  }
}
// Transforms a short mixed-script string with utf8_bin into output buffers
// of every even length below sizeof(buf), and checks each result against
// the matching prefix of the full expected answer (which ends in padding).
TEST(StrXfrmTest, SimpleUTF8Correctness) {
  CHARSET_INFO *cs = init_collation("utf8_bin");
  const char *src = "abc æøå 日本語";
  const uchar *input = pointer_cast<const uchar *>(src);
  const size_t input_len = strlen(src);
  static const unsigned char full_answer_with_pad[32] = {
      0x00, 0x61, 0x00, 0x62, 0x00, 0x63,  // abc
      0x00, 0x20,                          // space
      0x00, 0xe6, 0x00, 0xf8, 0x00, 0xe5,  // æøå
      0x00, 0x20,                          // space
      0x65, 0xe5, 0x67, 0x2c, 0x8a, 0x9e,  // 日本語
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20  // space for padding
  };
  unsigned char buf[32];
  size_t maxlen = 0;
  while (maxlen < sizeof(buf)) {
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, input, input_len);
    expect_arrays_equal(full_answer_with_pad, buf, maxlen);
    maxlen += 2;
  }
}
// Transforms a short mixed-script string with utf8mb4_0900_ai_ci into output
// buffers of every even length below sizeof(buf), and checks each result
// against the matching prefix of the full expected answer.
TEST(StrXfrmTest, SimpleUTF8MB4Correctness) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
  const char *src = "abc æøå 日本語";
  const uchar *input = pointer_cast<const uchar *>(src);
  const size_t input_len = strlen(src);
  static const unsigned char full_answer_with_pad[30] = {
      0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a,              // abc
      0x02, 0x09,                                      // space
      0x1c, 0x47, 0x1c, 0xaa, 0x1d, 0xdd, 0x1c, 0x47,  // æøå
      0x02, 0x09,                                      // space
      0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c,
      0xfb, 0x41, 0x8a, 0x9e,  // 日本語
  };
  unsigned char buf[30];
  size_t maxlen = 0;
  while (maxlen < sizeof(buf)) {
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, input, input_len);
    expect_arrays_equal(full_answer_with_pad, buf, maxlen);
    maxlen += 2;
  }
}
// Transforms a short mixed-script string with utf8mb4_0900_as_ci into output
// buffers of every even length below sizeof(buf), and checks each result
// against the matching prefix of the full expected answer. The expected
// answer holds two weight levels separated by 0x00, 0x00.
TEST(StrXfrmTest, UTF8MB4Correctness_as_ci) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci");
  const char *src = "abc æøå 日本語";
  const uchar *input = pointer_cast<const uchar *>(src);
  const size_t input_len = strlen(src);
  static const unsigned char full_answer_with_pad[62] = {
      // First level.
      0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a,              // abc
      0x02, 0x09,                                      // space
      0x1c, 0x47, 0x1c, 0xaa, 0x1d, 0xdd, 0x1c, 0x47,  // æøå
      0x02, 0x09,                                      // space
      0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c,  // 日本語
      0xfb, 0x41, 0x8a, 0x9e,                          // 日本語, continued
      0x00, 0x00,                                      // level separator
      // Second level.
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20,  // abc
      0x00, 0x20,                          // space
      // Remaining second-level weights (æøå, space, 日本語).
      0x00, 0x20, 0x01, 0x10, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x2F, 0x00, 0x20, 0x00, 0x29, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20};
  unsigned char buf[62];
  size_t maxlen = 0;
  while (maxlen < sizeof(buf)) {
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, input, input_len);
    expect_arrays_equal(full_answer_with_pad, buf, maxlen);
    maxlen += 2;
  }
}
// Spot checks of utf8mb4_0900_as_ci ordering through
// compare_through_strxfrm(): case differences compare equal, accent
// differences do not, and a Hangul pair compares equal.
TEST(StrXfrmTest, UTF8MB4Correctness_as_ci_1) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci");

  // case insensitive
  const int case_only = compare_through_strxfrm(cs, "Abc", "aBC");
  EXPECT_EQ(case_only, 0);

  // accent sensitive
  const int accents = compare_through_strxfrm(cs, "ǍḄÇ", "ÁḆĈ");
  EXPECT_NE(accents, 0);
  const int letters = compare_through_strxfrm(cs, u8"\uA73A", u8"\uA738");
  EXPECT_NE(letters, 0);

  // Hangul decomposition
  const int hangul = compare_through_strxfrm(cs, u8"\uAC00", u8"\u326E");
  EXPECT_EQ(hangul, 0);
}
// Transforms a string mixing Latin, kana, Han, Greek, Coptic, Cyrillic and
// Hangul Jamo characters with utf8mb4_ja_0900_as_cs into output buffers of
// every even length below sizeof(buf), and checks each result against the
// matching prefix of the full expected answer (three weight levels).
TEST(StrXfrmTest, JapaneseUTF8MB4) {
  CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs");
  const char *src =
      "\x61\x41\xCA\xAC\xCA\xAD"  // latin 'aAʬʭ'
      // Hiragana and Katakana 'ぁンはばぱ'
      "\xE3\x81\x81\xE3\x83\xB3\xE3\x81\xAF\xE3\x81\xB0\xE3\x81\xB1"
      // Japanese Han '亜熙憐'
      "\xE4\xBA\x9C\xE7\x86\x99\xE6\x86\x90"
      // Other Han '﨎㐀'
      "\xEF\xA8\x8E\xE3\x90\x80"
      // Greek, Coptic etc. 'αⲁаⳤퟻ'
      "\xCE\xB1\xE2\xB2\x81\xD0\xB0\xE2\xB3\xA4\xED\x9F\xBB";
  const uchar *input = pointer_cast<const uchar *>(src);
  const size_t input_len = strlen(src);
  // Weights appear in source order within each level.
  static const unsigned char full_answer_with_pad[156] = {
      // Level 1
      0x1C, 0x47, 0x1C, 0x47, 0x1F, 0xB1, 0x1F, 0xB5,
      0x1F, 0xB6, 0x1F, 0xE7, 0x1F, 0xD0, 0x1F, 0xD0,
      0x1F, 0xD0, 0x54, 0xA4, 0x6D, 0x76, 0x60, 0x00,
      0xFB, 0x41, 0xFA, 0x0E, 0xFB, 0x80, 0xB4, 0x00,
      0xFB, 0x86, 0x1F, 0xB9, 0xFB, 0x86, 0x1F, 0xE6,
      0xFB, 0x86, 0x20, 0x22, 0xFB, 0x86, 0x1F, 0xF1, 0xFB, 0x86, 0x1F, 0xE6,
      0xFB, 0x86, 0x1F, 0xF0, 0xFB, 0x86, 0x3D, 0x59,
      0x00, 0x00,  // Level separator
      // Level 2
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x37, 0x00, 0x20, 0x00, 0x38, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x00,  // Level separator
      // Level 3
      0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
      0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04,
      0x00, 0x04, 0x00, 0x04, 0x00, 0x02};
  unsigned char buf[sizeof(full_answer_with_pad)];
  size_t maxlen = 0;
  while (maxlen < sizeof(buf)) {
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, input, input_len);
    expect_arrays_equal(full_answer_with_pad, buf, maxlen);
    maxlen += 2;
  }
}
// Checks utf8mb4_ja_0900_as_cs_ks in two ways: first the full weight string
// for a mixed-script input (at every even output length below sizeof(buf)),
// then a set of Hiragana/Katakana pairs that compare equal under
// utf8mb4_ja_0900_as_cs but must be ordered under the _ks collation.
TEST(StrXfrmTest, Japanese_ks_UTF8MB4) {
  CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs_ks");
  /*
    Weights of Japanese Han, Other Han, Greek, Coptic are unchanged compared
    to the result for collation utf8mb4_ja_0900_as_cs (in test
    JapaneseUTF8MB4). But an additional quaternary weight is added for
    Hiragana and Katakana characters.
  */
  const char *src =
      "\x61\x41\xCA\xAC\xCA\xAD"  // latin 'aAʬʭ'
      // Hiragana and Katakana 'ぁンはばぱ'
      "\xE3\x81\x81\xE3\x83\xB3\xE3\x81\xAF\xE3\x81\xB0\xE3\x81\xB1"
      // Japanese Han '亜熙憐'
      "\xE4\xBA\x9C\xE7\x86\x99\xE6\x86\x90"
      // Other Han '﨎㐀'
      "\xEF\xA8\x8E\xE3\x90\x80"
      // Greek, Coptic etc. 'αⲁаⳤퟻ'
      "\xCE\xB1\xE2\xB2\x81\xD0\xB0\xE2\xB3\xA4\xED\x9F\xBB"
      // Prefix context 'さー' and 'サー'
      "\xE3\x81\x95\xE3\x83\xBC\xE3\x82\xB5\xE3\x83\xBC";
  const uchar *input = pointer_cast<const uchar *>(src);
  const size_t input_len = strlen(src);
  // Weights appear in source order within each level.
  static const unsigned char full_answer_with_pad[] = {
      // Level 1
      0x1C, 0x47, 0x1C, 0x47, 0x1F, 0xB1, 0x1F, 0xB5,
      0x1F, 0xB6, 0x1F, 0xE7, 0x1F, 0xD0, 0x1F, 0xD0,
      0x1F, 0xD0, 0x54, 0xA4, 0x6D, 0x76, 0x60, 0x00,
      0xFB, 0x41, 0xFA, 0x0E, 0xFB, 0x80, 0xB4, 0x00,
      0xFB, 0x86, 0x1F, 0xB9, 0xFB, 0x86, 0x1F, 0xE6,
      0xFB, 0x86, 0x20, 0x22, 0xFB, 0x86, 0x1F, 0xF1, 0xFB, 0x86, 0x1F, 0xE6,
      0xFB, 0x86, 0x1F, 0xF0, 0xFB, 0x86, 0x3D, 0x59,
      0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xC1, 0x1F, 0xB6,  // Prefix context
      0x00, 0x00,                                      // Level separator
      // Level 2
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x37, 0x00, 0x20, 0x00, 0x38, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20,
      0x00, 0x00,  // Level separator
      // Level 3
      0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
      0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04,
      0x00, 0x04, 0x00, 0x04, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21,
      0x00, 0x0E,
      0x00, 0x0C, 0x00, 0x21,
      0x00, 0x00,  // Level separator
      // Level 4
      0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00,
      0x08};
  unsigned char buf[sizeof(full_answer_with_pad)];
  size_t maxlen = 0;
  while (maxlen < sizeof(buf)) {
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, input, input_len);
    expect_arrays_equal(full_answer_with_pad, buf, maxlen);
    maxlen += 2;
  }

  CHARSET_INFO *as_cs = init_collation("utf8mb4_ja_0900_as_cs");
  CHARSET_INFO *as_cs_ks = init_collation("utf8mb4_ja_0900_as_cs_ks");

  // The same word in Hiragana and in Katakana.
  const char *hiragana_word =
      "\xE3\x81\xAB\xE3\x81\xBB\xE3\x82\x93\xE3\x81\x94";  // utf8 "にほんご"
  const char *katakana_word =
      "\xE3\x83\x8B\xE3\x83\x9B\xE3\x83\xB3\xE3\x82\xB4";  // utf8 "ニホンゴ"
  EXPECT_EQ(compare_through_strxfrm(as_cs, hiragana_word, katakana_word), 0);
  EXPECT_LT(compare_through_strxfrm(as_cs_ks, hiragana_word, katakana_word),
            0);

  // All four Hiragana/Katakana combinations of a two-character string.
  const char *hira_hira = "\xE3\x81\xAF\xE3\x81\xAF";  // utf8 "はは"
  const char *hira_kata = "\xE3\x81\xAF\xE3\x83\x8F";  // utf8 "はハ"
  const char *kata_hira = "\xE3\x83\x8F\xE3\x81\xAF";  // utf8 "ハは"
  const char *kata_kata = "\xE3\x83\x8F\xE3\x83\x8F";  // utf8 "ハハ"
  EXPECT_EQ(compare_through_strxfrm(as_cs, hira_hira, hira_kata), 0);
  EXPECT_EQ(compare_through_strxfrm(as_cs, hira_kata, kata_hira), 0);
  EXPECT_EQ(compare_through_strxfrm(as_cs, kata_hira, kata_kata), 0);
  EXPECT_LT(compare_through_strxfrm(as_cs_ks, hira_hira, hira_kata), 0);
  EXPECT_LT(compare_through_strxfrm(as_cs_ks, hira_kata, kata_hira), 0);
  EXPECT_LT(compare_through_strxfrm(as_cs_ks, kata_hira, kata_kata), 0);
}
// Checks the exact utf8mb4_ja_0900_as_cs weight strings for Hiragana HE
// (U+3078) followed by two different marks.
TEST(StrXfrmTest, JapaneseUTF8MB4_1) {
  CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs");
  // HE followed by U+3099.
  // NOTE(review): the original comment called this the "Handakuten mark";
  // U+3099 is the combining voiced sound mark -- confirm intended wording.
  const char *src1 = "\xE3\x81\xB8\xE3\x82\x99";
  // HE followed by U+309E (called "voiced length mark" in the original).
  const char *src2 = "\xE3\x81\xB8\xE3\x82\x9E";
  /*
    Original author's note: when the voiced length mark is after 'HE', it
    should sort before 'HE followed with Handakuten mark' on tertiary level.
  */
  static const unsigned char answer1[] = {0x1F, 0xD3, 0x00, 0x00, 0x00,
                                          0x20, 0x00, 0x37, 0x00, 0x00,
                                          0x00, 0x0E, 0x00, 0x02};
  static const unsigned char answer2[] = {
      0x1F, 0xD3, 0x1F, 0xD3, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00,
      0x37, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x01, 0x00, 0x21};
  unsigned char buf[32];

  // First input: transform into exactly sizeof(answer1) bytes.
  size_t buf_len = sizeof(answer1);
  memset(buf, 0xff, sizeof(buf));
  my_strnxfrm(cs, buf, buf_len, pointer_cast<const uchar *>(src1),
              strlen(src1));
  expect_arrays_equal(answer1, buf, buf_len);

  // Second input: transform into exactly sizeof(answer2) bytes.
  buf_len = sizeof(answer2);
  memset(buf, 0xff, sizeof(buf));
  my_strnxfrm(cs, buf, buf_len, pointer_cast<const uchar *>(src2),
              strlen(src2));
  expect_arrays_equal(answer2, buf, buf_len);
}
// Transforms "abc" followed by four trailing spaces with utf8mb4_0900_as_cs
// into output buffers of every even length below sizeof(buf), and checks
// each result against the matching prefix of the expected answer. The
// expected answer carries an explicit weight for each of the four spaces on
// every level.
TEST(StrXfrmTest, UTF8MB4PadCorrectness) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_cs");
  // Exactly four trailing spaces: full_answer below holds seven weights per
  // level (three letters plus four spaces), so a shorter input cannot match.
  const char *src = "abc    ";
  unsigned char buf[46];
  // Only the first 46 bytes are initialized explicitly, and only bytes below
  // sizeof(buf) are ever compared.
  static const unsigned char full_answer[52] = {
      0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a,  // abc
      0x02, 0x09, 0x02, 0x09, 0x02, 0x09,
      0x02, 0x09,                          // Four spaces.
      0x00, 0x00,                          // Level separator.
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20,  // Accents for abc.
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20,                          // Accents for four spaces.
      0x00, 0x00,                          // Level separator.
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02,  // Case for abc.
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02,  // Case for four spaces.
  };
  for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) {
    SCOPED_TRACE("maxlen=" + to_string(maxlen) + "/" + to_string(sizeof(buf)));
    // Pre-fill with 0xff so bytes the call did not write are detectable.
    memset(buf, 0xff, sizeof(buf));
    my_strnxfrm(cs, buf, maxlen, pointer_cast<const uchar *>(src), strlen(src));
    expect_arrays_equal(full_answer, buf, maxlen);
  }
}
// Calls the collation's strnxfrm() directly with a null source pointer of
// length zero and MY_STRXFRM_PAD_TO_MAXLEN, and expects the whole output
// buffer to come back as zero bytes.
TEST(StrXfrmTest, NullPointer) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
  unsigned char buf[256];
  // Start from a nonzero pattern so untouched bytes would be noticed.
  memset(buf, 0x33, sizeof(buf));
  cs->coll->strnxfrm(cs, buf, sizeof(buf), sizeof(buf), nullptr, 0,
                     MY_STRXFRM_PAD_TO_MAXLEN);
  for (const unsigned char byte : buf) {
    EXPECT_EQ(0, byte);
  }
}
// Benchmark based on reduced test case in Bug #83247 / #24788778.
//
// Transforms twelve fixed-width key columns with utf8_bin per iteration;
// only the first six columns carry content, the rest are empty.
//
// Note: This benchmark does not exercise any real multibyte characters;
// it is mostly exercising padding. If we change the test string to contain
// e.g. Japanese characters, performance goes down by ~20%.
static void BM_SimpleUTF8(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8_bin");

  static constexpr int key_cols = 12;
  static constexpr int set_key_cols = 6;  // Only the first half is set.
  static constexpr int key_col_chars = 80;
  static constexpr int bytes_per_char = 3;
  static constexpr int key_bytes = key_col_chars * bytes_per_char;
  static constexpr int buffer_bytes = key_cols * key_bytes;

  unsigned char source[buffer_bytes];
  unsigned char dest[buffer_bytes];

  const char *content = "PolyFilla27773";
  const int len = strlen(content);

  // Zero the whole source, then place the content at the start of each of
  // the populated columns.
  memset(source, 0, sizeof(source));
  for (int col = 0; col < set_key_cols; ++col) {
    memcpy(source + col * key_bytes, content, len);
  }

  StartBenchmarkTiming();
  for (size_t round = 0; round < num_iterations; ++round) {
    for (int col = 0; col < key_cols; ++col) {
      const int offset = col * key_bytes;
      if (col >= set_key_cols) {
        // Unpopulated column: zero-length source.
        my_strnxfrm(cs, dest + offset, key_bytes, source + offset, 0);
      } else {
        my_strnxfrm(cs, dest + offset, key_bytes, source + offset, len);
      }
    }
  }
  StopBenchmarkTiming();
}
BENCHMARK(BM_SimpleUTF8)
// Times my_charpos() when used to find the length of a string.
// hp_hash.c does this extensively. Not really a strnxfrm benchmark,
// but belongs to the same optimization effort.
static void BM_UTF8MB4StringLength(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");

  // Some English text, then some Norwegian text, then some Japanese,
  // and then a few emoji (the last with skin tone modifiers).
  const char *content =
      "Premature optimization is the root of all evil. "
      "Våre norske tegn bør æres. 日本語が少しわかります。 "
      "🐶👩🏽";
  const int len = strlen(content);
  const char *const content_end = content + len;

  // Accumulated so the result of each call is actually used.
  int tot_len = 0;

  StartBenchmarkTiming();
  size_t round = 0;
  while (round < num_iterations) {
    tot_len += my_charpos(cs, content, content_end, len / cs->mbmaxlen);
    ++round;
  }
  StopBenchmarkTiming();

  EXPECT_NE(0, tot_len);
  SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_UTF8MB4StringLength)
// Benchmark testing the default recommended collation for 8.0, without
// stressing padding as much, but still testing only Latin letters.
static void BM_SimpleUTF8MB4(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
const char *content =
"This is a rather long string that contains only "
"simple letters that are available in ASCII. This is a common special "
"case that warrants a benchmark on its own, even if the character set "
"and collation supports much more complicated scenarios.";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0x32,
0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1e, 0x33, 0x1c, 0x47,
0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x1e, 0x33, 0x02, 0x09, 0x1d, 0x77,
0x1d, 0xdd, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x09, 0x1e, 0x71, 0x1e, 0x95,
0x1e, 0x33, 0x1d, 0x32, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x09, 0x1e, 0x95,
0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd,
0x1d, 0xb9, 0x1e, 0x95, 0x1c, 0x47, 0x1d, 0x32, 0x1d, 0xb9, 0x1e, 0x71,
0x02, 0x09, 0x1d, 0xdd, 0x1d, 0xb9, 0x1d, 0x77, 0x1f, 0x0b, 0x02, 0x09,
0x1e, 0x71, 0x1d, 0x32, 0x1d, 0xaa, 0x1e, 0x0c, 0x1d, 0x77, 0x1c, 0xaa,
0x02, 0x09, 0x1d, 0x77, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95, 0x1c, 0xaa,
0x1e, 0x33, 0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0x47,
0x1e, 0x95, 0x02, 0x09, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09,
0x1c, 0x47, 0x1e, 0xe3, 0x1c, 0x47, 0x1d, 0x32, 0x1d, 0x77, 0x1c, 0x47,
0x1c, 0x60, 0x1d, 0x77, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0x32, 0x1d, 0xb9,
0x02, 0x09, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0x7a, 0x1d, 0x32, 0x1d, 0x32,
0x02, 0x77, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71,
0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09,
0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0xaa, 0x1d, 0xaa, 0x1d, 0xdd, 0x1d, 0xb9,
0x02, 0x09, 0x1e, 0x71, 0x1e, 0x0c, 0x1c, 0xaa, 0x1c, 0x7a, 0x1d, 0x32,
0x1c, 0x47, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71,
0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95,
0x02, 0x09, 0x1e, 0xf5, 0x1c, 0x47, 0x1e, 0x33, 0x1e, 0x33, 0x1c, 0x47,
0x1d, 0xb9, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09,
0x1c, 0x60, 0x1c, 0xaa, 0x1d, 0xb9, 0x1c, 0x7a, 0x1d, 0x18, 0x1d, 0xaa,
0x1c, 0x47, 0x1e, 0x33, 0x1d, 0x65, 0x02, 0x09, 0x1d, 0xdd, 0x1d, 0xb9,
0x02, 0x09, 0x1d, 0x32, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xdd,
0x1e, 0xf5, 0x1d, 0xb9, 0x02, 0x22, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3,
0x1c, 0xaa, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1c, 0xe5, 0x02, 0x09,
0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0x18,
0x1c, 0x47, 0x1e, 0x33, 0x1c, 0x47, 0x1c, 0x7a, 0x1e, 0x95, 0x1c, 0xaa,
0x1e, 0x33, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0xaa, 0x1e, 0x95, 0x02, 0x09,
0x1c, 0x47, 0x1d, 0xb9, 0x1c, 0x8f, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd,
0x1d, 0x77, 0x1d, 0x77, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd,
0x1d, 0xb9, 0x02, 0x09, 0x1e, 0x71, 0x1e, 0xb5, 0x1e, 0x0c, 0x1e, 0x0c,
0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xaa,
0x1e, 0xb5, 0x1c, 0x7a, 0x1d, 0x18, 0x02, 0x09, 0x1d, 0xaa, 0x1d, 0xdd,
0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0xaa,
0x1e, 0x0c, 0x1d, 0x77, 0x1d, 0x32, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x95,
0x1c, 0xaa, 0x1c, 0x8f, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0x7a, 0x1c, 0xaa,
0x1d, 0xb9, 0x1c, 0x47, 0x1e, 0x33, 0x1d, 0x32, 0x1d, 0xdd, 0x1e, 0x71,
0x02, 0x77};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_SimpleUTF8MB4)
// Benchmark testing a wider variety of character sets on a more complicated
// collation (the recommended default collation for 8.0), without stressing
// padding as much.
static void BM_MixedUTF8MB4(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
// Some English text, then some Norwegian text, then some Japanese,
// and then a few emoji (the last with skin tone modifiers).
const char *content =
"Premature optimization is the root of all evil. "
"Våre norske tegn bør æres. 日本語が少しわかります。 "
"🐶👩🏽";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x95,
0x1e, 0xb5, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x0c,
0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xaa, 0x1d, 0x32, 0x1f, 0x21, 0x1c, 0x47,
0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32,
0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09,
0x1e, 0x33, 0x1d, 0xdd, 0x1d, 0xdd, 0x1e, 0x95, 0x02, 0x09, 0x1d, 0xdd,
0x1c, 0xe5, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x77, 0x02, 0x09,
0x1c, 0xaa, 0x1e, 0xe3, 0x1d, 0x32, 0x1d, 0x77, 0x02, 0x77, 0x02, 0x09,
0x1e, 0xe3, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xb9,
0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x71, 0x1d, 0x65, 0x1c, 0xaa, 0x02, 0x09,
0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0xf4, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60,
0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0xaa, 0x1e, 0x33,
0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x77, 0x02, 0x09, 0xfb, 0x40, 0xe5, 0xe5,
0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, 0x3d, 0x60, 0xfb, 0x40,
0xdc, 0x11, 0x3d, 0x66, 0x3d, 0x87, 0x3d, 0x60, 0x3d, 0x83, 0x3d, 0x79,
0x3d, 0x67, 0x02, 0x8a, 0x02, 0x09, 0x0a, 0x2d, 0x13, 0xdf, 0x14, 0x12,
0x13, 0xa6};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_MixedUTF8MB4)
static void BM_MixedUTF8MB4_AS_CI(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci");
// Some English text, then some Norwegian text, then some Japanese,
// and then a few emoji (the last with skin tone modifiers).
const char *content =
"Premature optimization is the root of all evil. "
"Våre norske tegn bør æres. 日本語が少しわかります。 "
"🐶👩🏽";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x95,
0x1e, 0xb5, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x0c,
0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xaa, 0x1d, 0x32, 0x1f, 0x21, 0x1c, 0x47,
0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32,
0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09,
0x1e, 0x33, 0x1d, 0xdd, 0x1d, 0xdd, 0x1e, 0x95, 0x02, 0x09, 0x1d, 0xdd,
0x1c, 0xe5, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x77, 0x02, 0x09,
0x1c, 0xaa, 0x1e, 0xe3, 0x1d, 0x32, 0x1d, 0x77, 0x02, 0x77, 0x02, 0x09,
0x1e, 0xe3, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xb9,
0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x71, 0x1d, 0x65, 0x1c, 0xaa, 0x02, 0x09,
0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0xf4, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60,
0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0xaa, 0x1e, 0x33,
0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x77, 0x02, 0x09, 0xfb, 0x40, 0xe5, 0xe5,
0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, 0x3d, 0x60, 0xfb, 0x40,
0xdc, 0x11, 0x3d, 0x66, 0x3d, 0x87, 0x3d, 0x60, 0x3d, 0x83, 0x3d, 0x79,
0x3d, 0x67, 0x02, 0x8a, 0x02, 0x09, 0x0a, 0x2d, 0x13, 0xdf, 0x14, 0x12,
0x13, 0xa6, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x29, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x2F, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x01, 0x10, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_MixedUTF8MB4_AS_CI)
// Case-sensitive, accent-sensitive benchmark, using the same string as
// BM_SimpleUTF8MB4. This will naturally be slower, since many more weights
// need to be generated.
static void BM_MixedUTF8MB4_AS_CS(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_cs");
// Some English text, then some Norwegian text, then some Japanese,
// and then a few emoji (the last with skin tone modifiers).
const char *content =
"Premature optimization is the root of all evil. "
"Våre norske tegn bør æres. 日本語が少しわかります。 "
"🐶👩🏽";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
// Primary weights.
0x1e,
0x0c,
0x1e,
0x33,
0x1c,
0xaa,
0x1d,
0xaa,
0x1c,
0x47,
0x1e,
0x95,
0x1e,
0xb5,
0x1e,
0x33,
0x1c,
0xaa,
0x02,
0x09,
0x1d,
0xdd,
0x1e,
0x0c,
0x1e,
0x95,
0x1d,
0x32,
0x1d,
0xaa,
0x1d,
0x32,
0x1f,
0x21,
0x1c,
0x47,
0x1e,
0x95,
0x1d,
0x32,
0x1d,
0xdd,
0x1d,
0xb9,
0x02,
0x09,
0x1d,
0x32,
0x1e,
0x71,
0x02,
0x09,
0x1e,
0x95,
0x1d,
0x18,
0x1c,
0xaa,
0x02,
0x09,
0x1e,
0x33,
0x1d,
0xdd,
0x1d,
0xdd,
0x1e,
0x95,
0x02,
0x09,
0x1d,
0xdd,
0x1c,
0xe5,
0x02,
0x09,
0x1c,
0x47,
0x1d,
0x77,
0x1d,
0x77,
0x02,
0x09,
0x1c,
0xaa,
0x1e,
0xe3,
0x1d,
0x32,
0x1d,
0x77,
0x02,
0x77,
0x02,
0x09,
0x1e,
0xe3,
0x1c,
0x47,
0x1e,
0x33,
0x1c,
0xaa,
0x02,
0x09,
0x1d,
0xb9,
0x1d,
0xdd,
0x1e,
0x33,
0x1e,
0x71,
0x1d,
0x65,
0x1c,
0xaa,
0x02,
0x09,
0x1e,
0x95,
0x1c,
0xaa,
0x1c,
0xf4,
0x1d,
0xb9,
0x02,
0x09,
0x1c,
0x60,
0x1d,
0xdd,
0x1e,
0x33,
0x02,
0x09,
0x1c,
0x47,
0x1c,
0xaa,
0x1e,
0x33,
0x1c,
0xaa,
0x1e,
0x71,
0x02,
0x77,
0x02,
0x09,
0xfb,
0x40,
0xe5,
0xe5,
0xfb,
0x40,
0xe7,
0x2c,
0xfb,
0x41,
0x8a,
0x9e,
0x3d,
0x60,
0xfb,
0x40,
0xdc,
0x11,
0x3d,
0x66,
0x3d,
0x87,
0x3d,
0x60,
0x3d,
0x83,
0x3d,
0x79,
0x3d,
0x67,
0x02,
0x8a,
0x02,
0x09,
0x0a,
0x2d,
0x13,
0xdf,
0x14,
0x12,
0x13,
0xa6,
// Level separator.
0x00,
0x00,
// Secondary weights.
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x29,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x2f,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x01,
0x10,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x37,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
0x00,
0x20,
// Level separator.
0x00,
0x00,
// Tertiary weights.
0x00,
0x08,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x08,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x04,
0x00,
0x04,
0x00,
0x04,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x0e,
0x00,
0x02,
0x00,
0x02,
0x00,
0x0e,
0x00,
0x0e,
0x00,
0x0e,
0x00,
0x0e,
0x00,
0x0e,
0x00,
0x0e,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
0x00,
0x02,
};
uchar dest[sizeof(expected)];
size_t ret = 0;
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
ret = my_strnxfrm(cs, dest, sizeof(dest),
pointer_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
EXPECT_EQ(sizeof(expected), ret);
expect_arrays_equal(expected, dest, ret);
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_MixedUTF8MB4_AS_CS)
// Specifically benchmark Japanese text.
static void BM_JapaneseUTF8MB4(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
const char *content =
"データの保存とアクセスを行うストレージエンジンがSQLパーサとは"
"分離独立しており、用途に応じたストレージエンジンを選択できる"
"「マルチストレージエンジン」方式を採用している。";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x3d, 0x6d, 0x1c, 0x0e, 0x3d, 0x6a, 0x3d, 0x73, 0xfb, 0x40, 0xcf, 0xdd,
0xfb, 0x40, 0xdb, 0x58, 0x3d, 0x6e, 0x3d, 0x5a, 0x3d, 0x62, 0x3d, 0x68,
0x3d, 0x67, 0x3d, 0x8a, 0xfb, 0x41, 0x88, 0x4c, 0x3d, 0x5c, 0x3d, 0x67,
0x3d, 0x6e, 0x3d, 0x85, 0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e, 0x3d, 0x8b,
0x3d, 0x66, 0x3d, 0x8b, 0x3d, 0x60, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77,
0x3d, 0x74, 0x1c, 0x0e, 0x3d, 0x65, 0x3d, 0x6e, 0x3d, 0x74, 0xfb, 0x40,
0xd2, 0x06, 0xfb, 0x41, 0x96, 0xe2, 0xfb, 0x40, 0xf2, 0xec, 0xfb, 0x40,
0xfa, 0xcb, 0x3d, 0x66, 0x3d, 0x6d, 0x3d, 0x5f, 0x3d, 0x83, 0x02, 0x31,
0xfb, 0x40, 0xf5, 0x28, 0xfb, 0x41, 0x90, 0x14, 0x3d, 0x70, 0xfb, 0x40,
0xdf, 0xdc, 0x3d, 0x66, 0x3d, 0x6a, 0x3d, 0x67, 0x3d, 0x6e, 0x3d, 0x85,
0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e, 0x3d, 0x8b, 0x3d, 0x66, 0x3d, 0x8b,
0x3d, 0x8a, 0xfb, 0x41, 0x90, 0x78, 0xfb, 0x40, 0xe2, 0x9e, 0x3d, 0x6d,
0x3d, 0x61, 0x3d, 0x84, 0x03, 0x73, 0x3d, 0x79, 0x3d, 0x84, 0x3d, 0x6b,
0x3d, 0x67, 0x3d, 0x6e, 0x3d, 0x85, 0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e,
0x3d, 0x8b, 0x3d, 0x66, 0x3d, 0x8b, 0x03, 0x74, 0xfb, 0x40, 0xe5, 0xb9,
0xfb, 0x40, 0xdf, 0x0f, 0x3d, 0x8a, 0xfb, 0x40, 0xe3, 0xa1, 0xfb, 0x40,
0xf5, 0x28, 0x3d, 0x66, 0x3d, 0x6d, 0x3d, 0x5b, 0x3d, 0x84, 0x02, 0x8a};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_JapaneseUTF8MB4)
/*
A benchmark that illustrates the potential perils of not including the
range [0x00,0x20) in our fast path; newlines throw us off the fast path
and reduce speed.
The newlines are spaced a bit randomly in order not to create a perfectly
predictable pattern for the branch predictor (benchmark paranoia).
*/
static void BM_NewlineFilledUTF8MB4(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
const char *content =
"This is a\n prett\ny unrealist\nic case; a\nn "
"Eng\nlish sente\nnce where\n we'\nve added a new\nline every te\nn "
"bytes or\n so.\n";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0x32,
0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x02, 0x02, 0x09, 0x1e, 0x0c,
0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95, 0x02, 0x02, 0x1f, 0x0b,
0x02, 0x09, 0x1e, 0xb5, 0x1d, 0xb9, 0x1e, 0x33, 0x1c, 0xaa, 0x1c, 0x47,
0x1d, 0x77, 0x1d, 0x32, 0x1e, 0x71, 0x1e, 0x95, 0x02, 0x02, 0x1d, 0x32,
0x1c, 0x7a, 0x02, 0x09, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0xaa,
0x02, 0x34, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x02, 0x1d, 0xb9, 0x02, 0x09,
0x1c, 0xaa, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32,
0x1e, 0x71, 0x1d, 0x18, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0xaa, 0x1d, 0xb9,
0x1e, 0x95, 0x1c, 0xaa, 0x02, 0x02, 0x1d, 0xb9, 0x1c, 0x7a, 0x1c, 0xaa,
0x02, 0x09, 0x1e, 0xf5, 0x1d, 0x18, 0x1c, 0xaa, 0x1e, 0x33, 0x1c, 0xaa,
0x02, 0x02, 0x02, 0x09, 0x1e, 0xf5, 0x1c, 0xaa, 0x03, 0x05, 0x02, 0x02,
0x1e, 0xe3, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1c, 0x8f,
0x1c, 0xaa, 0x1c, 0x8f, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1d, 0xb9,
0x1c, 0xaa, 0x1e, 0xf5, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32, 0x1d, 0xb9,
0x1c, 0xaa, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1c, 0xaa, 0x1e, 0x33,
0x1f, 0x0b, 0x02, 0x09, 0x1e, 0x95, 0x1c, 0xaa, 0x02, 0x02, 0x1d, 0xb9,
0x02, 0x09, 0x1c, 0x60, 0x1f, 0x0b, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x71,
0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x02, 0x02, 0x09, 0x1e, 0x71,
0x1d, 0xdd, 0x02, 0x77, 0x02, 0x02};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_NewlineFilledUTF8MB4)
/*
  Times the hash_sort() function of the utf8mb4_0900_ai_ci collation on a
  long string made up of ASCII characters only. The two hash accumulators
  are carried over from one iteration to the next.
*/
static void BM_HashSimpleUTF8MB4(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
  const char *content =
      "This is a rather long string that contains only "
      "simple letters that are available in ASCII. This is a common special "
      "case that warrants a benchmark on its own, even if the character set "
      "and collation supports much more complicated scenarios.";
  // strlen() returns size_t; keep that type instead of narrowing to int.
  const size_t len = strlen(content);

  uint64 nr1 = 1, nr2 = 4;
  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    cs->coll->hash_sort(cs, reinterpret_cast<const uchar *>(content), len, &nr1,
                        &nr2);
  }
  StopBenchmarkTiming();

  /*
    Just to keep the compiler from optimizing away everything; this is highly
    unlikely to ever happen given hash function that's not totally broken.
    Don't test for an exact value; it will vary by platform and number
    of iterations.
  */
  EXPECT_FALSE(nr1 == 0 && nr2 == 0);
}
BENCHMARK(BM_HashSimpleUTF8MB4)
/*
  Test a non-trivial collation with contractions, to highlight
  the performance difference.

  Times my_strnxfrm() with utf8mb4_hu_0900_as_cs, then checks both the
  returned length and the produced weight string against recorded values.
*/
static void BM_Hungarian_AS_CS(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_hu_0900_as_cs");
  // Text snippet from Wikipedia.
  const char *content =
      "A MySQL adatbázisok adminisztrációjára a mellékelt "
      "parancssori eszközöket (mysql és mysqladmin) használhatjuk.";
  // strlen() returns size_t; keep that type instead of narrowing to int.
  const size_t len = strlen(content);

  // Just recorded from a trial run on the string above.
  static constexpr uchar expected[] = {
      0x1c, 0x47, 0x02, 0x09, 0x1d, 0xaa, 0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21,
      0x1d, 0x77, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1c, 0x47, 0x1e, 0x95,
      0x1c, 0x60, 0x1c, 0x47, 0x1f, 0x21, 0x1d, 0x32, 0x1e, 0x71, 0x1d, 0xdd,
      0x1d, 0x65, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1d, 0xaa, 0x1d, 0x32,
      0x1d, 0xb9, 0x1d, 0x32, 0x1e, 0x71, 0x54, 0xa5, 0x1e, 0x95, 0x1e, 0x33,
      0x1c, 0x47, 0x1c, 0x7a, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0x4c, 0x1c, 0x47,
      0x1e, 0x33, 0x1c, 0x47, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1d, 0xaa,
      0x1c, 0xaa, 0x1d, 0x77, 0x1d, 0x77, 0x1c, 0xaa, 0x1d, 0x65, 0x1c, 0xaa,
      0x1d, 0x77, 0x1e, 0x95, 0x02, 0x09, 0x1e, 0x0c, 0x1c, 0x47, 0x1e, 0x33,
      0x1c, 0x47, 0x1d, 0xb9, 0x1c, 0x7a, 0x54, 0xa5, 0x1e, 0x71, 0x1d, 0xdd,
      0x1e, 0x33, 0x1d, 0x32, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0x71, 0x54, 0xa5,
      0x1d, 0x65, 0x1d, 0xdd, 0x54, 0xa5, 0x1f, 0x21, 0x1d, 0xdd, 0x54, 0xa5,
      0x1d, 0x65, 0x1c, 0xaa, 0x1e, 0x95, 0x02, 0x09, 0x03, 0x17, 0x1d, 0xaa,
      0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0xaa,
      0x1e, 0x71, 0x02, 0x09, 0x1d, 0xaa, 0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21,
      0x1d, 0x77, 0x1c, 0x47, 0x1c, 0x8f, 0x1d, 0xaa, 0x1d, 0x32, 0x1d, 0xb9,
      0x03, 0x18, 0x02, 0x09, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x71, 0x54, 0xa5,
      0x1d, 0xb9, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95,
      0x1d, 0x4c, 0x1e, 0xb5, 0x1d, 0x65, 0x02, 0x77, 0x00, 0x00, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x08,
      0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02};
  uchar dest[sizeof(expected)] = {0};

  size_t ret = 0;
  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    ret = my_strnxfrm(cs, dest, sizeof(dest),
                      pointer_cast<const uchar *>(content), len);
  }
  StopBenchmarkTiming();

  // The length is checked first; only the returned number of bytes is compared.
  EXPECT_EQ(sizeof(expected), ret);
  expect_arrays_equal(expected, dest, ret);
  SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_Hungarian_AS_CS)
static void BM_Japanese_AS_CS(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs");
const char *content =
"サーバー SQL モードの設定方法。この設定は、たとえば"
"別のデータベースシステムからのコードとの互換性を保ったり、特定の状況に"
"ついてのエラー処理を制御したりするために、SQL の構文およびセマンティクス"
"の特定の側面を変更します。";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xD0, 0x1F, 0xB6, 0x02, 0x09, 0x1E, 0x71,
0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xD9, 0x1F, 0xBB, 0x1F, 0xCA,
0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x5E, 0x8C, 0x5E, 0x8E, 0x02, 0x8A,
0x1F, 0xC0, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x1F, 0xD0, 0x02, 0x31,
0x1F, 0xC6, 0x1F, 0xCA, 0x1F, 0xBA, 0x1F, 0xD0, 0x5E, 0x5B, 0x1F, 0xCF,
0x1F, 0xC9, 0x1F, 0xBA, 0x1F, 0xC6, 0x1F, 0xD3, 0x1F, 0xBA, 0x1F, 0xC3,
0x1F, 0xC2, 0x1F, 0xC3, 0x1F, 0xC9, 0x1F, 0xD7, 0x1F, 0xBC, 0x1F, 0xDE,
0x1F, 0xCF, 0x1F, 0xC0, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCA, 0x1F, 0xCF,
0x57, 0xD2, 0x56, 0x34, 0x5A, 0x90, 0x1F, 0xE6, 0x5E, 0x6C, 0x1F, 0xC8,
0x1F, 0xC6, 0x1F, 0xDF, 0x02, 0x31, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF,
0x5A, 0x1C, 0x56, 0xEE, 0x1F, 0xCC, 0x1F, 0xC8, 0x1F, 0xB7, 0x1F, 0xC9,
0x1F, 0xCF, 0x1F, 0xBA, 0x1F, 0xDE, 0x1F, 0xB6, 0x59, 0xB1, 0x5F, 0xA6,
0x1F, 0xE6, 0x5A, 0x8C, 0x57, 0xD9, 0x1F, 0xC2, 0x1F, 0xC6, 0x1F, 0xDF,
0x1F, 0xC3, 0x1F, 0xE0, 0x1F, 0xC6, 0x1F, 0xD8, 0x1F, 0xCC, 0x02, 0x31,
0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xCF, 0x58, 0x0E,
0x5E, 0x47, 0x1F, 0xBB, 0x1F, 0xDD, 0x1F, 0xD1, 0x1F, 0xC4, 0x1F, 0xD5,
0x1F, 0xE7, 0x1F, 0xC9, 0x1F, 0xB7, 0x1F, 0xBE, 0x1F, 0xC3, 0x1F, 0xCF,
0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5B, 0x45, 0x5F, 0x17, 0x1F, 0xE6,
0x5E, 0x60, 0x58, 0x0A, 0x1F, 0xC2, 0x1F, 0xD5, 0x1F, 0xC3, 0x02, 0x8A,
0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x08,
0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C,
0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_Japanese_AS_CS)
static void BM_Japanese_AS_CS_KS(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs_ks");
const char *content =
"サーバー SQL モードの設定方法。この設定は、たとえば"
"別のデータベースシステムからのコードとの互換性を保ったり、特定の状況に"
"ついてのエラー処理を制御したりするために、SQL の構文およびセマンティクス"
"の特定の側面を変更します。";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xD0, 0x1F, 0xB6, 0x02, 0x09, 0x1E, 0x71,
0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xD9, 0x1F, 0xBB, 0x1F, 0xCA,
0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x5E, 0x8C, 0x5E, 0x8E, 0x02, 0x8A,
0x1F, 0xC0, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x1F, 0xD0, 0x02, 0x31,
0x1F, 0xC6, 0x1F, 0xCA, 0x1F, 0xBA, 0x1F, 0xD0, 0x5E, 0x5B, 0x1F, 0xCF,
0x1F, 0xC9, 0x1F, 0xBA, 0x1F, 0xC6, 0x1F, 0xD3, 0x1F, 0xBA, 0x1F, 0xC3,
0x1F, 0xC2, 0x1F, 0xC3, 0x1F, 0xC9, 0x1F, 0xD7, 0x1F, 0xBC, 0x1F, 0xDE,
0x1F, 0xCF, 0x1F, 0xC0, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCA, 0x1F, 0xCF,
0x57, 0xD2, 0x56, 0x34, 0x5A, 0x90, 0x1F, 0xE6, 0x5E, 0x6C, 0x1F, 0xC8,
0x1F, 0xC6, 0x1F, 0xDF, 0x02, 0x31, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF,
0x5A, 0x1C, 0x56, 0xEE, 0x1F, 0xCC, 0x1F, 0xC8, 0x1F, 0xB7, 0x1F, 0xC9,
0x1F, 0xCF, 0x1F, 0xBA, 0x1F, 0xDE, 0x1F, 0xB6, 0x59, 0xB1, 0x5F, 0xA6,
0x1F, 0xE6, 0x5A, 0x8C, 0x57, 0xD9, 0x1F, 0xC2, 0x1F, 0xC6, 0x1F, 0xDF,
0x1F, 0xC3, 0x1F, 0xE0, 0x1F, 0xC6, 0x1F, 0xD8, 0x1F, 0xCC, 0x02, 0x31,
0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xCF, 0x58, 0x0E,
0x5E, 0x47, 0x1F, 0xBB, 0x1F, 0xDD, 0x1F, 0xD1, 0x1F, 0xC4, 0x1F, 0xD5,
0x1F, 0xE7, 0x1F, 0xC9, 0x1F, 0xB7, 0x1F, 0xBE, 0x1F, 0xC3, 0x1F, 0xCF,
0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5B, 0x45, 0x5F, 0x17, 0x1F, 0xE6,
0x5E, 0x60, 0x58, 0x0A, 0x1F, 0xC2, 0x1F, 0xD5, 0x1F, 0xC3, 0x02, 0x8A,
0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x08,
0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C,
0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x0E, 0x00, 0x02, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E,
0x00, 0x0E, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02,
0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x00,
0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08,
0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08,
0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_Japanese_AS_CS_KS)
/*
  Runs my_strnxfrm() with utf8mb4_zh_0900_as_cs on a fixed input, once for
  every even output length from zero up to (but not including) the size of
  the recorded answer. Each result is compared against the leading bytes of
  that answer.
*/
TEST(StrXfrmTest, ChineseUTF8MB4) {
  CHARSET_INFO *collation = init_collation("utf8mb4_zh_0900_as_cs");
  const char *input =
      "\xE9\x98\xBF\xE5\x92\x97"  // The first and last Han character in zh.xml
      "\xF0\xAC\xBA\xA1"          // The last Han character
      "\xC4\x81\x61\x62\xC5\xAB\x75\x55\xC7\x96\x5A"  // Some latin characters
                                                      // are used as Bopomofo.
      "\xF0\x94\x99\x86"  // The last character that has explicit weight
                          // in the DUCET.
      /* Non-Han characters that have implicit weight. */
      "\xF0\x97\x86\xA0\xF0\xAC\xBA\xA2\xF0\xAE\xAF\xA0\xF0\xB3\x8C\xB3";
  // The input never changes, so its length is measured only once.
  const size_t input_length = strlen(input);

  static const unsigned char full_answer_with_pad[116] = {
      // level 1
      0x1C, 0x47, 0xBD, 0xBE,  // The first and last Han character in zh.xml
      0xBD, 0xC3, 0xCE, 0xA1,  // The last Han character
      /* Latin characters. Some are used as Bopomofo. */
      0xBD, 0xC4, 0xBD, 0xC4, 0xBD, 0xDD, 0xC0, 0x32, 0xC0, 0x32, 0xC0, 0x32,
      0xC0, 0x32, 0xC0, 0x9E,
      0xF6, 0x20,  // The last character that has explicit weight in the DUCET.
      /* Non-Han characters that have implicit weight. */
      0xF6, 0x21, 0x81, 0xA0, 0xF6, 0x27, 0xCE, 0xA2, 0xF6, 0x27, 0xEB, 0xE0,
      0xF6, 0x28, 0xB3, 0x33,
      // level separator.
      0x00, 0x00,
      // level 2
      0x00, 0x20, 0x00, 0x20,  // The first and last Han character in zh.xml
      0x00, 0x20,              // The last Han character
      /* Latin characters. Some are used as Bopomofo. */
      0x00, 0x1F, 0x01, 0x16, 0x00, 0x20, 0x00, 0x20, 0x00, 0x1F, 0x01, 0x16,
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x01, 0x16, 0x00, 0x20,
      0x00, 0x20,  // The last character that has explicit weight in the DUCET.
      /* Non-Han characters that have implicit weight. */
      0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
      // level separator.
      0x00, 0x00,
      // level 3
      0x00, 0x02, 0x00, 0x02,  // The first and last Han character in zh.xml
      0x00, 0x02,              // The last Han character
      /* Latin characters. Some are used as Bopomofo. */
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08,
      0x00, 0x08, 0x00, 0x08,
      0x00, 0x02,  // The last character that has explicit weight in the DUCET.
      /* Non-Han characters that have implicit weight. */
      0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02};

  unsigned char output[sizeof(full_answer_with_pad)];
  size_t limit = 0;
  while (limit < sizeof(output)) {
    // Refill with 0xff before every run so nothing is left over from the
    // previous, shorter run.
    memset(output, 0xff, sizeof(output));
    my_strnxfrm(collation, output, limit, pointer_cast<const uchar *>(input),
                input_length);
    expect_arrays_equal(full_answer_with_pad, output, limit);
    limit += 2;
  }
}
static void BM_Chinese_AS_CS(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_zh_0900_as_cs");
const char *content =
"春江潮水连海平,海上明月共潮生。"
"滟滟随波千万里,何处春江无月明!"
"江流宛转绕芳甸,月照花林皆似霰;"
"空里流霜不觉飞,汀上白沙看不见。"
"江天一色无纤尘,皎皎空中孤月轮。"
"江畔何人初见月?江月何年初照人?"
"人生代代无穷已,江月年年只相似。"
"不知江月待何人,但见长江送流水。"
"白云一片去悠悠,青枫浦上不胜愁。"
"谁家今夜扁舟子?何处相思明月楼?";
const int len = strlen(content);
// Just recorded from a trial run on the string above.
static constexpr uchar expected[] = {
0x2C, 0xD0, 0x4F, 0xF1, 0x28, 0x08, 0x87, 0xE8, 0x60, 0x4C, 0x42, 0xEF,
0x75, 0x93, 0x02, 0x22, 0x42, 0xEF, 0x83, 0x8A, 0x6C, 0x4F, 0xAF, 0x96,
0x3F, 0x58, 0x28, 0x08, 0x84, 0xCF, 0x02, 0x8A, 0xA3, 0xA4, 0xA3, 0xA4,
0x8A, 0x5F, 0x23, 0x71, 0x78, 0xA8, 0x93, 0x1A, 0x5E, 0xD9, 0x02, 0x22,
0x44, 0xAC, 0x2B, 0xD5, 0x2C, 0xD0, 0x4F, 0xF1, 0x96, 0x31, 0xAF, 0x96,
0x6C, 0x4F, 0x02, 0x60, 0x4F, 0xF1, 0x63, 0x7B, 0x92, 0xDD, 0xBA, 0x2E,
0x7F, 0x07, 0x39, 0x15, 0x32, 0xB2, 0x02, 0x22, 0xAF, 0x96, 0xB4, 0x41,
0x47, 0xD7, 0x62, 0x27, 0x51, 0x4C, 0x85, 0xE9, 0x81, 0x86, 0x02, 0x34,
0x59, 0x09, 0x5E, 0xD9, 0x63, 0x7B, 0x87, 0xBA, 0x24, 0x78, 0x56, 0x5A,
0x39, 0x48, 0x02, 0x22, 0x8F, 0x74, 0x83, 0x8A, 0x1E, 0x4D, 0x82, 0x46,
0x57, 0xD9, 0x24, 0x78, 0x4F, 0x79, 0x02, 0x8A, 0x4F, 0xF1, 0x8E, 0x8A,
0xA6, 0x3E, 0x81, 0xEE, 0x96, 0x31, 0x99, 0x9E, 0x28, 0x97, 0x02, 0x22,
0x50, 0xC2, 0x50, 0xC2, 0x59, 0x09, 0xB8, 0x20, 0x3F, 0xCC, 0xAF, 0x96,
0x66, 0xC9, 0x02, 0x8A, 0x4F, 0xF1, 0x72, 0xB6, 0x44, 0xAC, 0x7F, 0x11,
0x2B, 0x7B, 0x4F, 0x79, 0xAF, 0x96, 0x02, 0x66, 0x4F, 0xF1, 0xAF, 0x96,
0x44, 0xAC, 0x6F, 0xD5, 0x2B, 0x7B, 0xB4, 0x41, 0x7F, 0x11, 0x02, 0x66,
0x7F, 0x11, 0x84, 0xCF, 0x2F, 0xE2, 0x2F, 0xE2, 0x96, 0x31, 0x7B, 0xE1,
0xA7, 0x41, 0x02, 0x22, 0x4F, 0xF1, 0xAF, 0x96, 0x6F, 0xD5, 0x6F, 0xD5,
0xB6, 0xC3, 0x9B, 0x15, 0x85, 0xE9, 0x02, 0x8A, 0x24, 0x78, 0xB6, 0x2E,
0x4F, 0xF1, 0xAF, 0x96, 0x2F, 0xF4, 0x44, 0xAC, 0x7F, 0x11, 0x02, 0x22,
0x30, 0x86, 0x4F, 0x79, 0xB3, 0xDD, 0x4F, 0xF1, 0x89, 0x2A, 0x63, 0x7B,
0x87, 0xE8, 0x02, 0x8A, 0x1E, 0x4D, 0xB0, 0x1B, 0xA6, 0x3E, 0x75, 0x00,
0x7D, 0x93, 0xAB, 0xAF, 0xAB, 0xAF, 0x02, 0x22, 0x7B, 0x7D, 0x3A, 0x63,
0x76, 0xA2, 0x83, 0x8A, 0x24, 0x78, 0x85, 0x16, 0x2B, 0x2D, 0x02, 0x8A,
0x84, 0x30, 0x4D, 0xF3, 0x52, 0x63, 0xA5, 0xC7, 0x21, 0xE0, 0xB8, 0x87,
0xBC, 0x16, 0x02, 0x66, 0x44, 0xAC, 0x2B, 0xD5, 0x9B, 0x15, 0x88, 0x52,
0x6C, 0x4F, 0xAF, 0x96, 0x64, 0xA1, 0x02, 0x66, 0x00, 0x00, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20,
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03,
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
0x00, 0x02, 0x00, 0x03};
uchar dest[sizeof(expected)];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
expect_arrays_equal(expected, dest, sizeof(dest));
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_Chinese_AS_CS)
/*
  Benchmark my_strnxfrm() for the utf8mb4_bin collation and verify the
  result against a recorded weight string.

  Judging from the recorded weights, every character is expected to map to
  its code point stored as three big-endian bytes.

  @param num_iterations  Number of times the transformation is repeated
                         inside the timed region.
*/
static void BM_UTF8MB4_bin(size_t num_iterations) {
  StopBenchmarkTiming();
  CHARSET_INFO *cs = init_collation("utf8mb4_bin");
  /*
    The recorded weights end with U+270C U+FE0F U+1F436 U+1F469 U+1F3FD.
    The first two code points are spelled as UTF-8 hex escapes so that the
    input always contains them and stays in sync with the table below.
  */
  const char *content =
      "Premature optimization is the root of all evil. "
      "Våre norske tegn bør æres. 日本語が少しわかります。 "
      "\xE2\x9C\x8C\xEF\xB8\x8F"  // U+270C U+FE0F
      "🐶👩🏽";
  const size_t len = strlen(content);
  // Just recorded from a trial run on the string above.
  static constexpr uchar expected[] = {
      0x00, 0x00, 0x50, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65, 0x00, 0x00, 0x6D,
      0x00, 0x00, 0x61, 0x00, 0x00, 0x74, 0x00, 0x00, 0x75, 0x00, 0x00, 0x72,
      0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x70,
      0x00, 0x00, 0x74, 0x00, 0x00, 0x69, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x69,
      0x00, 0x00, 0x7A, 0x00, 0x00, 0x61, 0x00, 0x00, 0x74, 0x00, 0x00, 0x69,
      0x00, 0x00, 0x6F, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x20, 0x00, 0x00, 0x69,
      0x00, 0x00, 0x73, 0x00, 0x00, 0x20, 0x00, 0x00, 0x74, 0x00, 0x00, 0x68,
      0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x72, 0x00, 0x00, 0x6F,
      0x00, 0x00, 0x6F, 0x00, 0x00, 0x74, 0x00, 0x00, 0x20, 0x00, 0x00, 0x6F,
      0x00, 0x00, 0x66, 0x00, 0x00, 0x20, 0x00, 0x00, 0x61, 0x00, 0x00, 0x6C,
      0x00, 0x00, 0x6C, 0x00, 0x00, 0x20, 0x00, 0x00, 0x65, 0x00, 0x00, 0x76,
      0x00, 0x00, 0x69, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x20,
      0x00, 0x00, 0x56, 0x00, 0x00, 0xE5, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65,
      0x00, 0x00, 0x20, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x72,
      0x00, 0x00, 0x73, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x65, 0x00, 0x00, 0x20,
      0x00, 0x00, 0x74, 0x00, 0x00, 0x65, 0x00, 0x00, 0x67, 0x00, 0x00, 0x6E,
      0x00, 0x00, 0x20, 0x00, 0x00, 0x62, 0x00, 0x00, 0xF8, 0x00, 0x00, 0x72,
      0x00, 0x00, 0x20, 0x00, 0x00, 0xE6, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65,
      0x00, 0x00, 0x73, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x20, 0x00, 0x65, 0xE5,
      0x00, 0x67, 0x2C, 0x00, 0x8A, 0x9E, 0x00, 0x30, 0x4C, 0x00, 0x5C, 0x11,
      0x00, 0x30, 0x57, 0x00, 0x30, 0x8F, 0x00, 0x30, 0x4B, 0x00, 0x30, 0x8A,
      0x00, 0x30, 0x7E, 0x00, 0x30, 0x59, 0x00, 0x30, 0x02, 0x00, 0x00, 0x20,
      0x00, 0x27, 0x0C, 0x00, 0xFE, 0x0F, 0x01, 0xF4, 0x36, 0x01, 0xF4, 0x69,
      0x01, 0xF3, 0xFD};
  // The output buffer is sized to hold exactly the recorded weight string.
  uchar dest[sizeof(expected)];
  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    my_strnxfrm(cs, dest, sizeof(dest),
                reinterpret_cast<const uchar *>(content), len);
  }
  StopBenchmarkTiming();
  // Verification happens outside the timed region.
  expect_arrays_equal(expected, dest, sizeof(dest));
  SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_UTF8MB4_bin)
static void BM_UTF8MB4_0900_bin(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("utf8mb4_0900_bin");
const char *content =
"Premature optimization is the root of all evil. "
"Våre norske tegn bør æres. 日本語が少しわかります。 "
"🐶👩🏽";
const int len = strlen(content);
uchar *dest = new uchar[len];
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
my_strnxfrm(cs, dest, len, reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();
/*
utf8mb4_0900_bin_nopad gives the weight that has same bytes and length as
source string.
*/
expect_arrays_equal((const uchar *)content, dest, len);
delete[] dest;
SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_UTF8MB4_0900_bin)
// The classic MySQL latin1 collation, for reference.
static void BM_Latin1_CI(size_t num_iterations) {
StopBenchmarkTiming();
CHARSET_INFO *cs = init_collation("latin1_swedish_ci");
const char *content =
"Alla människor är födda fria och lika i värde "
"och rättigheter. De är utrustade med förnuft och samvete och bör "
"handla gentemot varandra i en anda av broderskap.";
const int len = strlen(content);
/*
Just recorded from a trial run on the string above.
The entire last row is padding.
*/
static constexpr uchar expected[] = {
0x41, 0x4c, 0x4c, 0x41, 0x20, 0x4d, 0x41, 0xa4, 0x4e, 0x4e, 0x49, 0x53,
0x4b, 0x4f, 0x52, 0x20, 0x41, 0xa4, 0x52, 0x20, 0x46, 0x41, 0xb6, 0x44,
0x44, 0x41, 0x20, 0x46, 0x52, 0x49, 0x41, 0x20, 0x4f, 0x43, 0x48, 0x20,
0x4c, 0x49, 0x4b, 0x41, 0x20, 0x49, 0x20, 0x56, 0x41, 0xa4, 0x52, 0x44,
0x45, 0x20, 0x4f, 0x43, 0x48, 0x20, 0x52, 0x41, 0xa4, 0x54, 0x54, 0x49,
0x47, 0x48, 0x45, 0x54, 0x45, 0x52, 0x2e, 0x20, 0x44, 0x45, 0x20, 0x41,
0xa4, 0x52, 0x20, 0x55, 0x54, 0x52, 0x55, 0x53, 0x54, 0x41, 0x44, 0x45,
0x20, 0x4d, 0x45, 0x44, 0x20, 0x46, 0x41, 0xb6, 0x52, 0x4e, 0x55, 0x46,
0x54, 0x20, 0x4f, 0x43, 0x48, 0x20, 0x53, 0x41, 0x4d, 0x56, 0x45, 0x54,
0x45, 0x20, 0x4f, 0x43, 0x48, 0x20, 0x42, 0x41, 0xb6, 0x52, 0x20, 0x48,
0x41, 0x4e, 0x44, 0x4c, 0x41, 0x20, 0x47, 0x45, 0x4e, 0x54, 0x45, 0x4d,
0x4f, 0x54, 0x20, 0x56, 0x41, 0x52, 0x41, 0x4e, 0x44, 0x52, 0x41, 0x20,
0x49, 0x20, 0x45, 0x4e, 0x20, 0x41, 0x4e, 0x44, 0x41, 0x20, 0x41, 0x56,
0x20, 0x42, 0x52, 0x4f, 0x44, 0x45, 0x52, 0x53, 0x4b, 0x41, 0x50, 0x2e,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
};
uchar dest[sizeof(expected)];
size_t ret = 0;
StartBenchmarkTiming();
for (size_t i = 0; i < num_iterations; ++i) {
ret = cs->coll->strnxfrm(cs, dest, sizeof(dest), sizeof(dest),
pointer_cast<const uchar *>(content), len,
MY_STRXFRM_PAD_TO_MAXLEN);
}
StopBenchmarkTiming();
EXPECT_EQ(sizeof(expected), ret);
expect_arrays_equal(expected, dest, ret);
SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_Latin1_CI)
// Since the UCA collations are NO PAD, strnncollsp should heed spaces.
TEST(PadCollationTest, BasicTest) {
  constexpr char foo[] = "foo";
  constexpr char foosp[] = "foo ";
  constexpr char bar[] = "bar";
  constexpr char foobar[] = "foobar";

  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");

  // Compares two NUL-terminated strings through the collation's
  // strnncollsp() handler; the sign of the result gives the ordering.
  const auto collate = [cs](const char *lhs, const char *rhs) {
    return cs->coll->strnncollsp(cs, pointer_cast<const uchar *>(lhs),
                                 strlen(lhs), pointer_cast<const uchar *>(rhs),
                                 strlen(rhs));
  };

  // A string is equal to itself.
  EXPECT_EQ(collate(foo, foo), 0);
  // A trailing space makes the right-hand side sort later.
  EXPECT_LT(collate(foo, foosp), 0);
  // Ordinary ordering on the first character.
  EXPECT_GT(collate(foo, bar), 0);
  // A proper prefix sorts before the longer string.
  EXPECT_LT(collate(foo, foobar), 0);

  // The same pairs with the operands swapped must give the opposite sign.
  EXPECT_GT(collate(foosp, foo), 0);
  EXPECT_LT(collate(bar, foo), 0);
  EXPECT_GT(collate(foobar, foo), 0);
}
/*
  Compares strings through their strxfrm weight strings for three
  utf8mb4_0900 collations, with a focus on how spaces are treated.
*/
TEST(StrxfrmTest, NoPadCollation) {
  CHARSET_INFO *coll_ai_ci = init_collation("utf8mb4_0900_ai_ci");
  CHARSET_INFO *coll_as_cs = init_collation("utf8mb4_0900_as_cs");
  CHARSET_INFO *coll_as_ci = init_collation("utf8mb4_0900_as_ci");

  // Sanity: identical strings are equal, clearly different ones are not.
  EXPECT_EQ(compare_through_strxfrm(coll_ai_ci, "abc", "abc"), 0);
  EXPECT_NE(compare_through_strxfrm(coll_as_ci, "abc", "Ǎḅç"), 0);
  EXPECT_NE(compare_through_strxfrm(coll_ai_ci, "abc", "def"), 0);
  EXPECT_NE(compare_through_strxfrm(coll_as_ci, "abc", "def"), 0);

  // Trailing spaces are significant in every one of these collations.
  EXPECT_LT(compare_through_strxfrm(coll_ai_ci, "abc", "abc "), 0);
  EXPECT_LT(compare_through_strxfrm(coll_as_ci, "abc", "Ǎḅç "), 0);
  EXPECT_LT(compare_through_strxfrm(coll_as_cs, "abc", "abc "), 0);
  EXPECT_LT(compare_through_strxfrm(coll_as_cs, "abc", "Abc "), 0);

  // That also holds for other kinds of spaces.
  EXPECT_LT(compare_through_strxfrm(coll_ai_ci, "abc", u8"abc \u00a0"), 0);

  // A non-breaking space is _equal_ to a plain space in ai_ci and as_ci,
  // but sorts _after_ it in as_cs.
  for (CHARSET_INFO *cs : {coll_ai_ci, coll_as_ci}) {
    SCOPED_TRACE(cs->name);
    EXPECT_EQ(compare_through_strxfrm(cs, "abc ", u8"abc\u00a0"), 0);
  }
  EXPECT_LT(compare_through_strxfrm(coll_as_cs, "abc ", u8"abc\u00a0"), 0);

  // The same relation when the space sits in the middle of the string.
  for (CHARSET_INFO *cs : {coll_ai_ci, coll_as_ci}) {
    SCOPED_TRACE(cs->name);
    EXPECT_EQ(compare_through_strxfrm(cs, "a c", u8"a\u00a0c"), 0);
  }
  EXPECT_LT(compare_through_strxfrm(coll_as_cs, "a c", u8"a\u00a0c"), 0);

  // An embedded space must not be stripped away.
  for (CHARSET_INFO *cs : {coll_ai_ci, coll_as_ci, coll_as_cs}) {
    SCOPED_TRACE(cs->name);
    EXPECT_LT(compare_through_strxfrm(cs, "ab c", "abc"), 0);
  }

  // Whitespace ordering as specified by DUCET.
  for (CHARSET_INFO *cs : {coll_as_ci, coll_as_cs}) {
    SCOPED_TRACE(cs->name);
    EXPECT_GT(compare_through_strxfrm(cs, " ", "\t"), 0);
  }
  EXPECT_LT(compare_through_strxfrm(coll_as_cs, "", "\t"), 0);
}
/*
  Checks that the Hungarian collation treats the digraph "cs" as a letter
  of its own, placed between "c" and "d".
*/
TEST(StrxfrmTest, Contractions) {
  CHARSET_INFO *hungarian = init_collation("utf8mb4_hu_0900_ai_ci");

  // Shorthand: compare two strings through their weight strings.
  const auto cmp = [hungarian](const char *lhs, const char *rhs) {
    return compare_through_strxfrm(hungarian, lhs, rhs);
  };

  // Sanity checks, including case insensitivity.
  EXPECT_EQ(cmp("abc", "abc"), 0);
  EXPECT_NE(cmp("abc", "def"), 0);
  EXPECT_EQ(cmp("abc", "Abc"), 0);

  // c < cs < d, both on their own and when followed by another letter.
  EXPECT_LT(cmp("c", "cs"), 0);
  EXPECT_LT(cmp("cs", "d"), 0);
  EXPECT_LT(cmp("ct", "cst"), 0);
  EXPECT_LT(cmp("cst", "dt"), 0);

  // Wikipedia gives this as an example.
  EXPECT_LT(cmp("cukor", "csak"), 0);
}
/*
  Exhaustively checks, for every combination of four bytes, that the
  branch-free "is any byte outside 0x20..0x7e" test on a 32-bit word agrees
  with the straightforward byte-by-byte comparison.

  This test is disabled by default since it needs ~10 seconds to run,
  even in optimized mode.
*/
TEST(BitfiddlingTest, DISABLED_FastOutOfRange) {
  // Reference predicate for a single byte value.
  const auto outside_range = [](int value) {
    return value < 0x20 || value > 0x7e;
  };

  unsigned char bytes[4];
  for (int a = 0; a < 256; ++a) {
    bytes[0] = a;
    const bool bad_a = outside_range(a);
    for (int b = 0; b < 256; ++b) {
      bytes[1] = b;
      const bool bad_ab = bad_a || outside_range(b);
      for (int c = 0; c < 256; ++c) {
        bytes[2] = c;
        const bool bad_abc = bad_ab || outside_range(c);
        for (int d = 0; d < 256; ++d) {
          bytes[3] = d;
          const bool any_out_of_range_slow = bad_abc || outside_range(d);

          // Load the four bytes as one word, the way the fast path sees them.
          uint32 four_bytes;
          memcpy(&four_bytes, bytes, sizeof(four_bytes));

          // Adding 0x01 sets the top bit of a byte that is 0x7f or above;
          // subtracting 0x20 sets it for a byte that is below 0x20.
          const bool above_upper_bound =
              ((four_bytes + 0x01010101u) & 0x80808080) != 0;
          const bool below_lower_bound =
              ((four_bytes - 0x20202020u) & 0x80808080) != 0;
          const bool any_out_of_range_fast =
              above_upper_bound || below_lower_bound;

          EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast);
        }
      }
    }
  }
}
/*
  A version of FastOutOfRange that tests the analogous trick for 16-bit
  integers instead (much, much faster).

  All 65536 byte pairs are visited by one counter; its high byte plays the
  role of the first byte and its low byte the role of the second.
*/
TEST(BitfiddlingTest, FastOutOfRange16) {
  for (int pair_index = 0; pair_index < 256 * 256; ++pair_index) {
    const int a = pair_index >> 8;
    const int b = pair_index & 0xff;
    const unsigned char bytes[2] = {static_cast<unsigned char>(a),
                                    static_cast<unsigned char>(b)};

    // Reference answer, one byte at a time.
    const bool a_out_of_range = a < 0x20 || a > 0x7e;
    const bool b_out_of_range = b < 0x20 || b > 0x7e;
    const bool any_out_of_range_slow = a_out_of_range || b_out_of_range;

    // Same question answered on the two bytes loaded as a single word.
    uint16 two_bytes;
    memcpy(&two_bytes, bytes, sizeof(two_bytes));
    const bool above_upper_bound =
        ((two_bytes + uint16{0x0101}) & uint16{0x8080}) != 0;
    const bool below_lower_bound =
        ((two_bytes - uint16{0x2020}) & uint16{0x8080}) != 0;
    const bool any_out_of_range_fast = above_upper_bound || below_lower_bound;

    EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast);
  }
}
/*
  Hash a NUL-terminated string with the collation's hash_sort() handler.

  The two accumulators start at 1 and 4; only the first one is returned.
*/
uint64 hash(CHARSET_INFO *cs, const char *str) {
  const uchar *key = pointer_cast<const uchar *>(str);
  const size_t key_length = strlen(str);

  uint64 first_accumulator = 1;
  uint64 second_accumulator = 4;
  cs->coll->hash_sort(cs, key, key_length, &first_accumulator,
                      &second_accumulator);
  return first_accumulator;
}
/*
  Checks that hash_sort() treats spaces the way the collations compare them.

  NOTE: In this entire test, there's an infinitesimal chance
  that something that we expect doesn't match, still matches
  by pure accident.
*/
TEST(PadCollationTest, HashSort) {
  CHARSET_INFO *ai_ci = init_collation("utf8mb4_0900_ai_ci");
  CHARSET_INFO *as_cs = init_collation("utf8mb4_0900_as_cs");

  // One hashing shorthand per collation.
  const auto hash_ai_ci = [ai_ci](const char *s) { return hash(ai_ci, s); };
  const auto hash_as_cs = [as_cs](const char *s) { return hash(as_cs, s); };

  // Sanity: equal input gives equal hashes, different input does not.
  EXPECT_EQ(hash_ai_ci("abc"), hash_ai_ci("abc"));
  EXPECT_NE(hash_ai_ci("abc"), hash_ai_ci("def"));

  // Trailing spaces must change the hash in both collations.
  EXPECT_NE(hash_ai_ci("abc"), hash_ai_ci("abc "));
  EXPECT_NE(hash_as_cs("abc"), hash_as_cs("abc "));
  EXPECT_NE(hash_as_cs("abc"), hash_as_cs("Abc "));

  // The same goes for other kinds of spaces.
  EXPECT_NE(hash_ai_ci("abc"), hash_ai_ci(u8"abc \u00a0"));

  // A non-breaking space hashes like a plain space in ai_ci,
  // but differently in as_cs.
  EXPECT_EQ(hash_ai_ci("abc "), hash_ai_ci(u8"abc\u00a0"));
  EXPECT_NE(hash_as_cs("abc "), hash_as_cs(u8"abc\u00a0"));
  EXPECT_NE(hash_as_cs("abc"), hash_as_cs(u8"abc\u00a0"));

  // The same relation when the space sits in the middle of the string.
  EXPECT_EQ(hash_ai_ci("a c"), hash_ai_ci(u8"a\u00a0c"));
  EXPECT_NE(hash_as_cs("a c"), hash_as_cs(u8"a\u00a0c"));

  // An embedded space must not be stripped away.
  EXPECT_NE(hash_ai_ci("ab c"), hash_ai_ci("abc"));
  EXPECT_NE(hash_as_cs("ab c"), hash_as_cs("abc"));
}
/*
  Checks that hash_sort() copes with a null pointer for an empty key, and
  that hashing a key made only of spaces does not crash.
*/
TEST(HashTest, NullPointer) {
  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");
  uint64 nr1 = 1, nr2 = 4;
  /*
    We should get the same hash from the empty string no matter what
    the pointer is.
  */
  cs->coll->hash_sort(cs, nullptr, 0, &nr1, &nr2);
  EXPECT_EQ(nr1, hash(cs, ""));

  /*
    Hash a run of eight spaces. The buffer is built with an explicit count
    and its real size is passed on, so the length handed to hash_sort() can
    never exceed the number of bytes that actually exist.
  */
  const string spaces(8, ' ');
  cs->coll->hash_sort(cs, pointer_cast<const uchar *>(spaces.data()),
                      spaces.size(), &nr1, &nr2);
  // Don't care what the values are, just that we don't crash.
}
namespace {
/*
  Test that strnxfrmlen() holds for all single characters.

  Every code point up to U+10FFFF that the character set can encode is
  transformed on its own, and the length of its weight string is checked
  against what strnxfrmlen() reports for one character of maximum width.
  The longest weight string seen is printed to stderr at the end.

  @param cs  Collation under test.
*/
void test_strnxfrmlen(CHARSET_INFO *cs) {
  pair<size_t, my_wc_t> longest{0, 0};
  uchar inbuf[16], outbuf[256];  // Ought to be enough for anyone.
  const size_t max_len = cs->coll->strnxfrmlen(cs, cs->mbmaxlen);
  for (my_wc_t ch = 0; ch <= 0x10ffff; ++ch) {
    /*
      Keep the result signed. Stored in a size_t, a negative return value
      would wrap around to a huge length, slip past the "<= 0" check and be
      handed to strnxfrm() as the input length.
    */
    const int in_len = cs->cset->wc_mb(cs, ch, inbuf, inbuf + sizeof(inbuf));
    if (in_len <= 0) {
      continue;  // Not representable in this character set.
    }
    const size_t out_len =
        cs->coll->strnxfrm(cs, outbuf, sizeof(outbuf), 1, inbuf,
                           static_cast<size_t>(in_len), 0);
    EXPECT_LE(out_len, max_len);
    if (out_len > max_len) {
      // Dump the offending weight string to make the failure diagnosable.
      fprintf(stderr, "U+%04lX needed more room than strnxfrmlen() claimed\n",
              ch);
      fprintf(stderr, "Weight string:");
      for (size_t i = 0; i < out_len; ++i) {
        fprintf(stderr, " %02x", outbuf[i]);
      }
      fprintf(stderr, "\n\n");
    }
    // Pairs compare by length first, then by code point.
    longest = max(longest, make_pair(out_len, ch));
  }
  fprintf(stderr,
          "Longest character in '%s': U+%04lX, %d bytes (strnxfrm_len=%d)\n",
          cs->name, longest.second, static_cast<int>(longest.first),
          static_cast<int>(max_len));
}
}  // namespace
/*
  Runs test_strnxfrmlen() on every collation in all_charsets that is
  flagged as available.
*/
TEST(StrxfrmLenTest, StrnxfrmLenIsLongEnoughForAllCharacters) {
  // Load one collation to get everything going.
  init_collation("utf8mb4_0900_ai_ci");

  for (CHARSET_INFO *cs : all_charsets) {
    const bool usable = cs && (cs->state & MY_CS_AVAILABLE);
    if (!usable) {
      continue;  // Empty slot, or a collation that is not available.
    }
    SCOPED_TRACE(cs->name);
    CHARSET_INFO *loaded = init_collation(cs->name);
    test_strnxfrmlen(loaded);
  }
}
// Golden hashes for a test string. These may be stored on disk, so we need to
// make sure that they never change.
//
// One instance holds the reference result for a single collation; the
// HashStability test looks instances up by collation name.
struct GoldenHashResult {
// Expected final accumulator values after hash_sort(): `first` is compared
// against nr1 and `second` against nr2.
pair<uint64, uint64> hash_value;
};
/*
  Guards against accidental changes to hash_sort(): for every available
  collation, a fixed test string is converted to that character set, hashed,
  and both resulting accumulators are compared against recorded values.
*/
TEST(StrmxfrmHashTest, HashStability) {
  // Load one collation to get everything going.
  init_collation("utf8mb4_0900_ai_ci");

  // Reference values. Please keep this list sorted.
  unordered_map<string, GoldenHashResult> expected = {
      {"armscii8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"armscii8_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"ascii_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"ascii_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"big5_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"big5_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"binary", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1250_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1250_croatian_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}},
      {"cp1250_czech_cs", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1250_general_ci", {{0x81c46f6c6b06f8fcLL, 0x000002b0LL}}},
      {"cp1250_polish_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}},
      {"cp1251_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1251_bulgarian_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"cp1251_general_ci", {{0xce71da5364c300a4LL, 0x000002b0LL}}},
      {"cp1251_general_cs", {{0xff44ce45c6d3d142LL, 0x000002b0LL}}},
      {"cp1251_ukrainian_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"cp1256_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1256_general_ci", {{0x44ed84e7ad4a6c1cLL, 0x000002b0LL}}},
      {"cp1257_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp1257_general_ci", {{0x15219f243a38ad58LL, 0x000002b0LL}}},
      {"cp1257_lithuanian_ci", {{0xaa3ef638e5e056e8LL, 0x000002b0LL}}},
      {"cp850_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp850_general_ci", {{0xf32b1cf4087a0b08LL, 0x000002b0LL}}},
      {"cp852_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp852_general_ci", {{0x60dce9bffdeccd52LL, 0x000002b0LL}}},
      {"cp866_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp866_general_ci", {{0xce71da5364c300a4LL, 0x000002b0LL}}},
      {"cp932_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"cp932_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"dec8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"dec8_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"eucjpms_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"eucjpms_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"euckr_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"euckr_korean_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"gb18030_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"gb18030_chinese_ci", {{0xb7b6676124243e73LL, 0x00000abdLL}}},
      {"gb18030_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"gb2312_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"gb2312_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"gbk_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"gbk_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"geostd8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"geostd8_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"greek_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"greek_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"hebrew_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"hebrew_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"hp8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"hp8_english_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"keybcs2_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"keybcs2_general_ci", {{0xd2d54c0201229650LL, 0x000002b0LL}}},
      {"koi8r_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"koi8r_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"koi8u_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"koi8u_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"latin1_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"latin1_danish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"latin1_general_ci", {{0xd7d424d55cb8f402LL, 0x000002b0LL}}},
      {"latin1_general_cs", {{0x96b2a3f94ffe41f9LL, 0x000002b0LL}}},
      {"latin1_german1_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"latin1_german2_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"latin1_spanish_ci", {{0xd7d424d55cb8f402LL, 0x000002b0LL}}},
      {"latin1_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"latin2_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"latin2_croatian_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}},
      {"latin2_czech_cs", {{0xba89a4855c3a88b6LL, 0x000002b0LL}}},
      {"latin2_general_ci", {{0xd9179195a5ddebf8LL, 0x000002b0LL}}},
      {"latin2_hungarian_ci", {{0xba89a4855c3a88b6LL, 0x000002b0LL}}},
      {"latin5_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"latin5_turkish_ci", {{0x68989a162aab9f1cLL, 0x000002b0LL}}},
      {"latin7_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"latin7_estonian_cs", {{0xa281f3df87b89fe1LL, 0x000002b0LL}}},
      {"latin7_general_ci", {{0xc6808727382ffb41LL, 0x000002b0LL}}},
      {"latin7_general_cs", {{0xf70d2b9f0d640804LL, 0x000002b0LL}}},
      {"macce_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"macce_general_ci", {{0xb27ca521eb9b7492LL, 0x000002b0LL}}},
      {"macroman_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"macroman_general_ci", {{0x3254bac0fa3625efLL, 0x000002b0LL}}},
      {"sjis_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"sjis_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"swe7_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"swe7_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"tis620_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"tis620_thai_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"ucs2_bin", {{0x1877f0a25b18b4c6LL, 0x0000055fLL}}},
      {"ucs2_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"ucs2_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"ucs2_general_mysql500_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"ucs2_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}},
      {"ucs2_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}},
      {"ucs2_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}},
      {"ucs2_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"ucs2_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}},
      {"ucs2_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}},
      {"ucs2_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"ucs2_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ucs2_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"ujis_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"ujis_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}},
      {"utf16_bin", {{0x1877f0a25b18b4c6LL, 0x0000055fLL}}},
      {"utf16_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf16_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"utf16_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}},
      {"utf16_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}},
      {"utf16_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}},
      {"utf16_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf16_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}},
      {"utf16_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}},
      {"utf16_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"utf16_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf16le_bin", {{0x3da26ce08ecbfaf9LL, 0x0000055fLL}}},
      {"utf16le_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"utf32_bin", {{0x353330032692faLL, 0x00000abdLL}}},
      {"utf32_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf32_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_general_ci", {{0x353330032692faLL, 0x00000abdLL}}},
      {"utf32_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}},
      {"utf32_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}},
      {"utf32_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}},
      {"utf32_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf32_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}},
      {"utf32_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}},
      {"utf32_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"utf32_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf32_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"utf8_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf8_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"utf8_general_mysql500_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"utf8_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}},
      {"utf8_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}},
      {"utf8_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}},
      {"utf8_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf8_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}},
      {"utf8_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_tolower_ci", {{0x8eab9a2c403c8eb9LL, 0x0000055fLL}}},
      {"utf8_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}},
      {"utf8_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"utf8_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_0900_as_ci", {{0xfc978781d49d0d9bLL, 0x00000001LL}}},
      {"utf8mb4_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_0900_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"utf8mb4_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}},
      {"utf8mb4_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_cs_0900_ai_ci", {{0x36582be4fafa0bbbLL, 0x00000001LL}}},
      {"utf8mb4_cs_0900_as_cs", {{0xac403419684d8c71LL, 0x00000001LL}}},
      {"utf8mb4_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf8mb4_da_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_da_0900_as_cs", {{0xbd24fdcb7b0cf519LL, 0x00000001LL}}},
      {"utf8mb4_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_de_pb_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_de_pb_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_eo_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_eo_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_es_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_es_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_es_trad_0900_ai_ci", {{0x555a77b8a263f17fLL, 0x00000001LL}}},
      {"utf8mb4_es_trad_0900_as_cs", {{0xae993a138c5c030dLL, 0x00000001LL}}},
      {"utf8mb4_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_et_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_et_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}},
      {"utf8mb4_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_hr_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_hr_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_hu_0900_ai_ci", {{0x3162e9e9cebb9148LL, 0x00000001LL}}},
      {"utf8mb4_hu_0900_as_cs", {{0x88842661c548eec1LL, 0x00000001LL}}},
      {"utf8mb4_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_is_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_is_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_ja_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_ja_0900_as_cs_ks", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_la_0900_ai_ci", {{0x2928cd07bca9a85dLL, 0x00000001LL}}},
      {"utf8mb4_la_0900_as_cs", {{0x29a7f3eb43a9819LL, 0x00000001LL}}},
      {"utf8mb4_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}},
      {"utf8mb4_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}},
      {"utf8mb4_lt_0900_ai_ci", {{0xcd5ce469f67f6792LL, 0x00000001LL}}},
      {"utf8mb4_lt_0900_as_cs", {{0xe2e6dc41a4d6b3c1LL, 0x00000001LL}}},
      {"utf8mb4_lv_0900_ai_ci", {{0xcd5ce469f67f6792LL, 0x00000001LL}}},
      {"utf8mb4_lv_0900_as_cs", {{0xfe377cec9551f0f4LL, 0x00000001LL}}},
      {"utf8mb4_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_pl_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_pl_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_ro_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_ro_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}},
      {"utf8mb4_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_ru_0900_ai_ci", {{0xb55bc2bf5ab2bf53LL, 0x00000001LL}}},
      {"utf8mb4_ru_0900_as_cs", {{0x36f5a31292841899LL, 0x00000001LL}}},
      {"utf8mb4_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_sk_0900_ai_ci", {{0x36582be4fafa0bbbLL, 0x00000001LL}}},
      {"utf8mb4_sk_0900_as_cs", {{0xac403419684d8c71LL, 0x00000001LL}}},
      {"utf8mb4_sl_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_sl_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}},
      {"utf8mb4_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}},
      {"utf8mb4_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_sv_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_sv_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_tr_0900_ai_ci", {{0x7ea67be76364740fLL, 0x00000001LL}}},
      {"utf8mb4_tr_0900_as_cs", {{0xfa4556e24336675eLL, 0x00000001LL}}},
      {"utf8mb4_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}},
      {"utf8mb4_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}},
      {"utf8mb4_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_vi_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}},
      {"utf8mb4_vi_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}},
      {"utf8mb4_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}},
      {"utf8mb4_zh_0900_as_cs", {{0x23c370d9ac589d1fLL, 0x00000001LL}}},
  };

  const string test_str =
      "This is a fairly long string. It does not contain any special "
      "characters since they are probably not universally supported across all "
      "character sets, but should at least be enough to make the nr1 value go "
      "up past the 32-bit mark.";

  // Flip this to true to print source code that can be pasted into
  // “expected” above instead of running the checks.
  constexpr bool dump_golden_values = false;

  for (CHARSET_INFO *cs : all_charsets) {
    const bool usable = cs && (cs->state & MY_CS_AVAILABLE);
    if (!usable) {
      continue;  // Empty slot, or a collation that is not available.
    }
    init_collation(cs->name);

    // Convert the test string into the character set of this collation.
    char buf[4096];
    uint errors;
    const size_t len =
        my_convert(buf, sizeof(buf), cs, test_str.data(), test_str.size(),
                   &my_charset_utf8mb4_0900_ai_ci, &errors);
    ASSERT_EQ(0, errors);

    // Hash the converted bytes, starting from fixed accumulator values.
    uint64 nr1 = 4, nr2 = 1;
    cs->coll->hash_sort(cs, pointer_cast<const uchar *>(buf), len, &nr1,
                        &nr2);

    if (dump_golden_values) {
      printf(" {\"%s\", {{0x%016" PRIx64 "LL, 0x%" PRIx64 "LL}}},\n",
             cs->name, nr1, nr2);
      continue;
    }

    ASSERT_EQ(1, expected.count(cs->name))
        << "Character set " << cs->name << " is missing in the database";
    SCOPED_TRACE(cs->name);

    // Presence was asserted above, so this lookup cannot insert an entry.
    const pair<uint64, uint64> &golden = expected[cs->name].hash_value;
    EXPECT_EQ(golden.first, nr1);
    EXPECT_EQ(golden.second, nr2);
  }
}
} // namespace strnxfrm_unittest