41 {
"a\x80""bcd",
"a\xc2\x80""bcd" },
42 {
"a\xa0",
"a\xc2\xa0" },
43 {
"a\xa0z",
"a\xc2\xa0z" },
44 {
"x\xc1yz",
"x\xc3\x81yz" },
45 {
"\xc2z",
"\xc3\x82z" },
46 {
"\xc2",
"\xc3\x82" },
47 {
"xy\xc3z",
"xy\xc3\x83z" },
48 {
"xy\xc3\xc3z",
"xy\xc3\x83\xc3\x83z" },
49 {
"xy\xc3\xc3",
"xy\xc3\x83\xc3\x83" },
50 {
"\xe0",
"\xc3\xa0" },
51 {
"\xe0\x80",
"\xc3\xa0\xc2\x80" },
52 {
"\xe0\xc0",
"\xc3\xa0\xc3\x80" },
53 {
"\xe0\xc0z",
"\xc3\xa0\xc3\x80z" },
54 {
"\xe0\xc0zz",
"\xc3\xa0\xc3\x80zz" },
55 {
"\xe0\xc0\x81",
"\xc3\xa0\xc3\x80\xc2\x81" },
56 {
"\xe0\x82\xc1",
"\xc3\xa0\xc2\x82\xc3\x81" },
57 {
"\xe0\xc5\xc7",
"\xc3\xa0\xc3\x85\xc3\x87" },
58 {
"\xf0",
"\xc3\xb0" },
59 {
"\xf0\x80",
"\xc3\xb0\xc2\x80" },
60 {
"\xf0\xc0",
"\xc3\xb0\xc3\x80" },
61 {
"\xf0\xc0z",
"\xc3\xb0\xc3\x80z" },
62 {
"\xf0\xc0zz",
"\xc3\xb0\xc3\x80zz" },
63 {
"\xf0\xc0\x81",
"\xc3\xb0\xc3\x80\xc2\x81" },
64 {
"\xf0\x82\xc1",
"\xc3\xb0\xc2\x82\xc3\x81" },
65 {
"\xf0\xc5\xc7",
"\xc3\xb0\xc3\x85\xc3\x87" },
66 {
"\xf0\xc0\x81\xc9",
"\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
67 {
"\xf0\x82\xc1\xc8",
"\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
68 {
"\xf0\xc5\xc7\xc6",
"\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
69 {
"\xf0\xc0\x81\x89",
"\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
70 {
"\xf0\x82\xc1\x88",
"\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
71 {
"\xf0\xc5\xc7\xc6",
"\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
72 {
"\xf4P\x80\x80",
"\xc3\xb4P\xc2\x80\xc2\x80" },
73 {
"\xf4\x80P\x80",
"\xc3\xb4\xc2\x80P\xc2\x80" },
74 {
"\xf4\x80\x80P",
"\xc3\xb4\xc2\x80\xc2\x80P" },
75 {
"\xfe\xffxyzzy",
"\xc3\xbe\xc3\xbfxyzzy" },
77 {
"\xc0\x80",
"\xc3\x80\xc2\x80" },
78 {
"\xc0\xbf",
"\xc3\x80\xc2\xbf" },
79 {
"\xc1\x80",
"\xc3\x81\xc2\x80" },
80 {
"\xc1\xbf",
"\xc3\x81\xc2\xbf" },
81 {
"\xe0\x80\x80",
"\xc3\xa0\xc2\x80\xc2\x80" },
82 {
"\xe0\x9f\xbf",
"\xc3\xa0\xc2\x9f\xc2\xbf" },
83 {
"\xf0\x80\x80\x80",
"\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
84 {
"\xf0\x8f\xbf\xbf",
"\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
86 {
"\xf4\x90\x80\x80",
"\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
93 for (p = testcases; p->
a; ++p) {
95 tout <<
'"' << p->
a <<
"\" and \"" << p->
b <<
"\"\n";
96 size_t a_len = strlen(p->
a);
99 size_t b_len = strlen(p->
b);
125 {
"\xe0\xa0\x80", 0x0800 },
126 {
"\xe1\x80\x80", 0x1000 },
127 {
"\xf0\xa8\xa8\x8f", 166415 },
128 {
"\xf3\x80\x80\x80", 0x0c0000 },
129 {
"\xf4\x80\x80\x80", 0x100000 },
136 for (p = testcases2; p->
a; ++p) {
285 for (
unsigned ch = 0; ch < 128; ++ch) {
366 for (u = 0x514; u < 0x524; u += 2) {
416 static const unsigned wordchars[] = {
420 'a',
'z', 0x250, 0x251, 0x271, 0x3d7,
423 0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
428 0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
429 0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
442 static const unsigned currency[] = {
451 static const unsigned whitespace[] = {
453 '\t',
'\n',
'\f',
'\r',
458 static const unsigned other[] = {
468 0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
474 for (
const unsigned* p = wordchars; *p; ++p) {
480 for (
const unsigned* p = currency; *p; ++p) {
486 for (
const unsigned* p = whitespace; *p; ++p) {
492 for (
const unsigned* p = other; *p; ++p) {
The Xapian namespace contains public interfaces for the Xapian library.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
#define TEST(a)
Test a condition, without an additional explanation for failure.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Mark, spacing combining (Mc)
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Number, decimal digit (Nd)
DEFINE_TESTCASE(utf8iterator1, !backend)
Test functionality of the Xapian API.
static const testcase2 testcases2[]
std::ostringstream tout
The debug printing stream.
Public interfaces for the Xapian library.
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
An iterator which returns Unicode character values from a UTF-8 encoded string.
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Xapian-specific test helper functions and macros.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
#define TEST_STRINGS_EQUAL(a, b)
Test for equality of two strings.
category get_category(int info)
#define TEST_EQUAL(a, b)
Test for equality of two things.
static const testcase testcases[]