41 {
"a\x80""bcd",
"a\xc2\x80""bcd" },
42 {
"a\xa0",
"a\xc2\xa0" },
43 {
"a\xa0z",
"a\xc2\xa0z" },
44 {
"x\xc1yz",
"x\xc3\x81yz" },
45 {
"\xc2z",
"\xc3\x82z" },
46 {
"\xc2",
"\xc3\x82" },
47 {
"xy\xc3z",
"xy\xc3\x83z" },
48 {
"xy\xc3\xc3z",
"xy\xc3\x83\xc3\x83z" },
49 {
"xy\xc3\xc3",
"xy\xc3\x83\xc3\x83" },
50 {
"\xe0",
"\xc3\xa0" },
51 {
"\xe0\x80",
"\xc3\xa0\xc2\x80" },
52 {
"\xe0\xc0",
"\xc3\xa0\xc3\x80" },
53 {
"\xe0\xc0z",
"\xc3\xa0\xc3\x80z" },
54 {
"\xe0\xc0zz",
"\xc3\xa0\xc3\x80zz" },
55 {
"\xe0\xc0\x81",
"\xc3\xa0\xc3\x80\xc2\x81" },
56 {
"\xe0\x82\xc1",
"\xc3\xa0\xc2\x82\xc3\x81" },
57 {
"\xe0\xc5\xc7",
"\xc3\xa0\xc3\x85\xc3\x87" },
58 {
"\xf0",
"\xc3\xb0" },
59 {
"\xf0\x80",
"\xc3\xb0\xc2\x80" },
60 {
"\xf0\xc0",
"\xc3\xb0\xc3\x80" },
61 {
"\xf0\xc0z",
"\xc3\xb0\xc3\x80z" },
62 {
"\xf0\xc0zz",
"\xc3\xb0\xc3\x80zz" },
63 {
"\xf0\xc0\x81",
"\xc3\xb0\xc3\x80\xc2\x81" },
64 {
"\xf0\x82\xc1",
"\xc3\xb0\xc2\x82\xc3\x81" },
65 {
"\xf0\xc5\xc7",
"\xc3\xb0\xc3\x85\xc3\x87" },
66 {
"\xf0\xc0\x81\xc9",
"\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
67 {
"\xf0\x82\xc1\xc8",
"\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
68 {
"\xf0\xc5\xc7\xc6",
"\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
69 {
"\xf0\xc0\x81\x89",
"\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
70 {
"\xf0\x82\xc1\x88",
"\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
71 {
"\xf0\xc5\xc7\xc6",
"\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
72 {
"\xf4P\x80\x80",
"\xc3\xb4P\xc2\x80\xc2\x80" },
73 {
"\xf4\x80P\x80",
"\xc3\xb4\xc2\x80P\xc2\x80" },
74 {
"\xf4\x80\x80P",
"\xc3\xb4\xc2\x80\xc2\x80P" },
75 {
"\xfe\xffxyzzy",
"\xc3\xbe\xc3\xbfxyzzy" },
77 {
"\xc0\x80",
"\xc3\x80\xc2\x80" },
78 {
"\xc0\xbf",
"\xc3\x80\xc2\xbf" },
79 {
"\xc1\x80",
"\xc3\x81\xc2\x80" },
80 {
"\xc1\xbf",
"\xc3\x81\xc2\xbf" },
81 {
"\xe0\x80\x80",
"\xc3\xa0\xc2\x80\xc2\x80" },
82 {
"\xe0\x9f\xbf",
"\xc3\xa0\xc2\x9f\xc2\xbf" },
83 {
"\xf0\x80\x80\x80",
"\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
84 {
"\xf0\x8f\xbf\xbf",
"\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
86 {
"\xf4\x90\x80\x80",
"\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
95 tout <<
'"' << p->
a <<
"\" and \"" << p->
b <<
"\"\n";
96 size_t a_len = strlen(p->
a);
99 size_t b_len = strlen(p->
b);
125 {
"\xe0\xa0\x80", 0x0800 },
126 {
"\xe1\x80\x80", 0x1000 },
127 {
"\xf0\xa8\xa8\x8f", 166415 },
128 {
"\xf3\x80\x80\x80", 0x0c0000 },
129 {
"\xf4\x80\x80\x80", 0x100000 },
285 for (
unsigned ch = 0; ch < 128; ++ch) {
366 for (u = 0x514; u < 0x524; u += 2) {
416 static const unsigned wordchars[] = {
420 'a',
'z', 0x250, 0x251, 0x271, 0x3d7,
423 0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
428 0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
429 0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
442 static const unsigned currency[] = {
451 static const unsigned whitespace[] = {
453 '\t',
'\n',
'\f',
'\r',
458 static const unsigned other[] = {
468 0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
474 for (
const unsigned* p = wordchars; *p; ++p) {
480 for (
const unsigned* p = currency; *p; ++p) {
486 for (
const unsigned* p = whitespace; *p; ++p) {
492 for (
const unsigned* p = other; *p; ++p) {
static const testcase2 testcases2[]
DEFINE_TESTCASE(utf8iterator1, !backend)
static const testcase testcases[]
test functionality of the Xapian API
An iterator which returns Unicode character values from a UTF-8 encoded string.
category get_category(int info)
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
@ MATH_SYMBOL
Symbol, math (Sm)
@ FORMAT
Other, format (Cf)
@ PRIVATE_USE
Other, private use (Co)
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
@ MODIFIER_LETTER
Letter, modifier (Lm)
@ OTHER_SYMBOL
Symbol, other (So)
@ CURRENCY_SYMBOL
Symbol, currency (Sc)
@ UNASSIGNED
Other, not assigned (Cn)
@ OTHER_LETTER
Letter, other (Lo)
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
@ COMBINING_SPACING_MARK
Mark, spacing combining (Mc)
@ NON_SPACING_MARK
Mark, nonspacing (Mn)
@ DASH_PUNCTUATION
Punctuation, dash (Pd)
@ OTHER_PUNCTUATION
Punctuation, other (Po)
@ OTHER_NUMBER
Number, other (No)
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
The Xapian namespace contains public interfaces for the Xapian library.
std::ostringstream tout
The debug printing stream.
#define TEST_EQUAL(a, b)
Test for equality of two things.
#define TEST_STRINGS_EQUAL(a, b)
Test for equality of two strings.
#define TEST(a)
Test a condition, without an additional explanation for failure.
Xapian-specific test helper functions and macros.
Public interfaces for the Xapian library.