00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "api_unicode.h"
00024
00025 #include <xapian.h>
00026
00027 #include "apitest.h"
00028 #include "testutils.h"
00029
00030 #include <cctype>
00031
00032 using namespace std;
00033
/// One Utf8Iterator comparison case.
///
/// `a` is a byte string which may be ill-formed UTF-8; `b` is a well-formed
/// UTF-8 string which must decode to the same sequence of code points as `a`.
/// Throughout the table, every byte of `a` which is not part of a well-formed
/// sequence is expected to decode as the code point with the same numeric
/// value as that byte (so "\x80" pairs with "\xc2\x80", i.e. U+0080).
struct testcase {
    const char * a;
    const char * b;
};

/// Cases for utf8iterator1, terminated by an entry with null pointers.
static const testcase testcases[] = {
    // Plain ASCII.
    { "abcd", "abcd" },
    // Stray continuation bytes.
    { "a\x80""bcd", "a\xc2\x80""bcd" },
    { "a\xa0", "a\xc2\xa0" },
    { "a\xa0z", "a\xc2\xa0z" },
    // 2-byte lead bytes without a valid continuation byte.
    { "x\xc1yz", "x\xc3\x81yz" },
    { "\xc2z", "\xc3\x82z" },
    { "\xc2", "\xc3\x82" },
    { "xy\xc3z", "xy\xc3\x83z" },
    { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
    { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
    // Truncated or broken 3-byte sequences.
    { "\xe0", "\xc3\xa0" },
    { "\xe0\x80", "\xc3\xa0\xc2\x80" },
    { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
    { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
    { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
    { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
    { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
    { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
    // Truncated or broken 4-byte sequences.
    { "\xf0", "\xc3\xb0" },
    { "\xf0\x80", "\xc3\xb0\xc2\x80" },
    { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
    { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
    { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
    { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
    { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
    { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
    { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
    { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
    { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
    { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
    { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
    // This slot used to repeat the "\xf0\xc5\xc7\xc6" case above verbatim,
    // which added no coverage.  Ending in a continuation byte instead (like
    // the two cases before it) makes the tail "\xc7\x86" a well-formed 2-byte
    // sequence (U+01C6), so this checks that decoding picks up valid
    // sequences again after skipping bad bytes.
    { "\xf0\xc5\xc7\x86", "\xc3\xb0\xc3\x85\xc7\x86" },
    // 4-byte sequences with an ASCII byte where a continuation byte belongs.
    { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
    { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
    { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
    // Bytes 0xfe and 0xff never appear in well-formed UTF-8.
    { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },

    // Overlong encodings (a longer form than the code point needs).
    { "\xc0\x80", "\xc3\x80\xc2\x80" },
    { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
    { "\xc1\x80", "\xc3\x81\xc2\x80" },
    { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
    { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
    { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
    { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
    { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },

    // Would encode U+110000, which is above the Unicode range.
    { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
    { 0, 0 }
};
00088
00089
00090 DEFINE_TESTCASE(utf8iterator1,!backend) {
00091 const testcase * p;
00092 for (p = testcases; p->a; ++p) {
00093 tout.str(string());
00094 tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
00095 size_t a_len = strlen(p->a);
00096 Xapian::Utf8Iterator a(p->a, a_len);
00097
00098 size_t b_len = strlen(p->b);
00099 Xapian::Utf8Iterator b(p->b, b_len);
00100
00101 while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
00102 TEST_EQUAL(*a, *b);
00103 ++a;
00104 ++b;
00105 }
00106
00107
00108 TEST(a == Xapian::Utf8Iterator());
00109 TEST(b == Xapian::Utf8Iterator());
00110 }
00111 return true;
00112 }
00113
/// One single-character decoding case: the byte string `a` must decode to
/// exactly one code point, `n`.
struct testcase2 {
    const char * a;
    unsigned long n;
};

/// Cases for utf8iterator2, terminated by an entry with a null `a`.
static const testcase2 testcases2[] = {
    { "a",                0x61 },
    // Lone bytes 0x80 and 0xa0 are expected to decode as their own value...
    { "\x80",             0x80 },
    { "\xa0",             0xa0 },
    // ...and so do the proper 2-byte encodings of U+0080 and U+00A0.
    { "\xc2\x80",         0x80 },
    { "\xc2\xa0",         0xa0 },
    // 3-byte sequences.
    { "\xe0\xa0\x80",     0x0800 },
    { "\xe1\x80\x80",     0x1000 },
    // 4-byte sequences.
    { "\xf0\xa8\xa8\x8f", 0x28a0f },
    { "\xf3\x80\x80\x80", 0x0c0000 },
    { "\xf4\x80\x80\x80", 0x100000 },
    { 0, 0 }
};
00132
00133
00134 DEFINE_TESTCASE(utf8iterator2,!backend) {
00135 const testcase2 * p;
00136 for (p = testcases2; p->a; ++p) {
00137 Xapian::Utf8Iterator a(p->a);
00138
00139 TEST(a != Xapian::Utf8Iterator());
00140 TEST_EQUAL(*a, p->n);
00141 TEST(++a == Xapian::Utf8Iterator());
00142 }
00143 return true;
00144 }
00145
00146
00147 DEFINE_TESTCASE(unicode1,!backend) {
00148 using namespace Xapian;
00149 TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER);
00150 TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER);
00151 TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL);
00152 TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL);
00153
00154 TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER);
00155
00156 TEST_EQUAL(Unicode::get_category(0x11A7), Unicode::OTHER_LETTER);
00157
00158 TEST_EQUAL(Unicode::get_category(0x9FCB), Unicode::OTHER_LETTER);
00159
00160 TEST_EQUAL(Unicode::get_category(0xFA6C), Unicode::OTHER_LETTER);
00161 TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED);
00162
00163 TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER);
00164 TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE);
00165 TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED);
00166
00167 TEST_EQUAL(Unicode::get_category(0x1109a), Unicode::OTHER_LETTER);
00168
00169 TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED);
00170 TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED);
00171 return true;
00172 }
00173
00174 DEFINE_TESTCASE(caseconvert1,!backend) {
00175 using namespace Xapian;
00176 for (unsigned ch = 0; ch < 128; ++ch) {
00177 if (isupper((char)ch)) {
00178 TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower((char)ch)));
00179 } else {
00180 TEST_EQUAL(Unicode::tolower(ch), ch);
00181 }
00182 if (islower((char)ch)) {
00183 TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper((char)ch)));
00184 } else {
00185 TEST_EQUAL(Unicode::toupper(ch), ch);
00186 }
00187 }
00188
00189
00190 TEST_EQUAL(Unicode::tolower(0x242), 0x242);
00191 TEST_EQUAL(Unicode::toupper(0x242), 0x241);
00192 TEST_EQUAL(Unicode::toupper(0x241), 0x241);
00193 TEST_EQUAL(Unicode::tolower(0x241), 0x242);
00194
00195
00196 TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
00197 TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
00198
00199 TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
00200 TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
00201
00202 TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
00203 TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
00204 TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
00205 TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
00206 TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
00207 TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
00208
00209 TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
00210 TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
00211 TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
00212 TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
00213
00214 return true;
00215 }
00216
00218 DEFINE_TESTCASE(caseconvert2,!backend) {
00219 using namespace Xapian;
00220
00221 TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
00222 TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
00223 TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
00224
00225 TEST_EQUAL(Unicode::get_category(0x2ec), Unicode::MODIFIER_LETTER);
00226 TEST_EQUAL(Unicode::get_category(0x374), Unicode::MODIFIER_LETTER);
00227 TEST_EQUAL(Unicode::get_category(0x487), Unicode::NON_SPACING_MARK);
00228 TEST_EQUAL(Unicode::get_category(0x5be), Unicode::DASH_PUNCTUATION);
00229 TEST_EQUAL(Unicode::get_category(0x1f093), Unicode::OTHER_SYMBOL);
00230
00231 TEST_EQUAL(Unicode::tolower(0x370), 0x371);
00232 TEST_EQUAL(Unicode::toupper(0x371), 0x370);
00233 TEST_EQUAL(Unicode::tolower(0x372), 0x373);
00234 TEST_EQUAL(Unicode::toupper(0x373), 0x372);
00235 TEST_EQUAL(Unicode::tolower(0x376), 0x377);
00236 TEST_EQUAL(Unicode::toupper(0x377), 0x376);
00237 TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
00238 TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
00239
00240 unsigned u;
00241 for (u = 0x514; u < 0x524; u += 2) {
00242 TEST_EQUAL(Unicode::get_category(u), Unicode::UPPERCASE_LETTER);
00243 TEST_EQUAL(Unicode::get_category(u + 1), Unicode::LOWERCASE_LETTER);
00244 TEST_EQUAL(Unicode::tolower(u), u + 1);
00245 TEST_EQUAL(Unicode::toupper(u + 1), u);
00246 }
00247
00248 return true;
00249 }
00250
00251 DEFINE_TESTCASE(utf8convert1,!backend) {
00252 string s;
00253 Xapian::Unicode::append_utf8(s, 'a');
00254 Xapian::Unicode::append_utf8(s, 128);
00255 Xapian::Unicode::append_utf8(s, 160);
00256 Xapian::Unicode::append_utf8(s, 0xFFFF);
00257 Xapian::Unicode::append_utf8(s, 166415);
00258 Xapian::Unicode::append_utf8(s, 0x10345);
00259 Xapian::Unicode::append_utf8(s, 0x10FFFD);
00260 Xapian::Unicode::append_utf8(s, 0xFFFFFFFF);
00261 Xapian::Unicode::append_utf8(s, 'z');
00262 TEST_STRINGS_EQUAL(s, "a"
00263 "\xc2\x80"
00264 "\xc2\xa0"
00265 "\xef\xbf\xbf"
00266 "\xf0\xa8\xa8\x8f"
00267 "\xf0\x90\x8d\x85"
00268 "\xf4\x8f\xbf\xbd"
00269 ""
00270 "z"
00271 );
00272
00273 return true;
00274 }
00275
00276 DEFINE_TESTCASE(unicodepredicates1,!backend) {
00277 const unsigned wordchars[] = {
00278
00279 '0', '7', '9',
00280
00281 'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
00282 0x242,
00283
00284 0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
00285 0x520, 0x522,
00286
00287 'A', 'Z', 0x241,
00288
00289 0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
00290 0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
00291
00292 0x10345,
00293
00294 0x2ec, 0x374,
00295
00296 0x651,
00297 0x487,
00298 0
00299 };
00300 const unsigned currency[] = {
00301
00302 '$', 0xa3,
00303 0
00304 };
00305 const unsigned whitespace[] = {
00306
00307 '\t', '\n', '\f', '\r',
00308
00309 ' ',
00310 0
00311 };
00312 const unsigned other[] = {
00313
00314 0x5be,
00315
00316 0x1f093,
00317
00318 0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
00319
00320 0x10fffd,
00321 0
00322 };
00323
00324 for (const unsigned * p = wordchars; *p; ++p) {
00325 TEST(Xapian::Unicode::is_wordchar(*p));
00326 TEST(!Xapian::Unicode::is_currency(*p));
00327 TEST(!Xapian::Unicode::is_whitespace(*p));
00328 }
00329
00330 for (const unsigned * p = currency; *p; ++p) {
00331 TEST(!Xapian::Unicode::is_wordchar(*p));
00332 TEST(Xapian::Unicode::is_currency(*p));
00333 TEST(!Xapian::Unicode::is_whitespace(*p));
00334 }
00335
00336 for (const unsigned * p = whitespace; *p; ++p) {
00337 TEST(!Xapian::Unicode::is_wordchar(*p));
00338 TEST(!Xapian::Unicode::is_currency(*p));
00339 TEST(Xapian::Unicode::is_whitespace(*p));
00340 }
00341
00342 for (const unsigned * p = other; *p; ++p) {
00343 TEST(!Xapian::Unicode::is_wordchar(*p));
00344 TEST(!Xapian::Unicode::is_currency(*p));
00345 TEST(!Xapian::Unicode::is_whitespace(*p));
00346 }
00347
00348 return true;
00349 }