xapian-core  1.4.19
api_unicode.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "api_unicode.h"
24 
25 #include <xapian.h>
26 
27 #include "apitest.h"
28 #include "testutils.h"
29 
30 #include <cctype>
31 
32 using namespace std;
33 
/// One row of the invalid-UTF-8 table used by the utf8iterator1 test.
struct testcase {
    /// Input bytes; apart from the sanity-check row these contain sequences
    /// which are not well-formed UTF-8.
    const char* a;
    /// Well-formed UTF-8 which is expected to decode to the same sequence of
    /// character values as `a`.
    const char* b;
};

// Each row pairs an input with its expected well-formed equivalent: every
// byte which can't be decoded as part of a valid UTF-8 sequence shows up in
// `b` as the two-byte UTF-8 encoding of the character with that byte value.
//
// The table is terminated by a row of null pointers.
static const testcase testcases[] = {
    { "abcd", "abcd" }, // Sanity check!
    { "a\x80""bcd", "a\xc2\x80""bcd" },
    { "a\xa0", "a\xc2\xa0" },
    { "a\xa0z", "a\xc2\xa0z" },
    { "x\xc1yz", "x\xc3\x81yz" },
    { "\xc2z", "\xc3\x82z" },
    { "\xc2", "\xc3\x82" },
    { "xy\xc3z", "xy\xc3\x83z" },
    { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
    { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
    { "\xe0", "\xc3\xa0" },
    { "\xe0\x80", "\xc3\xa0\xc2\x80" },
    { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
    { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
    { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
    { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
    { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
    { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
    { "\xf0", "\xc3\xb0" },
    { "\xf0\x80", "\xc3\xb0\xc2\x80" },
    { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
    { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
    { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
    { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
    { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
    { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
    // Four byte inputs where the final byte is another lead byte:
    { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
    { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
    { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
    // Four byte inputs where the final byte is a continuation byte:
    { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
    { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
    // This row used to be an exact repeat of the "\xf0\xc5\xc7\xc6" row
    // above, so it tested nothing new.  Here the trailing "\xc7\x86" is
    // itself a well-formed two byte sequence, so it is expected to be
    // passed through as-is rather than re-encoded byte by byte.
    { "\xf0\xc5\xc7\x86", "\xc3\xb0\xc3\x85\xc7\x86" },
    { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
    { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
    { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
    { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },
    // Overlong encodings:
    { "\xc0\x80", "\xc3\x80\xc2\x80" },
    { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
    { "\xc1\x80", "\xc3\x81\xc2\x80" },
    { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
    { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
    { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
    { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
    { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
    // Above Unicode:
    { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
    { 0, 0 }
};
89 
90 // Test handling of invalid UTF-8 is as desired.
91 DEFINE_TESTCASE(utf8iterator1, !backend) {
92  const testcase* p;
93  for (p = testcases; p->a; ++p) {
94  tout.str(string());
95  tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
96  size_t a_len = strlen(p->a);
97  Xapian::Utf8Iterator a(p->a, a_len);
98 
99  size_t b_len = strlen(p->b);
100  Xapian::Utf8Iterator b(p->b, b_len);
101 
102  while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
103  TEST_EQUAL(*a, *b);
104  ++a;
105  ++b;
106  }
107 
108  // Test that we don't reach the end of one before the other.
109  TEST(a == Xapian::Utf8Iterator());
110  TEST(b == Xapian::Utf8Iterator());
111  }
112 }
113 
/// One row of the UTF-8 decoding table used by the utf8iterator2 test.
struct testcase2 {
    /// Bytes to decode; each row holds a single character.
    const char* a;
    /// Character value the iterator is expected to report for `a`.
    unsigned long n;
};

// Expected values are written in hexadecimal so they can be compared with
// the encoded bytes at a glance.  The table ends with a null `a` pointer.
static const testcase2 testcases2[] = {
    // One byte: ASCII, then two bytes which aren't valid UTF-8 on their own.
    { "a", 0x61 },
    { "\x80", 0x80 },
    { "\xa0", 0xa0 },
    // Two byte sequences.
    { "\xc2\x80", 0x80 },
    { "\xc2\xa0", 0xa0 },
    // Three byte sequences.
    { "\xe0\xa0\x80", 0x800 },
    { "\xe1\x80\x80", 0x1000 },
    // Four byte sequences.
    { "\xf0\xa8\xa8\x8f", 0x28a0f },
    { "\xf3\x80\x80\x80", 0xc0000 },
    { "\xf4\x80\x80\x80", 0x100000 },
    { 0, 0 }
};
132 
133 // Test decoding of UTF-8.
134 DEFINE_TESTCASE(utf8iterator2, !backend) {
135  const testcase2* p;
136  for (p = testcases2; p->a; ++p) {
137  Xapian::Utf8Iterator a(p->a);
138 
139  TEST(a != Xapian::Utf8Iterator());
140  TEST_EQUAL(*a, p->n);
141  TEST(++a == Xapian::Utf8Iterator());
142  }
143 }
144 
145 // Test Unicode categorisation.
// NOTE(review): the listing numbers embedded in the lines below have gaps
// (e.g. 147 -> 152, 152 -> 154), so only the explanatory comments of this
// test body are visible here.  Each comment presumably annotates a check on
// the named code point which follows it in the real file - confirm against
// the full source before editing this test.
146 DEFINE_TESTCASE(unicode1, !backend) {
147  using namespace Xapian;
     // Code points which were added, or whose category changed, in
     // Unicode 5.0.0 to 6.0.0.
152  // U+0242 was added in Unicode 5.0.0.
154  // U+0526 was added in Unicode 6.0.0.
156  // U+0527 was added in Unicode 6.0.0.
158  // U+0620 was added in Unicode 6.0.0.
160  // U+065F was added in Unicode 6.0.0.
162  // U+06DE changed category in Unicode 6.0.0.
164  // U+0840 was added in Unicode 6.0.0.
166  // U+093A was added in Unicode 6.0.0.
168  // U+093B was added in Unicode 6.0.0.
170  // U+0CF1 changed category in Unicode 6.0.0.
172  // U+0CF2 changed category in Unicode 6.0.0.
174  // U+11A7 was added in Unicode 5.2.0.
176  // U+9FCB was added in Unicode 5.2.0.
178  // U+FA6C was added in Unicode 5.2.0.
181  // Test characters outside BMP.
185  // U+1109A was added in Unicode 5.2.0.
187  // U+1F773 was added in Unicode 6.0.0.
189  // U+2B740 was added in Unicode 6.0.0.
191  // U+2B81D was added in Unicode 6.0.0.
     // Code points which were added, or whose category changed, in
     // Unicode 6.1.0 to 7.0.0.
193  // U+00A7 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
195  // U+00AA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
197  // U+00B6 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
199  // U+00BA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
201  // U+058F was added in Unicode 6.1.0.
203  // U+0604 was added in Unicode 6.1.0.
205  // U+08A0 was added in Unicode 6.1.0.
207  // U+08E4 was added in Unicode 6.1.0.
209  // U+0AF0 was added in Unicode 6.1.0.
211  // U+9FCC was added in Unicode 6.1.0.
213  // U+A7F9 was added in Unicode 6.1.0.
215  // U+110F0 was added in Unicode 6.1.0.
217  // U+11100 was added in Unicode 6.1.0.
219  // U+1EEF0 was added in Unicode 6.1.0.
221  // U+1F634 was added in Unicode 6.1.0.
223  // U+20BA was added in Unicode 6.2.0.
225  // U+061C was added in Unicode 6.3.0.
227  // U+037F "GREEK CAPITAL LETTER YOT" was added in Unicode 7.0.0.
229 
230  // Added or changed in Unicode 8.0.0:
231  // U+08B3 "ARABIC LETTER AIN WITH THREE DOTS BELOW".
233  // U+0AF9 "GUJARATI LETTER ZHA".
235  // U+0C5A "TELUGU LETTER RRRA".
237  // U+0D5F "MALAYALAM LETTER ARCHAIC II".
239  // U+13F5 "CHEROKEE LETTER MV".
241  // U+13F8 "CHEROKEE SMALL LETTER YE".
243  // U+19B7 "NEW TAI LUE VOWEL SIGN O" changed to be OTHER_LETTER in 8.0.0.
245  // U+20BE "LARI SIGN".
247  // U+218A "TURNED DIGIT TWO".
249  // U+10C9C "OLD HUNGARIAN CAPITAL LETTER OO".
251  // U+12399 "CUNEIFORM SIGN U U".
253  // U+1D800 "SIGNWRITING HAND-FIST INDEX".
255 
256  // Added or changed in Unicode 9.0.0:
257  // U+08B6 "ARABIC LETTER BEH WITH SMALL MEEM ABOVE"
259  // U+08E2 "ARABIC DISPUTED END OF AYAH"
261  // U+0C80 "KANNADA SIGN SPACING CANDRABINDU"
263  // U+0D56 "MALAYALAM LETTER CHILLU LLL"
265  // U+0D58 "MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH"
267  // U+1885 "MONGOLIAN LETTER ALI GALI BALUDA"
269  // U+1886 "MONGOLIAN LETTER ALI GALI THREE BALUDA"
271  // U+104FB "OSAGE SMALL LETTER ZHA"
273  // U+1141F "NEWA LETTER TA"
275  // U+1F989 "OWL"
277 
278  // Test some invalid Unicode values.
281 }
282 
283 DEFINE_TESTCASE(caseconvert1, !backend) {
284  using namespace Xapian;
285  for (unsigned ch = 0; ch < 128; ++ch) {
286  TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower(ch)));
287  TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper(ch)));
288  }
289 
290  // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
291  TEST_EQUAL(Unicode::tolower(0x242), 0x242);
292  TEST_EQUAL(Unicode::toupper(0x242), 0x241);
293  TEST_EQUAL(Unicode::toupper(0x241), 0x241);
294  TEST_EQUAL(Unicode::tolower(0x241), 0x242);
295 
296  // Regression test for bug fixed in 1.2.17.
297  TEST_EQUAL(Unicode::tolower(0x1c5), 0x1c6);
298  TEST_EQUAL(Unicode::tolower(0x1c8), 0x1c9);
299  TEST_EQUAL(Unicode::tolower(0x1cb), 0x1cc);
300  TEST_EQUAL(Unicode::tolower(0x1f2), 0x1f3);
301 
302  // Pound currency symbol:
303  TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
304  TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
305  // Unassigned:
306  TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
307  TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
308  // Test characters outside BMP.
309  TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
310  TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
311  TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
312  TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
313  TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
314  TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
315  // Test some invalid Unicode values.
316  TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
317  TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
318  TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
319  TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
320 }
321 
// Checks Unicode::toupper()/Unicode::tolower() for case pairs which, per the
// comments in the body, were introduced in newer Unicode versions (6.0.0
// through 9.0.0), plus two caseless characters which must map to themselves.
//
// NOTE(review): the listing numbers embedded in the lines below have gaps
// (329 -> 335 and 366 -> 369), so some statements of this test are not
// visible here - confirm against the full source before editing.
323 DEFINE_TESTCASE(caseconvert2, !backend) {
324  using namespace Xapian;
325 
     // Lowercase letters whose uppercase forms are in the U+2C6x range.
326  TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
327  TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
328  TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
329 
335 
336  // U+0526, U+0527 and U+A78D were added in Unicode 6.0.0:
337  TEST_EQUAL(Unicode::toupper(0x265), 0xa78d);
338  TEST_EQUAL(Unicode::tolower(0xa78d), 0x265);
339  TEST_EQUAL(Unicode::tolower(0x526), 0x527);
340  TEST_EQUAL(Unicode::toupper(0x527), 0x526);
341 
342  // U+A7AA was added in Unicode 6.1.0:
343  TEST_EQUAL(Unicode::toupper(0x266), 0xa7aa);
344  TEST_EQUAL(Unicode::tolower(0xa7aa), 0x266);
     // NOTE(review): the next two assertions repeat the U+0526/U+0527 checks
     // from the Unicode 6.0.0 group above verbatim and don't involve U+A7AA;
     // they look like a copy-and-paste leftover - confirm before removing.
345  TEST_EQUAL(Unicode::tolower(0x526), 0x527);
346  TEST_EQUAL(Unicode::toupper(0x527), 0x526);
347 
     // Pairs where the uppercase form is the code point just before the
     // lowercase one, followed by the U+03CF/U+03D7 pair.
348  TEST_EQUAL(Unicode::tolower(0x370), 0x371);
349  TEST_EQUAL(Unicode::toupper(0x371), 0x370);
350  TEST_EQUAL(Unicode::tolower(0x372), 0x373);
351  TEST_EQUAL(Unicode::toupper(0x373), 0x372);
352  TEST_EQUAL(Unicode::tolower(0x376), 0x377);
353  TEST_EQUAL(Unicode::toupper(0x377), 0x376);
354  TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
355  TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
356 
357  // U+20BA was added in Unicode 6.2.0:
     // (listed as a CURRENCY_SYMBOL in unicodepredicates1; it must map to
     // itself in both directions)
358  TEST_EQUAL(Unicode::toupper(0x20ba), 0x20ba);
359  TEST_EQUAL(Unicode::tolower(0x20ba), 0x20ba);
360 
361  // U+061C was added in Unicode 6.3.0:
     // (listed as FORMAT in unicodepredicates1; it must map to itself in
     // both directions)
362  TEST_EQUAL(Unicode::toupper(0x61c), 0x61c);
363  TEST_EQUAL(Unicode::tolower(0x61c), 0x61c);
364 
     // Walk the even code points 0x514, 0x516, ..., 0x522: each one must
     // lowercase to the odd code point after it, and that odd code point
     // must uppercase back to it.
365  unsigned u;
366  for (u = 0x514; u < 0x524; u += 2) {
369  TEST_EQUAL(Unicode::tolower(u), u + 1);
370  TEST_EQUAL(Unicode::toupper(u + 1), u);
371  }
372 
373  // U+A7B1 was added in Unicode 8.0.0 as an uppercase form of U+0287.
374  TEST_EQUAL(Unicode::tolower(0xA7B1), 0x0287);
375  TEST_EQUAL(Unicode::toupper(0xA7B1), 0xA7B1);
376  TEST_EQUAL(Unicode::tolower(0x0287), 0x0287);
377  TEST_EQUAL(Unicode::toupper(0x0287), 0xA7B1);
378 
379  // U+A7B4 (capital) and U+A7B5 (small) added in Unicode 8.0.0
380  TEST_EQUAL(Unicode::tolower(0xA7B4), 0xA7B5);
381  TEST_EQUAL(Unicode::toupper(0xA7B4), 0xA7B4);
382  TEST_EQUAL(Unicode::tolower(0xA7B5), 0xA7B5);
383  TEST_EQUAL(Unicode::toupper(0xA7B5), 0xA7B4);
384 
385  // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
386  TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
387  TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
388  TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
389  TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
390 }
391 
// Builds a string by appending the UTF-8 encoding of a series of character
// values with Xapian::Unicode::append_utf8(), then compares the result with
// the expected byte sequence.
//
// NOTE(review): the listing numbers embedded in the lines below have gaps
// (393 -> 397 and 401 -> 403), so some statements are not visible here.  The
// leading "a", "\xc2\x80", "\xc2\xa0" and the trailing "z" of the expected
// string presumably come from those hidden append calls - confirm against
// the full source.
392 DEFINE_TESTCASE(utf8convert1, !backend) {
393  string s;
397  Xapian::Unicode::append_utf8(s, 0xFFFF);
398  Xapian::Unicode::append_utf8(s, 166415);
399  Xapian::Unicode::append_utf8(s, 0x10345);
400  Xapian::Unicode::append_utf8(s, 0x10FFFD);
     // Judging by the empty literal in the expected string below, this
     // out-of-range value is expected to append nothing.
401  Xapian::Unicode::append_utf8(s, 0xFFFFFFFF);
     // The expected string is split into one literal per appended value, in
     // the order the values were appended.
403  TEST_STRINGS_EQUAL(s, "a"
404  "\xc2\x80"
405  "\xc2\xa0"
406  "\xef\xbf\xbf"
407  "\xf0\xa8\xa8\x8f"
408  "\xf0\x90\x8d\x85"
409  "\xf4\x8f\xbf\xbd"
410  ""
411  "z"
412  );
413 }
414 
// Defines four zero-terminated lists of character values - word characters,
// currency symbols, whitespace, and characters which are none of those - and
// loops over each list in turn.
//
// NOTE(review): the listing numbers embedded in the lines below have gaps
// (474 -> 478, 480 -> 484, 486 -> 490, 492 -> 496), so the bodies of the four
// loops are not visible here.  Presumably they check the character predicates
// (is_wordchar() is mentioned in a comment below) against each list - confirm
// against the full source.
415 DEFINE_TESTCASE(unicodepredicates1, !backend) {
     // Each list ends with a 0 entry, which is what stops the loops below; as
     // a consequence the value 0 itself can't be one of the listed characters.
416  static const unsigned wordchars[] = {
417  // DECIMAL_DIGIT_NUMBER
418  '0', '7', '9',
419  // LOWERCASE_LETTER
420  'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
421  0x242, // (added in Unicode 5.0.0)
422  // LOWERCASE_LETTER (added in Unicode 5.1.0)
     // NOTE(review): caseconvert2 expects tolower(u) == u + 1 for the even
     // values 0x514..0x522, i.e. it treats those as the uppercase forms, so
     // the LOWERCASE/UPPERCASE labels for the 0x514..0x523 entries look
     // swapped.  Presumably harmless since both groups are in this one list,
     // but confirm and fix the labels.
423  0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
424  0x520, 0x522,
425  // UPPERCASE_LETTER
426  'A', 'Z', 0x241,
427  // UPPERCASE_LETTER (added in Unicode 5.1.0)
428  0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
429  0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
430  // OTHER_LETTER
431  0x8bb, // Added in Unicode 9.0.0
432  0xc80, // Added in Unicode 9.0.0
433  0x10345,
434  // MODIFIER_LETTER (added in Unicode 5.1.0)
435  0x2ec, 0x374,
436  // NON_SPACING_MARK (added to is_wordchar() in 1.1.0)
437  0x651,
438  0x487, // Added in Unicode 5.1.0
439  0x8db, // Added in Unicode 9.0.0
440  0
441  };
442  static const unsigned currency[] = {
443  // CURRENCY_SYMBOL
444  '$', 0xa3,
445  // CURRENCY_SYMBOL (added in Unicode 6.2.0)
446  0x20ba,
447  // CURRENCY_SYMBOL (added in Unicode 8.0.0)
448  0x20be,
449  0
450  };
451  static const unsigned whitespace[] = {
452  // CONTROL
453  '\t', '\n', '\f', '\r',
454  // SPACE_SEPARATOR
455  ' ',
456  0
457  };
     // Characters which belong in none of the three lists above; this
     // includes values beyond U+10FFFF.
458  static const unsigned other[] = {
459  // DASH_PUNCTUATION (added in Unicode 5.1.0)
460  0x5be,
461  // OTHER_SYMBOL
462  0xd4f, // Added in Unicode 9.0.0
463  0x1f093, // Added in Unicode 5.1.0
464  // FORMAT
465  0x61c, // Added in Unicode 6.3.0
466  0x8e2, // Added in Unicode 9.0.0
467  // UNASSIGNED
468  0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
469  // PRIVATE_USE
470  0x10fffd,
471  0
472  };
473 
     // One pass per list; each loop stops at the list's terminating 0.
474  for (const unsigned* p = wordchars; *p; ++p) {
478  }
479 
480  for (const unsigned* p = currency; *p; ++p) {
484  }
485 
486  for (const unsigned* p = whitespace; *p; ++p) {
490  }
491 
492  for (const unsigned* p = other; *p; ++p) {
496  }
497 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:332
Letter, modifier (Lm)
Definition: unicode.h:225
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Definition: unicode.h:376
Other, not assigned (Cn)
Definition: unicode.h:221
Mark, spacing combining (Mc)
Definition: unicode.h:229
Letter, other (Lo)
Definition: unicode.h:226
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:371
Number, decimal digit (Nd)
Definition: unicode.h:230
Symbol, currency (Sc)
Definition: unicode.h:248
const char * b
Definition: api_unicode.cc:36
unsigned long n
Definition: api_unicode.cc:116
STL namespace.
Other, format (Cf)
Definition: unicode.h:237
DEFINE_TESTCASE(utf8iterator1, !backend)
Definition: api_unicode.cc:91
test functionality of the Xapian API
Letter, lowercase (Ll)
Definition: unicode.h:223
const char * a
Definition: api_unicode.cc:35
Mark, nonspacing (Mn)
Definition: unicode.h:227
static const testcase2 testcases2[]
Definition: api_unicode.cc:119
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:103
Punctuation, dash (Pd)
Definition: unicode.h:241
Public interfaces for the Xapian library.
Letter, uppercase (Lu)
Definition: unicode.h:222
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
Definition: unicode.h:384
Number, other (No)
Definition: unicode.h:232
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38
Symbol, math (Sm)
Definition: unicode.h:247
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
Definition: unicode.h:343
Xapian-specific test helper functions and macros.
Punctuation, other (Po)
Definition: unicode.h:246
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:361
#define TEST_STRINGS_EQUAL(a, b)
Test for equality of two strings.
Definition: testsuite.h:287
const char * a
Definition: api_unicode.cc:115
Other, private use (Co)
Definition: unicode.h:238
category get_category(int info)
Definition: unicode.h:271
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
Symbol, other (So)
Definition: unicode.h:250
static const testcase testcases[]
Definition: api_unicode.cc:39