xapian-core  2.0.0
api_unicode.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2025 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "api_unicode.h"
24 
25 #include <xapian.h>
26 
27 #include "apitest.h"
28 #include "testutils.h"
29 
30 #include <cctype>
31 #include <string_view>
32 
33 using namespace std;
34 
/** A pair of byte strings that Xapian::Utf8Iterator is expected to decode to
 *  the same sequence of code points.
 */
struct testcase {
    /// Input to decode; in most entries this is not well-formed UTF-8.
    const char* a;
    /// Well-formed UTF-8 giving the code points expected from `a`.
    const char* b;
};

/** Inputs for the utf8iterator1 test, terminated by an entry whose `a` is
 *  null.
 *
 *  In every entry, each byte of `a` which isn't part of a valid UTF-8
 *  sequence appears in `b` as the code point with the same numeric value
 *  (e.g. a stray "\xc2" is expected to decode like "\xc3\x82", U+00C2).
 *
 *  Adjacent string literals such as "a\x80""bcd" are deliberate: without the
 *  split the following hex-digit letters would be absorbed into the escape.
 */
static const testcase testcases[] = {
    { "abcd", "abcd" }, // Sanity check!
    { "a\x80""bcd", "a\xc2\x80""bcd" },
    { "a\xa0", "a\xc2\xa0" },
    { "a\xa0z", "a\xc2\xa0z" },
    { "x\xc1yz", "x\xc3\x81yz" },
    { "\xc2z", "\xc3\x82z" },
    { "\xc2", "\xc3\x82" },
    { "xy\xc3z", "xy\xc3\x83z" },
    { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
    { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
    // Truncated or broken three byte sequences:
    { "\xe0", "\xc3\xa0" },
    { "\xe0\x80", "\xc3\xa0\xc2\x80" },
    { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
    { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
    { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
    { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
    { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
    { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
    // Truncated or broken four byte sequences:
    { "\xf0", "\xc3\xb0" },
    { "\xf0\x80", "\xc3\xb0\xc2\x80" },
    { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
    { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
    { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
    { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
    { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
    { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
    { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
    { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
    { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
    { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
    { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
    // Like its two neighbours above, the final byte is a continuation byte.
    // Here that makes the tail "\xc7\x86" a valid two byte sequence
    // (U+01C6), so only the first two bytes are expected to be remapped.
    { "\xf0\xc5\xc7\x86", "\xc3\xb0\xc3\x85\xc7\x86" },
    { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
    { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
    { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
    { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },
    // Overlong encodings:
    { "\xc0\x80", "\xc3\x80\xc2\x80" },
    { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
    { "\xc1\x80", "\xc3\x81\xc2\x80" },
    { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
    { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
    { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
    { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
    { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
    // Above Unicode:
    { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
    // Surrogate pair cases:
    { "\xed\xa0\x80", "\xc3\xad\xc2\xa0\xc2\x80" },
    { "\xed\xbf\xbf", "\xc3\xad\xc2\xbf\xc2\xbf" },
    { "\xed\xa0\x80" "\xed\xbf\xbf",
      "\xc3\xad\xc2\xa0\xc2\x80" "\xc3\xad\xc2\xbf\xc2\xbf" },
    // End-of-table sentinel.
    { nullptr, nullptr }
};
95 
96 // Test handling of invalid UTF-8 is as desired.
97 DEFINE_TESTCASE(utf8iterator1, !backend) {
98  const testcase* p;
99  for (p = testcases; p->a; ++p) {
100  tout.str(string());
101  tout << '"' << p->a << "\" and \"" << p->b << "\"\n";
102  // Exercise construction from pointer and length.
103  Xapian::Utf8Iterator a(p->a, strlen(p->a));
104  // Exercise construction from std::string_view.
105  Xapian::Utf8Iterator b(string_view(p->b));
106 
107  while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
108  TEST_EQUAL(*a, *b);
109  ++a;
110  ++b;
111  }
112 
113  // Test that we don't reach the end of one before the other.
114  TEST(a == Xapian::Utf8Iterator());
115  TEST(b == Xapian::Utf8Iterator());
116  }
117 }
118 
/** An input string together with the single code point it should decode to. */
struct testcase2 {
    /// Bytes to decode.
    const char* a;
    /// Value expected for the first (and only) character of `a`.
    unsigned long n;
};

/** Inputs for the utf8iterator2 test, terminated by an entry whose `a` is
 *  null.
 */
static const testcase2 testcases2[] = {
    { "a", 0x61 },
    // Lone bytes in the range 0x80-0xbf.
    { "\x80", 0x80 },
    { "\xa0", 0xa0 },
    // Two byte sequences.
    { "\xc2\x80", 0x80 },
    { "\xc2\xa0", 0xa0 },
    // Three byte sequences.
    { "\xe0\xa0\x80", 0x0800 },
    { "\xe1\x80\x80", 0x1000 },
    // Four byte sequences.
    { "\xf0\xa8\xa8\x8f", 0x28a0f }, // 166415 in decimal.
    { "\xf3\x80\x80\x80", 0x0c0000 },
    { "\xf4\x80\x80\x80", 0x100000 },
    // End-of-table sentinel.
    { nullptr, 0 }
};
137 
138 // Test decoding of UTF-8.
139 DEFINE_TESTCASE(utf8iterator2, !backend) {
140  const testcase2* p;
141  for (p = testcases2; p->a; ++p) {
142  Xapian::Utf8Iterator a(p->a);
143 
144  TEST(a != Xapian::Utf8Iterator());
145  TEST_EQUAL(*a, p->n);
146  TEST(++a == Xapian::Utf8Iterator());
147  }
148 }
149 
150 // Test Unicode categorisation.
//
// NOTE(review): only the explanatory comments of this test's body are visible
// in this listing (the embedded line numbers skip the lines in between), so
// the statement each comment annotates cannot be described from here.  Each
// comment names a code point and the Unicode version in which it was added
// or changed category.
151 DEFINE_TESTCASE(unicode1, !backend) {
152  using namespace Xapian;
157  // U+0242 was added in Unicode 5.0.0.
159  // U+0526 was added in Unicode 6.0.0.
161  // U+0527 was added in Unicode 6.0.0.
163  // U+0620 was added in Unicode 6.0.0.
165  // U+065F was added in Unicode 6.0.0.
167  // U+06DE changed category in Unicode 6.0.0.
169  // U+0840 was added in Unicode 6.0.0.
171  // U+08BE was added in Unicode 13.0.0.
173  // U+093A was added in Unicode 6.0.0.
175  // U+093B was added in Unicode 6.0.0.
177  // U+20C0 was added in Unicode 14.0.0.
179  // U+2FFE was added in Unicode 15.1.0.
181  // Added in Unicode 16.0.0.
183  // U+0242 was added in Unicode 5.0.0.
185  // U+0CF1 changed category in Unicode 6.0.0.
187  // U+0CF2 changed category in Unicode 6.0.0.
189  // U+0CF3 was added in Unicode 15.0.0.
191  // U+0ECE was added in Unicode 15.0.0.
193  // U+11A7 was added in Unicode 5.2.0.
195  // U+2C2F was added in Unicode 14.0.0.
197  // U+2C5F was added in Unicode 14.0.0.
199  // U+2B97 was added in Unicode 13.0.0.
201  // U+31EF was added in Unicode 15.1.0.
203  // U+9FCB was added in Unicode 5.2.0.
205  // U+9FFC was added in Unicode 13.0.0.
207  // U+FA6C was added in Unicode 5.2.0.
210  // Test characters outside BMP.
214  // U+1109A was added in Unicode 5.2.0.
216  // U+1F773 was added in Unicode 6.0.0.
218  // U+2B740 was added in Unicode 6.0.0.
220  // U+2B81D was added in Unicode 6.0.0.
222  // U+00A7 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
224  // U+00AA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
226  // U+00B6 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
228  // U+00BA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
230  // U+058F was added in Unicode 6.1.0.
232  // U+0604 was added in Unicode 6.1.0.
234  // U+08A0 was added in Unicode 6.1.0.
236  // U+08E4 was added in Unicode 6.1.0.
238  // U+0AF0 was added in Unicode 6.1.0.
240  // U+9FCC was added in Unicode 6.1.0.
242  // U+A7F9 was added in Unicode 6.1.0.
244  // U+110F0 was added in Unicode 6.1.0.
246  // U+11100 was added in Unicode 6.1.0.
248  // U+1EEF0 was added in Unicode 6.1.0.
250  // U+1F634 was added in Unicode 6.1.0.
252  // U+20BA was added in Unicode 6.2.0.
254  // U+061C was added in Unicode 6.3.0.
256  // U+037F "GREEK CAPITAL LETTER YOT" was added in Unicode 7.0.0.
258 
259  // Added or changed in Unicode 8.0.0:
260  // U+08B3 "ARABIC LETTER AIN WITH THREE DOTS BELOW".
262  // U+0AF9 "GUJARATI LETTER ZHA".
264  // U+0C5A "TELUGU LETTER RRRA".
266  // U+0D5F "MALAYALAM LETTER ARCHAIC II".
268  // U+13F5 "CHEROKEE LETTER MV".
270  // U+13F8 "CHEROKEE SMALL LETTER YE".
272  // U+19B7 "NEW TAI LUE VOWEL SIGN O" changed to be OTHER_LETTER in 8.0.0.
274  // U+20BE "LARI SIGN".
276  // U+218A "TURNED DIGIT TWO".
278  // U+10C9C "OLD HUNGARIAN CAPITAL LETTER OO".
280  // U+12399 "CUNEIFORM SIGN U U".
282  // U+1D800 "SIGNWRITING HAND-FIST INDEX".
284 
285  // Added or changed in Unicode 9.0.0:
286  // U+08B6 "ARABIC LETTER BEH WITH SMALL MEEM ABOVE"
288  // U+08E2 "ARABIC DISPUTED END OF AYAH"
290  // U+0C80 "KANNADA SIGN SPACING CANDRABINDU"
292  // U+0D56 "MALAYALAM LETTER CHILLU LLL"
294  // U+0D58 "MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH"
296  // U+1885 "MONGOLIAN LETTER ALI GALI BALUDA"
298  // U+1886 "MONGOLIAN LETTER ALI GALI THREE BALUDA"
300  // U+104FB "OSAGE SMALL LETTER ZHA"
302  // U+1141F "NEWA LETTER TA"
304  // U+1F989 "OWL"
306 
307  // Added in Unicode 10.0.0:
308  // U+20BF "BITCOIN SIGN"
310  // U+23FF "OBSERVER EYE SYMBOL"
312  // U+1032D "OLD ITALIC LETTER YE"
314  // U+11A34 "ZANABAZAR SQUARE SIGN VIRAMA"
316  // U+1F6F8 "FLYING SAUCER"
318  // U+1F9E6 "SOCKS"
320 
321  // Added in Unicode 11.0.0:
322  // U+0560 "ARMENIAN SMALL LETTER TURNED AYB"
324  // U+05EF "HEBREW YOD TRIANGLE"
326  // U+07FF "NKO TAMAN SIGN"
328  // U+08D3 "ARABIC SMALL LOW WAW"
330  // U+1878 "MONGOLIAN LETTER CHA WITH TWO DOTS"
332  // U+1F12F "COPYLEFT SYMBOL"
334 
335  // Changed category in Unicode 11.0.0:
336  // U+10D0 "GEORGIAN LETTER AN"
338 
339  // Added in Unicode 12.0.0:
340  // U+0C77 "TELUGU SIGN SIDDHAM"
342  // U+2BC9 "NEPTUNE FORM TWO"
344  // U+A7C5 "LATIN CAPITAL LETTER S WITH HOOK"
346  // U+1FA90 "RINGED PLANET"
348 
349  // Added in Unicode 12.1.0:
351 
352  // Added in Unicode 13.0.0:
358 
359  // Added in Unicode 15.1.0.
362 
363  // Added in Unicode 16.0.0.
365 
366  // Added or changed category in Unicode 17.0.0:
387 
388  // Test some invalid Unicode values.
391 }
392 
393 DEFINE_TESTCASE(caseconvert1, !backend) {
394  using namespace Xapian;
395  for (unsigned ch = 0; ch < 128; ++ch) {
396  TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower(ch)));
397  TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper(ch)));
398  }
399 
400  // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
401  TEST_EQUAL(Unicode::tolower(0x242), 0x242);
402  TEST_EQUAL(Unicode::toupper(0x242), 0x241);
403  TEST_EQUAL(Unicode::toupper(0x241), 0x241);
404  TEST_EQUAL(Unicode::tolower(0x241), 0x242);
405 
406  // Regression test for bug fixed in 1.2.17.
407  TEST_EQUAL(Unicode::tolower(0x1c5), 0x1c6);
408  TEST_EQUAL(Unicode::tolower(0x1c8), 0x1c9);
409  TEST_EQUAL(Unicode::tolower(0x1cb), 0x1cc);
410  TEST_EQUAL(Unicode::tolower(0x1f2), 0x1f3);
411 
412  // Pound currency symbol:
413  TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
414  TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
415  // Unassigned:
416  TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
417  TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
418  // Test characters outside BMP.
419  TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
420  TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
421  TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
422  TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
423  TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
424  TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
425  // Test some invalid Unicode values.
426  TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
427  TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
428  TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
429  TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
430 }
431 
// Test Unicode::tolower() and Unicode::toupper() on case pairs which the
// comments below attribute to particular Unicode versions (6.0.0 to 17.0.0).
//
// NOTE(review): the embedded line numbers skip 440-444 and 477-478, so a few
// statements of this test are not visible in this listing.
433 DEFINE_TESTCASE(caseconvert2, !backend) {
434  using namespace Xapian;
435 
436  TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
437  TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
438  TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
439 
445 
446  // U+0526, U+0527 and U+A78D were added in Unicode 6.0.0:
447  TEST_EQUAL(Unicode::toupper(0x265), 0xa78d);
448  TEST_EQUAL(Unicode::tolower(0xa78d), 0x265);
449  TEST_EQUAL(Unicode::tolower(0x526), 0x527);
450  TEST_EQUAL(Unicode::toupper(0x527), 0x526);
451 
452  // U+A7AA was added in Unicode 6.1.0:
453  TEST_EQUAL(Unicode::toupper(0x266), 0xa7aa);
454  TEST_EQUAL(Unicode::tolower(0xa7aa), 0x266);
  // NOTE(review): the next two checks repeat the U+0526/U+0527 checks from
  // the Unicode 6.0.0 group above; they look like a copy-and-paste leftover.
455  TEST_EQUAL(Unicode::tolower(0x526), 0x527);
456  TEST_EQUAL(Unicode::toupper(0x527), 0x526);
457 
  // Case pairs in the Greek block: each first value lowercases to the second
  // and the second uppercases back to the first.
458  TEST_EQUAL(Unicode::tolower(0x370), 0x371);
459  TEST_EQUAL(Unicode::toupper(0x371), 0x370);
460  TEST_EQUAL(Unicode::tolower(0x372), 0x373);
461  TEST_EQUAL(Unicode::toupper(0x373), 0x372);
462  TEST_EQUAL(Unicode::tolower(0x376), 0x377);
463  TEST_EQUAL(Unicode::toupper(0x377), 0x376);
464  TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
465  TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
466 
467  // U+20BA was added in Unicode 6.2.0:
468  TEST_EQUAL(Unicode::toupper(0x20ba), 0x20ba);
469  TEST_EQUAL(Unicode::tolower(0x20ba), 0x20ba);
470 
471  // U+061C was added in Unicode 6.3.0:
472  TEST_EQUAL(Unicode::toupper(0x61c), 0x61c);
473  TEST_EQUAL(Unicode::tolower(0x61c), 0x61c);
474 
  // Walk the pairs 0x514/0x515 up to 0x522/0x523: each even value must
  // lowercase to the odd value after it, which must uppercase back.
475  unsigned u;
476  for (u = 0x514; u < 0x524; u += 2) {
479  TEST_EQUAL(Unicode::tolower(u), u + 1);
480  TEST_EQUAL(Unicode::toupper(u + 1), u);
481  }
482 
483  // U+A7B1 was added in Unicode 8.0.0 as an uppercase form of U+0287.
484  TEST_EQUAL(Unicode::tolower(0xA7B1), 0x0287);
485  TEST_EQUAL(Unicode::toupper(0xA7B1), 0xA7B1);
486  TEST_EQUAL(Unicode::tolower(0x0287), 0x0287);
487  TEST_EQUAL(Unicode::toupper(0x0287), 0xA7B1);
488 
489  // U+A7B4 (capital) and U+A7B5 (small) added in Unicode 8.0.0
490  TEST_EQUAL(Unicode::tolower(0xA7B4), 0xA7B5);
491  TEST_EQUAL(Unicode::toupper(0xA7B4), 0xA7B4);
492  TEST_EQUAL(Unicode::tolower(0xA7B5), 0xA7B5);
493  TEST_EQUAL(Unicode::toupper(0xA7B5), 0xA7B4);
494 
495  // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
496  TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
497  TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
498  TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
499  TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
500 
  // NOTE(review): this group is an exact repeat of the U+A7AE group directly
  // above (same comment, same four checks); it adds no coverage.
501  // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
502  TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
503  TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
504  TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
505  TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
506 
507  // U+0560 was added in Unicode 11.0.0 (lowercase, no other forms).
508  TEST_EQUAL(Unicode::tolower(0x0560), 0x0560);
509  TEST_EQUAL(Unicode::toupper(0x0560), 0x0560);
510 
511  // U+10D0 changed to be lowercase in Unicode 11.0.0 and U+1C90 was added.
512  TEST_EQUAL(Unicode::tolower(0x10D0), 0x10D0);
513  TEST_EQUAL(Unicode::toupper(0x10D0), 0x1C90);
514  TEST_EQUAL(Unicode::tolower(0x1C90), 0x10D0);
515  TEST_EQUAL(Unicode::toupper(0x1C90), 0x1C90);
516 
517  // U+A7C5 was added in Unicode 12.0.0 as an uppercase form of U+0282.
518  TEST_EQUAL(Unicode::tolower(0xA7C5), 0x0282);
519  TEST_EQUAL(Unicode::toupper(0xA7C5), 0xA7C5);
520  TEST_EQUAL(Unicode::tolower(0x0282), 0x0282);
521  TEST_EQUAL(Unicode::toupper(0x0282), 0xA7C5);
522 
523  // Added in Unicode 13.0.0.
524  TEST_EQUAL(Unicode::tolower(0xA7C8), 0xA7C8);
525  TEST_EQUAL(Unicode::toupper(0xA7C8), 0xA7C7);
526  TEST_EQUAL(Unicode::toupper(0xA7C7), 0xA7C7);
527  TEST_EQUAL(Unicode::tolower(0xA7C7), 0xA7C8);
528 
529  // Added in Unicode 14.0.0.
530  TEST_EQUAL(Unicode::tolower(0x2C5F), 0x2C5F);
531  TEST_EQUAL(Unicode::toupper(0x2C5F), 0x2C2F);
532  TEST_EQUAL(Unicode::toupper(0x2C2F), 0x2C2F);
533  TEST_EQUAL(Unicode::tolower(0x2C2F), 0x2C5F);
534 
535  // Uppercase versions and mappings added in Unicode 17.0.0:
536  TEST_EQUAL(Unicode::tolower(0xA7D2), 0xA7D3);
537  TEST_EQUAL(Unicode::toupper(0xA7D3), 0xA7D2);
538  TEST_EQUAL(Unicode::tolower(0xA7D4), 0xA7D5);
539  TEST_EQUAL(Unicode::toupper(0xA7D5), 0xA7D4);
540 }
541 
// Test Xapian::Unicode::append_utf8(): code points are appended to s and the
// result is compared with the expected byte string.
//
// NOTE(review): the embedded line numbers skip 544-546 and 552, so the
// statements which produce the leading "a", "\xc2\x80", "\xc2\xa0" and the
// trailing "z" of the expected string are not visible in this listing.
542 DEFINE_TESTCASE(utf8convert1, !backend) {
543  string s;
547  Xapian::Unicode::append_utf8(s, 0xFFFF);
548  Xapian::Unicode::append_utf8(s, 166415);
549  Xapian::Unicode::append_utf8(s, 0x10345);
550  Xapian::Unicode::append_utf8(s, 0x10FFFD);
551  Xapian::Unicode::append_utf8(s, 0xFFFFFFFF);
  // The expected value is written one literal per appended value.  The empty
  // literal lines up with the 0xFFFFFFFF append above, which presumably
  // contributes no bytes - confirm against append_utf8().
553  TEST_STRINGS_EQUAL(s, "a"
554  "\xc2\x80"
555  "\xc2\xa0"
556  "\xef\xbf\xbf"
557  "\xf0\xa8\xa8\x8f"
558  "\xf0\x90\x8d\x85"
559  "\xf4\x8f\xbf\xbd"
560  ""
561  "z"
562  );
563 }
564 
// Test predicates on Unicode characters, using four tables of code points
// (wordchars, currency, whitespace, other).  Within each table the comments
// group the values by Unicode general category.  Every table ends with a 0
// entry, which is what stops the loops at the end of the function.
//
// NOTE(review): the loop bodies (embedded line numbers 683-685, 689-691,
// 695-697 and 701-703) are not visible in this listing; presumably they
// check is_wordchar(), is_currency() and is_whitespace() for each value -
// confirm against the full source.
565 DEFINE_TESTCASE(unicodepredicates1, !backend) {
566  static const unsigned wordchars[] = {
567  // DECIMAL_DIGIT_NUMBER
568  '0', '7', '9',
569  0x10D30, // (added in Unicode 11.0.0)
570  0x11D51, // (added in Unicode 10.0.0)
571  0x11DA9, // (added in Unicode 11.0.0)
572  0x11F50, // (added in Unicode 15.0.0)
573  0x16AC9, // (added in Unicode 14.0.0)
574  // OTHER_NUMBER
575  0x1ECB3, // (added in Unicode 11.0.0)
576  0x1D2D3, // (added in Unicode 15.0.0)
577  // LOWERCASE_LETTER
578  'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
579  0x242, // (added in Unicode 5.0.0)
580  // LOWERCASE_LETTER (added in Unicode 5.1.0)
  // NOTE(review): caseconvert2 checks tolower(u) == u + 1 for the even
  // values 0x514..0x522, so the 0x514..0x522 entries below are the uppercase
  // forms despite this group's label.  Harmless here, as every entry of
  // this table is only treated as a word character.
581  0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
582  0x520, 0x522,
583  0x1C8A, // (added in Unicode 16.0.0)
584  0xA7C1, // (added in Unicode 14.0.0)
585  0x16E78, // (added in Unicode 11.0.0)
586  0x1DF2A, // (added in Unicode 15.0.0)
587  // UPPERCASE_LETTER
588  'A', 'Z', 0x241,
589  // UPPERCASE_LETTER (added in Unicode 5.1.0)
  // NOTE(review): likewise, caseconvert2 checks toupper(u + 1) == u for the
  // same range, so the odd values 0x515..0x523 below are the lowercase
  // forms.
590  0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
591  0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
592  0xA7C0, // (added in Unicode 14.0.0)
593  0xA7CB, // (added in Unicode 16.0.0)
594  0x16E45, // (added in Unicode 11.0.0)
595  // OTHER_LETTER
596  0x870, // (added in Unicode 14.0.0)
597  0x8bb, // Added in Unicode 9.0.0
598  0x8c7, // Added in Unicode 13.0.0
599  0xc80, // Added in Unicode 9.0.0
600  0xe86, // Added in Unicode 12.0.0
601  0x312e, // Added in Unicode 10.0.0
602  0x10345,
603  0x18CFF, // Added in Unicode 16.0.0
604  0x1e4d0, // Added in Unicode 15.0.0
605  0x2ee2e, // Added in Unicode 15.1.0
606  // MODIFIER_LETTER
607  0x2ec, // Added in Unicode 5.1.0
608  0x374, // Added in Unicode 5.1.0
609  0x8c9, // Added in Unicode 14.0.0
610  0x10D6F, // Added in Unicode 16.0.0
611  0x16fe1, // Added in Unicode 10.0.0
612  0x16fe3, // Added in Unicode 12.0.0
613  0x1e4eb, // Added in Unicode 15.0.0
614  // NON_SPACING_MARK (added to is_wordchar() in 1.1.0)
615  0x651,
616  0x487, // Added in Unicode 5.1.0
617  0x897, // Added in Unicode 16.0.0
618  0x899, // Added in Unicode 14.0.0
619  0x8d3, // Added in Unicode 11.0.0
620  0x8db, // Added in Unicode 9.0.0
621  0xeba, // Added in Unicode 12.0.0
622  0x11d47, // Added in Unicode 10.0.0
623  0x16fe4, // Added in Unicode 13.0.0
624  0x1e4ee, // Added in Unicode 15.0.0
625  0
626  };
627  static const unsigned currency[] = {
628  // CURRENCY_SYMBOL
629  '$', 0xa3,
630  // CURRENCY_SYMBOL (added in Unicode 6.2.0)
631  0x20ba,
632  // CURRENCY_SYMBOL (added in Unicode 8.0.0)
633  0x20be,
634  // CURRENCY_SYMBOL (added in Unicode 10.0.0)
635  0x20bf,
636  // CURRENCY_SYMBOL (added in Unicode 11.0.0)
637  0x7fe,
638  // CURRENCY_SYMBOL (added in Unicode 12.0.0)
639  0x1e2ff,
640  // CURRENCY_SYMBOL (added in Unicode 14.0.0)
641  0x20c0,
642  // CURRENCY_SYMBOL (added in Unicode 17.0.0)
643  0x20c1,
644  0
645  };
646  static const unsigned whitespace[] = {
647  // CONTROL
648  '\t', '\n', '\f', '\r',
649  // SPACE_SEPARATOR
650  ' ',
651  0
652  };
653  static const unsigned other[] = {
654  // DASH_PUNCTUATION
655  0x5be, // Added in Unicode 5.1.0
656  0x2e5d, // Added in Unicode 14.0.0
657  0x10D6E, // Added in Unicode 16.0.0
658  // OTHER_SYMBOL
659  0xd4f, // Added in Unicode 9.0.0
660  0x2b97, // Added in Unicode 13.0.0
661  0x2ffc, // Added in Unicode 15.1.0
662  0x31ef, // Added in Unicode 15.1.0
663  0x32ff, // Added in Unicode 12.1.0; UNASSIGNED before
664  0xfdcF, // Added in Unicode 14.0.0
665  0x1f093, // Added in Unicode 5.1.0
666  0x1f263, // Added in Unicode 10.0.0
667  0x1fa62, // Added in Unicode 11.0.0
668  0x1f6dc, // Added in Unicode 15.0.0
669  0x1FADC, // Added in Unicode 16.0.0
670  // FORMAT
671  0x61c, // Added in Unicode 6.3.0
672  0x891, // Added in Unicode 14.0.0
673  0x8e2, // Added in Unicode 9.0.0
674  0x1343e, // Added in Unicode 15.0.0
675  // UNASSIGNED
676  0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
677  // PRIVATE_USE
678  0x10fffd,
679  0
680  };
681 
  // Each loop below visits one table and stops at its 0 sentinel.
682  for (const unsigned* p = wordchars; *p; ++p) {
686  }
687 
688  for (const unsigned* p = currency; *p; ++p) {
692  }
693 
694  for (const unsigned* p = whitespace; *p; ++p) {
698  }
699 
700  for (const unsigned* p = other; *p; ++p) {
704  }
705 }
static const testcase2 testcases2[]
Definition: api_unicode.cc:124
DEFINE_TESTCASE(utf8iterator1, !backend)
Definition: api_unicode.cc:97
static const testcase testcases[]
Definition: api_unicode.cc:40
test functionality of the Xapian API
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
PositionList * p
category get_category(int info)
Definition: unicode.h:283
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:344
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Definition: unicode.h:388
@ MATH_SYMBOL
Symbol, math (Sm)
Definition: unicode.h:255
@ FORMAT
Other, format (Cf)
Definition: unicode.h:245
@ PRIVATE_USE
Other, private use (Co)
Definition: unicode.h:246
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
Definition: unicode.h:231
@ MODIFIER_LETTER
Letter, modifier (Lm)
Definition: unicode.h:233
@ OTHER_SYMBOL
Symbol, other (So)
Definition: unicode.h:258
@ CURRENCY_SYMBOL
Symbol, currency (Sc)
Definition: unicode.h:256
@ UNASSIGNED
Other, not assigned (Cn)
Definition: unicode.h:229
@ OTHER_LETTER
Letter, other (Lo)
Definition: unicode.h:234
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
Definition: unicode.h:238
@ COMBINING_SPACING_MARK
Mark, spacing combining (Mc)
Definition: unicode.h:237
@ NON_SPACING_MARK
Mark, nonspacing (Mn)
Definition: unicode.h:235
@ DASH_PUNCTUATION
Punctuation, dash (Pd)
Definition: unicode.h:249
@ OTHER_PUNCTUATION
Punctuation, other (Po)
Definition: unicode.h:254
@ OTHER_NUMBER
Number, other (No)
Definition: unicode.h:240
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
Definition: unicode.h:230
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
Definition: unicode.h:355
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:383
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
Definition: unicode.h:396
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:373
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned long n
Definition: api_unicode.cc:121
const char * a
Definition: api_unicode.cc:120
const char * a
Definition: api_unicode.cc:36
const char * b
Definition: api_unicode.cc:37
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:276
#define TEST_STRINGS_EQUAL(a, b)
Test for equality of two strings.
Definition: testsuite.h:285
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:273
Xapian-specific test helper functions and macros.
Public interfaces for the Xapian library.