xapian-core  2.0.0
utf8itor.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2010,2013,2015,2019,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/unicode.h>
24 
25 using namespace std;
26 
// Report whether `ch` is unusable as a UTF-8 continuation byte.
//
// Continuation bytes have the bit pattern 10xxxxxx (0x80..0xbf), so anything
// whose top two bits are not exactly "10" is rejected.
static inline bool bad_cont(unsigned char ch) {
    return (ch & 0xc0) != 0x80;
}
30 
31 namespace Xapian {
32 
33 namespace Unicode {
34 
// Encode the non-ASCII code point `ch` as UTF-8 into `buf`.
//
// buf should be at least 4 bytes.
//
// Returns the number of bytes written (2, 3 or 4), or 0 if `ch` is beyond
// the Unicode range, in which case nothing is written to `buf`.
//
// The caller is expected to have dealt with ASCII already: there is no
// single-byte branch here, so a value below 0x80 would be written as a
// two-byte sequence.
unsigned
nonascii_to_utf8(unsigned ch, char* buf)
{
    if (ch < 0x800) {
        buf[0] = char(0xc0 | (ch >> 6));
        buf[1] = char(0x80 | (ch & 0x3f));
        return 2;
    }
    if (ch < 0x10000) {
        buf[0] = char(0xe0 | (ch >> 12));
        buf[1] = char(0x80 | ((ch >> 6) & 0x3f));
        buf[2] = char(0x80 | (ch & 0x3f));
        return 3;
    }
    // The upper bound is the end of the Unicode range (0x10ffff), not the
    // largest value which fits in four bytes (0x1fffff): anything in between
    // would be written as a sequence which isn't valid UTF-8.
    if (ch < 0x110000) {
        buf[0] = char(0xf0 | (ch >> 18));
        buf[1] = char(0x80 | ((ch >> 12) & 0x3f));
        buf[2] = char(0x80 | ((ch >> 6) & 0x3f));
        buf[3] = char(0x80 | (ch & 0x3f));
        return 4;
    }
    // Unicode doesn't specify any characters above 0x10ffff.
    // Should we be presented with such a numeric character
    // entity or similar, we just replace it with nothing.
    return 0;
}
62 
63 }
64 
65 bool
66 Utf8Iterator::calculate_sequence_length() const noexcept
67 {
68  // Handle invalid UTF-8, overlong sequences, surrogate pair halves, and
69  // truncated sequences as if the text was actually in ISO-8859-1 since we
70  // need to do something with it, and this seems the most likely reason why
71  // we'd have invalid UTF-8.
72 
73  unsigned char ch = *p;
74 
75  seqlen = 1;
76  // Single byte encoding (0x00-0x7f) or invalid (0x80-0xbf) or overlong
77  // sequence (0xc0-0xc1).
78  //
79  // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
80  // representable in a single byte, and we should not decode these.)
81  if (ch < 0xc2) return (ch < 0x80);
82 
83  if (ch < 0xe0) {
84  if (p + 1 == end || // Not enough bytes
85  bad_cont(p[1])) // Invalid
86  return false;
87  seqlen = 2;
88  return true;
89  }
90  if (ch < 0xf0) {
91  if (end - p < 3 || // Not enough bytes
92  bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
93  (p[0] == 0xe0 && p[1] < 0xa0) || // Overlong encoding
94  (p[0] == 0xed && p[1] >= 0xa0)) // Surrogate pair half
95  return false;
96  seqlen = 3;
97  return true;
98  }
99  if (ch >= 0xf5 || // Code value above Unicode
100  end - p < 4 || // Not enough bytes
101  bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
102  (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
103  (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
104  return false;
105  seqlen = 4;
106  return true;
107 }
108 
109 unsigned Utf8Iterator::operator*() const noexcept {
110  if (p == NULL) return unsigned(-1);
111  if (seqlen == 0) calculate_sequence_length();
112  unsigned char ch = *p;
113  if (seqlen == 1) return ch;
114  if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
115  if (seqlen == 3)
116  return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
117  return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
118  ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
119 }
120 
121 unsigned
122 Utf8Iterator::strict_deref() const noexcept
123 {
124  if (p == NULL) return unsigned(-1);
125  if (seqlen == 0) {
126  if (!calculate_sequence_length())
127  return unsigned(*p) | 0x80000000;
128  }
129  unsigned char ch = *p;
130  if (seqlen == 1) return ch;
131  if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
132  if (seqlen == 3)
133  return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
134  return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
135  ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
136 }
137 
138 }
PositionList * p
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
Definition: utf8itor.cc:37
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Definition: query.h:827
Unicode and UTF-8 related classes and functions.
static bool bad_cont(unsigned char ch)
Definition: utf8itor.cc:27