xapian-core  1.4.25
utf8itor.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2010,2013,2015,2019 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/unicode.h>
24 
25 #include <cstring>
26 
27 using namespace std;
28 
/** Test whether a byte is NOT a UTF-8 continuation byte.
 *
 *  Continuation bytes are exactly those in the range 0x80-0xbf, i.e. those
 *  whose top two bits are "10".
 *
 *  @param ch  The byte to check.
 *
 *  @return true if @a ch lies outside 0x80-0xbf, false if it is a
 *	    continuation byte.
 */
static inline bool bad_cont(unsigned char ch) {
    const unsigned top_two_bits = ch & 0xc0u;
    return top_two_bits != 0x80u;
}
32 
33 namespace Xapian {
34 
35 namespace Unicode {
36 
/** Convert a single non-ASCII Unicode character to UTF-8.
 *
 *  @param ch   The code point to encode.  This is meant for values of 0x80
 *		and above: anything smaller than 0x800 is written as a two
 *		byte sequence, so an ASCII value would come out overlong.
 *  @param buf  Destination buffer, which should be at least 4 bytes.  No
 *		terminating nul is written.
 *
 *  @return The number of bytes stored in @a buf (2, 3 or 4), or 0 if @a ch
 *	    is above 0x10ffff, in which case @a buf is left untouched.
 */
unsigned
nonascii_to_utf8(unsigned ch, char* buf)
{
    if (ch < 0x800) {
	buf[0] = char(0xc0 | (ch >> 6));
	buf[1] = char(0x80 | (ch & 0x3f));
	return 2;
    }
    if (ch < 0x10000) {
	buf[0] = char(0xe0 | (ch >> 12));
	buf[1] = char(0x80 | ((ch >> 6) & 0x3f));
	buf[2] = char(0x80 | (ch & 0x3f));
	return 3;
    }
    // The upper bound here is the last Unicode code point rather than the
    // largest value which fits in 4 bytes (0x1fffff), so that we never emit
    // a sequence which a validating UTF-8 decoder will reject.
    if (ch <= 0x10ffff) {
	buf[0] = char(0xf0 | (ch >> 18));
	buf[1] = char(0x80 | ((ch >> 12) & 0x3f));
	buf[2] = char(0x80 | ((ch >> 6) & 0x3f));
	buf[3] = char(0x80 | (ch & 0x3f));
	return 4;
    }
    // Unicode doesn't specify any characters above 0x10ffff.
    // Should we be presented with such a numeric character
    // entity or similar, we just replace it with nothing.
    return 0;
}
64 
65 }
66 
67 Utf8Iterator::Utf8Iterator(const char* p_)
68 {
69  assign(p_, strlen(p_));
70 }
71 
72 bool
73 Utf8Iterator::calculate_sequence_length() const XAPIAN_NOEXCEPT
74 {
75  // Handle invalid UTF-8, overlong sequences, and truncated sequences as
76  // if the text was actually in ISO-8859-1 since we need to do something
77  // with it, and this seems the most likely reason why we'd have invalid
78  // UTF-8.
79 
80  unsigned char ch = *p;
81 
82  seqlen = 1;
83  // Single byte encoding (0x00-0x7f) or invalid (0x80-0xbf) or overlong
84  // sequence (0xc0-0xc1).
85  //
86  // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
87  // representable in a single byte, and we should not decode these.)
88  if (ch < 0xc2) return (ch < 0x80);
89 
90  if (ch < 0xe0) {
91  if (p + 1 == end || // Not enough bytes
92  bad_cont(p[1])) // Invalid
93  return false;
94  seqlen = 2;
95  return true;
96  }
97  if (ch < 0xf0) {
98  if (end - p < 3 || // Not enough bytes
99  bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
100  (p[0] == 0xe0 && p[1] < 0xa0)) // Overlong encoding
101  return false;
102  seqlen = 3;
103  return true;
104  }
105  if (ch >= 0xf5 || // Code value above Unicode
106  end - p < 4 || // Not enough bytes
107  bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
108  (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
109  (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
110  return false;
111  seqlen = 4;
112  return true;
113 }
114 
116  if (p == NULL) return unsigned(-1);
117  if (seqlen == 0) calculate_sequence_length();
118  unsigned char ch = *p;
119  if (seqlen == 1) return ch;
120  if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
121  if (seqlen == 3)
122  return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
123  return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
124  ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
125 }
126 
127 unsigned
128 Utf8Iterator::strict_deref() const XAPIAN_NOEXCEPT
129 {
130  if (p == NULL) return unsigned(-1);
131  if (seqlen == 0) {
132  if (!calculate_sequence_length())
133  return unsigned(*p) | 0x80000000;
134  }
135  unsigned char ch = *p;
136  if (seqlen == 1) return ch;
137  if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
138  if (seqlen == 3)
139  return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
140  return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
141  ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
142 }
143 
144 }
Unicode and UTF-8 related classes and functions.
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
STL namespace.
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
Definition: utf8itor.cc:39
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Definition: query.h:670
static bool bad_cont(unsigned char ch)
Definition: utf8itor.cc:29
#define XAPIAN_NOEXCEPT
Definition: attributes.h:39