xapian-core  1.4.27
unicode.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2019 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
23 
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
26 #endif
27 
28 #include <xapian/attributes.h>
29 #include <xapian/visibility.h>
30 
31 #include <string>
32 
33 namespace Xapian {
34 
39  const unsigned char* p;
40  const unsigned char* end;
41  mutable unsigned seqlen;
42 
43  bool XAPIAN_NOTHROW(calculate_sequence_length() const);
44 
45  unsigned get_char() const;
46 
47  Utf8Iterator(const unsigned char* p_,
48  const unsigned char* end_,
49  unsigned seqlen_)
50  : p(p_), end(end_), seqlen(seqlen_) { }
51 
52  public:
54  const char* raw() const {
55  return reinterpret_cast<const char*>(p ? p : end);
56  }
57 
59  size_t left() const { return p ? end - p : 0; }
60 
72  void assign(const char* p_, size_t len) {
73  if (len) {
74  p = reinterpret_cast<const unsigned char*>(p_);
75  end = p + len;
76  seqlen = 0;
77  } else {
78  p = NULL;
79  }
80  }
81 
92  void assign(const std::string& s) { assign(s.data(), s.size()); }
93 
102  explicit Utf8Iterator(const char* p_);
103 
114  Utf8Iterator(const char* p_, size_t len) { assign(p_, len); }
115 
125  Utf8Iterator(const std::string& s) { assign(s.data(), s.size()); }
126 
132  XAPIAN_NOTHROW(Utf8Iterator())
133  : p(NULL), end(0), seqlen(0) { }
134 
143  unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
144 
155  unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
156 
161  Utf8Iterator operator++(int) {
162  // If we've not calculated seqlen yet, do so.
163  if (seqlen == 0) calculate_sequence_length();
164  const unsigned char* old_p = p;
165  unsigned old_seqlen = seqlen;
166  p += seqlen;
167  if (p == end) p = NULL;
168  seqlen = 0;
169  return Utf8Iterator(old_p, end, old_seqlen);
170  }
171 
176  Utf8Iterator& operator++() {
177  if (seqlen == 0) calculate_sequence_length();
178  p += seqlen;
179  if (p == end) p = NULL;
180  seqlen = 0;
181  return *this;
182  }
183 
189  bool XAPIAN_NOTHROW(operator==(const Utf8Iterator& other) const) {
190  return p == other.p;
191  }
192 
198  bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator& other) const) {
199  return p != other.p;
200  }
201 
203 
204  typedef std::input_iterator_tag iterator_category;
205  typedef unsigned value_type;
206  typedef size_t difference_type;
207  typedef const unsigned* pointer;
208  typedef const unsigned& reference;
210 };
211 
213 namespace Unicode {
214 
220 typedef enum {
251 } category;
252 
253 namespace Internal {
261  int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
262 
/** Extract the case-type field from a character info value.
 *
 *  @param info Packed character info (as returned by
 *              get_character_info()).
 *  @return Bits 5-7 of @a info, as a value in the range 0 to 7.
 */
inline int get_case_type(int info) {
    // Mask first so the value being shifted is never negative.
    const int case_bits = info & 0xe0;
    return case_bits >> 5;
}
267 
271  inline category get_category(int info) {
272  return static_cast<category>(info & 0x1f);
273  }
274 
/** Extract the signed delta field from a character info value.
 *
 *  The delta is everything above the low 8 bits of @a info, i.e. @a info
 *  shifted right by 8 with sign extension.
 *
 *  @param info Packed character info (as returned by
 *              get_character_info()).
 *  @return @a info arithmetically shifted right by 8 bits.
 */
inline int get_delta(int info) {
    /* Whether right shifting a negative signed int sign-extends is
     * implementation defined (in practice most compilers do sign-extend).
     *
     * Not every compiler recognises the portable sign-extension idiom used
     * below (e.g. GCC < 7 doesn't), so test the implementation's behaviour
     * with a constant condition, which should be optimised away.
     *
     * This is deliberately a conditional expression rather than an `if`,
     * as the `if` form triggers various compiler warnings.
     */
    return ((-1 >> 1) != -1 ?
            // Right shift shifts in zeros: for negative values apply
            // bitwise-not before and after the shift.
            ((info < 0) ? (~(~info >> 8)) : (info >> 8)) :
            // Right shift sign-extends, so a plain shift is enough.
            info >> 8);
}
298 }
299 
311 unsigned nonascii_to_utf8(unsigned ch, char* buf);
312 
321 inline unsigned to_utf8(unsigned ch, char* buf) {
322  if (ch < 128) {
323  *buf = static_cast<unsigned char>(ch);
324  return 1;
325  }
326  return Xapian::Unicode::nonascii_to_utf8(ch, buf);
327 }
328 
332 inline void append_utf8(std::string& s, unsigned ch) {
333  char buf[4];
334  s.append(buf, to_utf8(ch, buf));
335 }
336 
338 inline category get_category(unsigned ch) {
340 }
341 
343 inline bool is_wordchar(unsigned ch) {
344  const unsigned int WORDCHAR_MASK =
357  return ((WORDCHAR_MASK >> get_category(ch)) & 1);
358 }
359 
361 inline bool is_whitespace(unsigned ch) {
362  const unsigned int WHITESPACE_MASK =
363  (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
367  return ((WHITESPACE_MASK >> get_category(ch)) & 1);
368 }
369 
371 inline bool is_currency(unsigned ch) {
373 }
374 
376 inline unsigned tolower(unsigned ch) {
378  if (!(Internal::get_case_type(info) & 2))
379  return ch;
380  return ch + Internal::get_delta(info);
381 }
382 
384 inline unsigned toupper(unsigned ch) {
386  if (!(Internal::get_case_type(info) & 4))
387  return ch;
388  return ch - Internal::get_delta(info);
389 }
390 
392 inline std::string
393 tolower(const std::string& term)
394 {
395  std::string result;
396  result.reserve(term.size());
397  for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
398  append_utf8(result, tolower(*i));
399  }
400  return result;
401 }
402 
404 inline std::string
405 toupper(const std::string& term)
406 {
407  std::string result;
408  result.reserve(term.size());
409  for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
410  append_utf8(result, toupper(*i));
411  }
412  return result;
413 }
414 
415 }
416 
417 }
418 
419 #endif // XAPIAN_INCLUDED_UNICODE_H
Mark, enclosing (Me)
Definition: unicode.h:228
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:332
Other, control (Cc)
Definition: unicode.h:236
Letter, modifier (Lm)
Definition: unicode.h:225
Number, letter (Nl)
Definition: unicode.h:231
#define XAPIAN_PURE_FUNCTION
Like XAPIAN_CONST_FUNCTION, but such a function can also examine global memory, perhaps via pointer or reference parameters.
Definition: attributes.h:67
const char * raw() const
Return the raw const char* pointer for the current position.
Definition: unicode.h:54
Utf8Iterator & operator++()
Move forward to the next Unicode character.
Definition: unicode.h:176
Other, not assigned (Cn)
Definition: unicode.h:221
Mark, spacing combining (Mc)
Definition: unicode.h:229
Letter, other (Lo)
Definition: unicode.h:226
size_t left() const
Return the number of bytes left in the iterator's buffer.
Definition: unicode.h:59
Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
Definition: unicode.h:47
category
Each Unicode character is in exactly one of these categories.
Definition: unicode.h:220
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:371
Number, decimal digit (Nd)
Definition: unicode.h:230
Compiler attribute macros.
Symbol, currency (Sc)
Definition: unicode.h:248
const unsigned & reference
We implement the semantics of an STL input_iterator.
Definition: unicode.h:208
Other, format (Cf)
Definition: unicode.h:237
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28
Separator, space (Zs)
Definition: unicode.h:233
unsigned to_utf8(unsigned ch, char *buf)
Convert a single Unicode character to UTF-8.
Definition: unicode.h:321
Letter, lowercase (Ll)
Definition: unicode.h:223
void assign(const std::string &s)
Assign a new string to the iterator.
Definition: unicode.h:92
Mark, nonspacing (Mn)
Definition: unicode.h:227
unsigned seqlen
Definition: unicode.h:41
Other, surrogate (Cs)
Definition: unicode.h:239
Letter, titlecase (Lt)
Definition: unicode.h:224
#define XAPIAN_CONST_FUNCTION
A function which does not examine any values except its arguments and has no effects except its return value.
Definition: attributes.h:62
int get_character_info(unsigned ch)
Extract information about a Unicode character.
Define XAPIAN_VISIBILITY_* macros.
Punctuation, dash (Pd)
Definition: unicode.h:241
const unsigned * pointer
We implement the semantics of an STL input_iterator.
Definition: unicode.h:207
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
Definition: utf8itor.cc:39
Letter, uppercase (Lu)
Definition: unicode.h:222
Punctuation, close (Pe)
Definition: unicode.h:243
Punctuation, open (Ps)
Definition: unicode.h:242
Utf8Iterator()
Create an iterator which is at the end of its iteration.
Definition: unicode.h:132
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Definition: unicode.h:72
Punctuation, connector (Pc)
Definition: unicode.h:240
std::input_iterator_tag iterator_category
We implement the semantics of an STL input_iterator.
Definition: unicode.h:204
Number, other (No)
Definition: unicode.h:232
std::string toupper(const std::string &term)
Convert a UTF-8 std::string to uppercase.
Definition: unicode.h:405
Utf8Iterator operator++(int)
Move forward to the next Unicode character.
Definition: unicode.h:161
int get_case_type(int info)
Definition: unicode.h:266
unsigned value_type
We implement the semantics of an STL input_iterator.
Definition: unicode.h:205
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38
Symbol, math (Sm)
Definition: unicode.h:247
Separator, line (Zl)
Definition: unicode.h:234
std::string tolower(const std::string &term)
Convert a UTF-8 std::string to lowercase.
Definition: unicode.h:393
int get_delta(int info)
Definition: unicode.h:278
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Definition: unicode.h:343
const unsigned char * end
Definition: unicode.h:40
Punctuation, other (Po)
Definition: unicode.h:246
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:361
Punctuation, initial quote (Pi)
Definition: unicode.h:244
Symbol, modified (Sk)
Definition: unicode.h:249
Other, private use (Co)
Definition: unicode.h:238
Utf8Iterator(const char *p_, size_t len)
Create an iterator given a pointer and a length.
Definition: unicode.h:114
Punctuation, final quote (Pf)
Definition: unicode.h:245
Utf8Iterator(const std::string &s)
Create an iterator given a string.
Definition: unicode.h:125
size_t difference_type
We implement the semantics of an STL input_iterator.
Definition: unicode.h:206
category get_category(unsigned ch)
Return the category which a given Unicode character falls into.
Definition: unicode.h:338
Separator, paragraph (Zp)
Definition: unicode.h:235
const unsigned char * p
Definition: unicode.h:39
Symbol, other (So)
Definition: unicode.h:250