xapian-core  2.0.0
unicode.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
23 
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
26 #endif
27 
28 #include <xapian/attributes.h>
29 #include <xapian/visibility.h>
30 
31 #include <string>
32 #include <string_view>
33 
34 namespace Xapian {
35 
40  const unsigned char* p; // Current position in the buffer; NULL once the end has been reached.
 // `end` points one past the last byte of the buffer (assign() sets it to
 // the start pointer plus the length).
41  const unsigned char* end;
 // Length in bytes of the sequence at `p`, or 0 if it has not been
 // calculated yet.  Declared mutable, presumably so that the const helper
 // below can cache the value - NOTE(review): confirm in the implementation.
42  mutable unsigned seqlen;
43 
 // Invoked by the increment operators when `seqlen` is still 0.
 // NOTE(review): defined out of line; the meaning of the bool result is not
 // visible in this header.
44  bool calculate_sequence_length() const noexcept;
45 
 // NOTE(review): defined out of line; presumably decodes the character at
 // the current position - confirm against the implementation.
46  unsigned get_char() const;
47 
48  Utf8Iterator(const unsigned char* p_,
49  const unsigned char* end_,
50  unsigned seqlen_)
51  : p(p_), end(end_), seqlen(seqlen_) { }
52 
53  public:
55  const char* raw() const {
56  return reinterpret_cast<const char*>(p ? p : end);
57  }
58 
60  size_t left() const { return p ? end - p : 0; }
61 
73  void assign(const char* p_, size_t len) {
74  if (len) {
75  p = reinterpret_cast<const unsigned char*>(p_);
76  end = p + len;
77  seqlen = 0;
78  } else {
79  p = NULL;
80  }
81  }
82 
93  void assign(std::string_view s) { assign(s.data(), s.size()); }
94 
105  Utf8Iterator(const char* p_, size_t len) { assign(p_, len); }
106 
121  explicit
122  Utf8Iterator(std::string_view s) { assign(s.data(), s.size()); }
123 
129  Utf8Iterator() noexcept
130  : p(NULL), end(0), seqlen(0) { }
131 
147  unsigned operator*() const noexcept XAPIAN_PURE_FUNCTION; // Dereference; defined out of line.
 // The case-conversion helpers in this header pass the result of operator*
 // on as a Unicode character value.
148 
 // NOTE(review): strict_deref() is only declared here; presumably a
 // stricter variant of operator* - its exact behaviour for invalid input is
 // not visible in this header, so confirm against the implementation.
163  unsigned strict_deref() const noexcept XAPIAN_PURE_FUNCTION;
164 
169  Utf8Iterator operator++(int) {
170  // If we've not calculated seqlen yet, do so.
171  if (seqlen == 0) calculate_sequence_length();
172  const unsigned char* old_p = p;
173  unsigned old_seqlen = seqlen;
174  p += seqlen;
175  if (p == end) p = NULL;
176  seqlen = 0;
177  return Utf8Iterator(old_p, end, old_seqlen);
178  }
179 
185  if (seqlen == 0) calculate_sequence_length();
186  p += seqlen;
187  if (p == end) p = NULL;
188  seqlen = 0;
189  return *this;
190  }
191 
197  bool operator==(const Utf8Iterator& other) const noexcept {
198  return p == other.p;
199  }
200 
206  bool operator!=(const Utf8Iterator& other) const noexcept {
207  return p != other.p;
208  }
209 
211 
212  typedef std::input_iterator_tag iterator_category;
213  typedef unsigned value_type;
214  typedef size_t difference_type;
215  typedef value_type* pointer;
218 };
219 
221 namespace Unicode {
222 
228 typedef enum {
258  OTHER_SYMBOL
260 
261 namespace Internal {
269  int get_character_info(unsigned ch) noexcept XAPIAN_CONST_FUNCTION; // Extract information about a Unicode character.
 // The helpers below decode the returned info word: get_category() reads
 // the low five bits and get_delta() takes the value shifted right by 8.
270 
 // Flag bits in the info word: the case-conversion functions in this header
 // return the character unchanged unless the corresponding bit is set.
278  enum { INFO_TOLOWER_MASK = 0x40, INFO_TOUPPER_MASK = 0x80 };
279 
283  inline category get_category(int info) {
284  return static_cast<category>(info & 0x1f);
285  }
286 
290  inline int get_delta(int info) {
291  /* It's implementation defined if sign extension happens when right
292  * shifting a signed int, although in practice sign extension is what
293  * most compilers implement.
294  *
295  * Some compilers are smart enough to spot common idioms for sign
296  * extension, but not all (e.g. GCC < 7 doesn't spot the one used
297  * below), so check what the implementation-defined behaviour is with
298  * a constant conditional which should get optimised away.
299  *
300  * We use the ternary operator here to avoid various compiler
301  * warnings which writing this as an `if` results in.
302  */
303  return ((-1 >> 1) == -1 ?
304  // Right shift sign-extends.
305  info >> 8 :
306  // Right shift shifts in zeros so bitwise-not before and after
307  // the shift for negative values.
308  (info >= 0) ? (info >> 8) : (~(~info >> 8)));
309  }
310 }
311 
323 unsigned nonascii_to_utf8(unsigned ch, char* buf); // Convert a single non-ASCII Unicode character to UTF-8.
// Defined out of line.  to_utf8() forwards to this for ch >= 128, and
// append_utf8() uses the returned value as the number of bytes to append
// from buf.
324 
333 inline unsigned to_utf8(unsigned ch, char* buf) {
334  if (ch < 128) {
335  *buf = static_cast<unsigned char>(ch);
336  return 1;
337  }
338  return Xapian::Unicode::nonascii_to_utf8(ch, buf);
339 }
340 
344 inline void append_utf8(std::string& s, unsigned ch) {
345  char buf[4];
346  s.append(buf, to_utf8(ch, buf));
347 }
348 
350 inline category get_category(unsigned ch) {
352 }
353 
355 inline bool is_wordchar(unsigned ch) {
356  const unsigned int WORDCHAR_MASK =
369  return ((WORDCHAR_MASK >> get_category(ch)) & 1);
370 }
371 
373 inline bool is_whitespace(unsigned ch) {
374  const unsigned int WHITESPACE_MASK =
375  (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
379  return ((WHITESPACE_MASK >> get_category(ch)) & 1);
380 }
381 
383 inline bool is_currency(unsigned ch) {
385 }
386 
388 inline unsigned tolower(unsigned ch) {
390  if (!(info & Internal::INFO_TOLOWER_MASK))
391  return ch;
392  return unsigned(int(ch) + Internal::get_delta(info));
393 }
394 
396 inline unsigned toupper(unsigned ch) {
398  if (!(info & Internal::INFO_TOUPPER_MASK))
399  return ch;
400  return unsigned(int(ch) - Internal::get_delta(info));
401 }
402 
404 inline std::string
405 tolower(std::string_view term)
406 {
407  std::string result;
408  result.reserve(term.size());
409  for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
410  append_utf8(result, tolower(*i));
411  }
412  return result;
413 }
414 
416 inline std::string
417 toupper(std::string_view term)
418 {
419  std::string result;
420  result.reserve(term.size());
421  for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
422  append_utf8(result, toupper(*i));
423  }
424  return result;
425 }
426 
427 }
428 
429 }
430 
431 #endif // XAPIAN_INCLUDED_UNICODE_H
Compiler attribute macros.
#define XAPIAN_PURE_FUNCTION
Like XAPIAN_CONST_FUNCTION, but such a function can also examine global memory, perhaps via pointer or reference parameters.
Definition: attributes.h:59
#define XAPIAN_CONST_FUNCTION
A function which does not examine any values except its arguments and has no effects except its return value.
Definition: attributes.h:54
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
const char * raw() const
Return the raw const char* pointer for the current position.
Definition: unicode.h:55
void assign(std::string_view s)
Assign a new string to the iterator.
Definition: unicode.h:93
Utf8Iterator() noexcept
Create an iterator which is at the end of its iteration.
Definition: unicode.h:129
const unsigned char * end
Definition: unicode.h:41
Utf8Iterator & operator++()
Move forward to the next Unicode character.
Definition: unicode.h:184
unsigned seqlen
Definition: unicode.h:42
std::input_iterator_tag iterator_category
We implement the semantics of an STL input_iterator.
Definition: unicode.h:212
size_t left() const
Return the number of bytes left in the iterator's buffer.
Definition: unicode.h:60
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Definition: unicode.h:73
Utf8Iterator(const char *p_, size_t len)
Create an iterator given a pointer and a length.
Definition: unicode.h:105
bool operator!=(const Utf8Iterator &other) const noexcept
Test two Utf8Iterators for inequality.
Definition: unicode.h:206
value_type reference
Definition: unicode.h:216
value_type * pointer
Definition: unicode.h:215
const unsigned char * p
Definition: unicode.h:40
size_t difference_type
Definition: unicode.h:214
unsigned value_type
Definition: unicode.h:213
bool operator==(const Utf8Iterator &other) const noexcept
Test two Utf8Iterators for equality.
Definition: unicode.h:197
Utf8Iterator(std::string_view s)
Create an iterator given a string.
Definition: unicode.h:122
string term
PositionList * p
int get_character_info(unsigned ch) noexcept
Extract information about a Unicode character.
category get_category(int info)
Definition: unicode.h:283
int get_delta(int info)
Definition: unicode.h:290
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:344
unsigned to_utf8(unsigned ch, char *buf)
Convert a single Unicode character to UTF-8.
Definition: unicode.h:333
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Definition: unicode.h:388
category
Each Unicode character is in exactly one of these categories.
Definition: unicode.h:228
@ MATH_SYMBOL
Symbol, math (Sm)
Definition: unicode.h:255
@ FORMAT
Other, format (Cf)
Definition: unicode.h:245
@ MODIFIER_SYMBOL
Symbol, modifier (Sk)
Definition: unicode.h:257
@ FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
Definition: unicode.h:253
@ PRIVATE_USE
Other, private use (Co)
Definition: unicode.h:246
@ INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
Definition: unicode.h:252
@ CONNECTOR_PUNCTUATION
Punctuation, connector (Pc)
Definition: unicode.h:248
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
Definition: unicode.h:231
@ MODIFIER_LETTER
Letter, modifier (Lm)
Definition: unicode.h:233
@ OTHER_SYMBOL
Symbol, other (So)
Definition: unicode.h:258
@ CURRENCY_SYMBOL
Symbol, currency (Sc)
Definition: unicode.h:256
@ UNASSIGNED
Other, not assigned (Cn)
Definition: unicode.h:229
@ ENCLOSING_MARK
Mark, enclosing (Me)
Definition: unicode.h:236
@ OTHER_LETTER
Letter, other (Lo)
Definition: unicode.h:234
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
Definition: unicode.h:238
@ CONTROL
Other, control (Cc)
Definition: unicode.h:244
@ LINE_SEPARATOR
Separator, line (Zl)
Definition: unicode.h:242
@ CLOSE_PUNCTUATION
Punctuation, close (Pe)
Definition: unicode.h:251
@ SURROGATE
Other, surrogate (Cs)
Definition: unicode.h:247
@ PARAGRAPH_SEPARATOR
Separator, paragraph (Zp)
Definition: unicode.h:243
@ SPACE_SEPARATOR
Separator, space (Zs)
Definition: unicode.h:241
@ COMBINING_SPACING_MARK
Mark, spacing combining (Mc)
Definition: unicode.h:237
@ OPEN_PUNCTUATION
Punctuation, open (Ps)
Definition: unicode.h:250
@ NON_SPACING_MARK
Mark, nonspacing (Mn)
Definition: unicode.h:235
@ DASH_PUNCTUATION
Punctuation, dash (Pd)
Definition: unicode.h:249
@ OTHER_PUNCTUATION
Punctuation, other (Po)
Definition: unicode.h:254
@ OTHER_NUMBER
Number, other (No)
Definition: unicode.h:240
@ TITLECASE_LETTER
Letter, titlecase (Lt)
Definition: unicode.h:232
@ LETTER_NUMBER
Number, letter (Nl)
Definition: unicode.h:239
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
Definition: unicode.h:230
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Definition: unicode.h:355
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:383
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
Definition: unicode.h:396
category get_category(unsigned ch)
Return the category which a given Unicode character falls into.
Definition: unicode.h:350
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:373
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
Definition: utf8itor.cc:37
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Definition: query.h:827
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT
Definition: visibility.h:28