21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
32 #include <string_view>
40 const unsigned char*
p;
41 const unsigned char*
end;
44 bool calculate_sequence_length() const noexcept;
46 unsigned get_char() const;
49 const
unsigned char* end_,
51 :
p(p_), end(end_), seqlen(seqlen_) { }
55 const char*
raw()
const {
56 return reinterpret_cast<const char*
>(
p ?
p : end);
60 size_t left()
const {
return p ? end -
p : 0; }
73 void assign(
const char* p_,
size_t len) {
75 p =
reinterpret_cast<const unsigned char*
>(p_);
130 :
p(NULL), end(0), seqlen(0) { }
171 if (seqlen == 0) calculate_sequence_length();
172 const unsigned char* old_p =
p;
173 unsigned old_seqlen = seqlen;
175 if (
p == end)
p = NULL;
185 if (seqlen == 0) calculate_sequence_length();
187 if (
p == end)
p = NULL;
284 return static_cast<category>(info & 0x1f);
303 return ((-1 >> 1) == -1 ?
308 (info >= 0) ? (info >> 8) : (~(~info >> 8)));
333 inline unsigned to_utf8(
unsigned ch,
char* buf) {
335 *buf =
static_cast<unsigned char>(ch);
346 s.append(buf,
to_utf8(ch, buf));
356 const unsigned int WORDCHAR_MASK =
374 const unsigned int WHITESPACE_MASK =
408 result.reserve(
term.size());
420 result.reserve(
term.size());
Compiler attribute macros.
#define XAPIAN_PURE_FUNCTION
Like XAPIAN_CONST_FUNCTION, but such a function can also examine global memory, perhaps via pointer or reference parameters.
#define XAPIAN_CONST_FUNCTION
A function which does not examine any values except its arguments and has no effects except its return value.
An iterator which returns Unicode character values from a UTF-8 encoded string.
const char * raw() const
Return the raw const char* pointer for the current position.
void assign(std::string_view s)
Assign a new string to the iterator.
Utf8Iterator() noexcept
Create an iterator which is at the end of its iteration.
const unsigned char * end
Utf8Iterator & operator++()
Move forward to the next Unicode character.
std::input_iterator_tag iterator_category
We implement the semantics of an STL input_iterator.
size_t left() const
Return the number of bytes left in the iterator's buffer.
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Utf8Iterator(const char *p_, size_t len)
Create an iterator given a pointer and a length.
bool operator!=(const Utf8Iterator &other) const noexcept
Test two Utf8Iterators for inequality.
bool operator==(const Utf8Iterator &other) const noexcept
Test two Utf8Iterators for equality.
Utf8Iterator(std::string_view s)
Create an iterator given a string.
int get_character_info(unsigned ch) noexcept
Extract information about a Unicode character.
category get_category(int info)
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
unsigned to_utf8(unsigned ch, char *buf)
Convert a single Unicode character to UTF-8.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
category
Each Unicode character is in exactly one of these categories.
@ MATH_SYMBOL
Symbol, math (Sm)
@ FORMAT
Other, format (Cf)
@ MODIFIER_SYMBOL
Symbol, modifier (Sk)
@ FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
@ PRIVATE_USE
Other, private use (Co)
@ INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
@ CONNECTOR_PUNCTUATION
Punctuation, connector (Pc)
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
@ MODIFIER_LETTER
Letter, modifier (Lm)
@ OTHER_SYMBOL
Symbol, other (So)
@ CURRENCY_SYMBOL
Symbol, currency (Sc)
@ UNASSIGNED
Other, not assigned (Cn)
@ ENCLOSING_MARK
Mark, enclosing (Me)
@ OTHER_LETTER
Letter, other (Lo)
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
@ CONTROL
Other, control (Cc)
@ LINE_SEPARATOR
Separator, line (Zl)
@ CLOSE_PUNCTUATION
Punctuation, close (Pe)
@ SURROGATE
Other, surrogate (Cs)
@ PARAGRAPH_SEPARATOR
Separator, paragraph (Zp)
@ SPACE_SEPARATOR
Separator, space (Zs)
@ COMBINING_SPACING_MARK
Mark, spacing combining (Mc)
@ OPEN_PUNCTUATION
Punctuation, open (Ps)
@ NON_SPACING_MARK
Mark, nonspacing (Mn)
@ DASH_PUNCTUATION
Punctuation, dash (Pd)
@ OTHER_PUNCTUATION
Punctuation, other (Po)
@ OTHER_NUMBER
Number, other (No)
@ TITLECASE_LETTER
Letter, titlecase (Lt)
@ LETTER_NUMBER
Number, letter (Nl)
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
category get_category(unsigned ch)
Return the category which a given Unicode character falls into.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
The Xapian namespace contains public interfaces for the Xapian library.
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT