21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
39 const unsigned char*
p;
40 const unsigned char*
end;
43 bool XAPIAN_NOTHROW(calculate_sequence_length()
const);
48 const unsigned char* end_,
50 : p(p_), end(end_), seqlen(seqlen_) { }
54 const char*
raw()
const {
55 return reinterpret_cast<const char*
>(p ? p : end);
59 size_t left()
const {
return p ? end - p : 0; }
72 void assign(
const char* p_,
size_t len) {
74 p =
reinterpret_cast<const unsigned char*
>(p_);
133 : p(NULL), end(0), seqlen(0) { }
163 if (seqlen == 0) calculate_sequence_length();
164 const unsigned char* old_p = p;
165 unsigned old_seqlen = seqlen;
167 if (p == end) p = NULL;
177 if (seqlen == 0) calculate_sequence_length();
179 if (p == end) p = NULL;
272 return static_cast<category>(info & 0x1f);
291 return ((-1 >> 1) == -1 ?
296 (info >= 0) ? (info >> 8) : (~(~info >> 8)));
321 inline unsigned to_utf8(
unsigned ch,
char* buf) {
323 *buf =
static_cast<unsigned char>(ch);
334 s.append(buf,
to_utf8(ch, buf));
344 const unsigned int WORDCHAR_MASK =
362 const unsigned int WHITESPACE_MASK =
396 result.reserve(term.size());
408 result.reserve(term.size());
Compiler attribute macros.
#define XAPIAN_PURE_FUNCTION
Like XAPIAN_CONST_FUNCTION, but such a function can also examine global memory, perhaps via pointer or reference parameters.
#define XAPIAN_CONST_FUNCTION
A function which does not examine any values except its arguments and has no effects except its return value.
An iterator which returns Unicode character values from a UTF-8 encoded string.
const char * raw() const
Return the raw const char* pointer for the current position.
const unsigned char * end
Utf8Iterator & operator++()
Move forward to the next Unicode character.
Utf8Iterator(const std::string &s)
Create an iterator given a string.
std::input_iterator_tag iterator_category
We implement the semantics of an STL input_iterator.
size_t left() const
Return the number of bytes left in the iterator's buffer.
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
unsigned get_char() const
void assign(const std::string &s)
Assign a new string to the iterator.
Utf8Iterator(const char *p_, size_t len)
Create an iterator given a pointer and a length.
const unsigned & reference
Utf8Iterator()
Create an iterator which is at the end of its iteration.
Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
category get_category(int info)
int get_character_info(unsigned ch)
Extract information about a Unicode character.
int get_case_type(int info)
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
unsigned to_utf8(unsigned ch, char *buf)
Convert a single Unicode character to UTF-8.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
category
Each Unicode character is in exactly one of these categories.
@ MATH_SYMBOL
Symbol, math (Sm)
@ FORMAT
Other, format (Cf)
@ MODIFIER_SYMBOL
Symbol, modifier (Sk)
@ FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
@ PRIVATE_USE
Other, private use (Co)
@ INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
@ CONNECTOR_PUNCTUATION
Punctuation, connector (Pc)
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
@ MODIFIER_LETTER
Letter, modifier (Lm)
@ OTHER_SYMBOL
Symbol, other (So)
@ CURRENCY_SYMBOL
Symbol, currency (Sc)
@ UNASSIGNED
Other, not assigned (Cn)
@ ENCLOSING_MARK
Mark, enclosing (Me)
@ OTHER_LETTER
Letter, other (Lo)
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
@ CONTROL
Other, control (Cc)
@ LINE_SEPARATOR
Separator, line (Zl)
@ CLOSE_PUNCTUATION
Punctuation, close (Pe)
@ SURROGATE
Other, surrogate (Cs)
@ PARAGRAPH_SEPARATOR
Separator, paragraph (Zp)
@ SPACE_SEPARATOR
Separator, space (Zs)
@ COMBINING_SPACING_MARK
Mark, spacing combining (Mc)
@ OPEN_PUNCTUATION
Punctuation, open (Ps)
@ NON_SPACING_MARK
Mark, nonspacing (Mn)
@ DASH_PUNCTUATION
Punctuation, dash (Pd)
@ OTHER_PUNCTUATION
Punctuation, other (Po)
@ OTHER_NUMBER
Number, other (No)
@ TITLECASE_LETTER
Letter, titlecase (Lt)
@ LETTER_NUMBER
Number, letter (Nl)
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
unsigned toupper(unsigned ch)
Convert a Unicode character to uppercase.
category get_category(unsigned ch)
Return the category which a given Unicode character falls into.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
The Xapian namespace contains public interfaces for the Xapian library.
Define XAPIAN_VISIBILITY_* macros.
#define XAPIAN_VISIBILITY_DEFAULT