21 #ifndef XAPIAN_INCLUDED_UNICODE_H 22 #define XAPIAN_INCLUDED_UNICODE_H 24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD 25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead. 39 const unsigned char*
p;
40 const unsigned char*
end;
43 bool XAPIAN_NOTHROW(calculate_sequence_length()
const);
45 unsigned get_char()
const;
48 const unsigned char* end_,
50 : p(p_), end(end_), seqlen(seqlen_) { }
54 const char*
raw()
const {
55 return reinterpret_cast<const char*
>(p ? p : end);
// Return the number of bytes left in the iterator's buffer.
//
// Yields the distance from the current position `p` to `end`, or 0 when
// `p` is null (the increment operators set `p` to NULL once it reaches
// `end`, and the default constructor starts with a null `p`).
59 size_t left()
const {
return p ? end - p : 0; }
72 void assign(
const char* p_,
size_t len) {
74 p =
reinterpret_cast<const unsigned char*
>(p_);
133 : p(NULL), end(0), seqlen(0) { }
163 if (seqlen == 0) calculate_sequence_length();
164 const unsigned char* old_p = p;
165 unsigned old_seqlen = seqlen;
167 if (p == end) p = NULL;
169 return Utf8Iterator(old_p, end, old_seqlen);
177 if (seqlen == 0) calculate_sequence_length();
179 if (p == end) p = NULL;
189 bool XAPIAN_NOTHROW(
operator==(
const Utf8Iterator& other)
const) {
198 bool XAPIAN_NOTHROW(
operator!=(
const Utf8Iterator& other)
const) {
272 return static_cast<category
>(info & 0x1f);
291 return ((-1 >> 1) == -1 ?
296 (info >= 0) ? (info >> 8) : (~(~info >> 8)));
321 inline unsigned to_utf8(
unsigned ch,
char* buf) {
323 *buf =
static_cast<unsigned char>(ch);
334 s.append(buf,
to_utf8(ch, buf));
344 const unsigned int WORDCHAR_MASK =
362 const unsigned int WHITESPACE_MASK =
396 result.reserve(term.size());
397 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
408 result.reserve(term.size());
409 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
419 #endif // XAPIAN_INCLUDED_UNICODE_H
The Xapian namespace contains public interfaces for the Xapian library.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
#define XAPIAN_PURE_FUNCTION
Like XAPIAN_CONST_FUNCTION, but such a function can also examine global memory, perhaps via pointer or reference parameters.
const char * raw() const
Return the raw const char* pointer for the current position.
Utf8Iterator & operator++()
Move forward to the next Unicode character.
Mark, spacing combining (Mc)
size_t left() const
Return the number of bytes left in the iterator's buffer.
Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
category
Each Unicode character is in exactly one of these categories.
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Number, decimal digit (Nd)
Compiler attribute macros.
const unsigned & reference
We implement the semantics of an STL input_iterator.
#define XAPIAN_VISIBILITY_DEFAULT
unsigned to_utf8(unsigned ch, char *buf)
Convert a single Unicode character to UTF-8.
void assign(const std::string &s)
Assign a new string to the iterator.
#define XAPIAN_CONST_FUNCTION
A function which does not examine any values except its arguments and has no effects except its return value.
int get_character_info(unsigned ch)
Extract information about a Unicode character.
Define XAPIAN_VISIBILITY_* macros.
const unsigned * pointer
We implement the semantics of an STL input_iterator.
unsigned nonascii_to_utf8(unsigned ch, char *buf)
Convert a single non-ASCII Unicode character to UTF-8.
Utf8Iterator()
Create an iterator which is at the end of its iteration.
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Punctuation, connector (Pc)
std::input_iterator_tag iterator_category
We implement the semantics of an STL input_iterator.
std::string toupper(const std::string &term)
Convert a UTF-8 std::string to uppercase.
Utf8Iterator operator++(int)
Move forward to the next Unicode character.
int get_case_type(int info)
unsigned value_type
We implement the semantics of an STL input_iterator.
An iterator which returns Unicode character values from a UTF-8 encoded string.
std::string tolower(const std::string &term)
Convert a UTF-8 std::string to lowercase.
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
const unsigned char * end
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Punctuation, initial quote (Pi)
Utf8Iterator(const char *p_, size_t len)
Create an iterator given a pointer and a length.
Punctuation, final quote (Pf)
Utf8Iterator(const std::string &s)
Create an iterator given a string.
size_t difference_type
We implement the semantics of an STL input_iterator.
category get_category(unsigned ch)
Return the category which a given Unicode character falls into.
Separator, paragraph (Zp)