44 static bool result = ((p = getenv(
"XAPIAN_CJK_NGRAM")) != NULL && *p);
73 if (p < 0x2E80)
return false;
74 return ((p >= 0x2E80 && p <= 0x2EFF) ||
75 (p >= 0x3000 && p <= 0x9FFF) ||
76 (p >= 0xA700 && p <= 0xA71F) ||
77 (p >= 0xAC00 && p <= 0xD7AF) ||
78 (p >= 0xF900 && p <= 0xFAFF) ||
79 (p >= 0xFE30 && p <= 0xFE4F) ||
80 (p >= 0xFF00 && p <= 0xFFEF) ||
81 (p >= 0x20000 && p <= 0x2A6DF) ||
82 (p >= 0x2F800 && p <= 0x2FA1F));
103 current_token.resize(0);
115 offset = current_token.size();
119 current_token.resize(0);
122 current_token.resize(0);
125 current_token.erase(0, offset);
Unicode and UTF-8 related classes and functions.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
void init()
Call to set current_token at the start.
bool is_cjk_enabled()
Should we use the CJK n-gram code?
bool codepoint_is_cjk(unsigned codepoint)
Tokenise CJK text as n-grams.
Iterator returning unigrams and bigrams.
An iterator which returns Unicode character values from a UTF-8 encoded string.
CJKTokenIterator & operator++()
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
void get_cjk(Xapian::Utf8Iterator &it)
Various assertion macros.