46 static bool result = ((
p = getenv(
"XAPIAN_CJK_NGRAM")) != NULL && *
p);
60 static const unsigned splits[] = {
111 0x1AFF0 - 1, 0x1B16F,
113 0x1F200 - 1, 0x1F2FF,
115 0x20000 - 1, 0x2A6DF,
121 0x2A700 - 1, 0x2EE5F,
123 0x2F800 - 1, 0x2FA1F,
131 auto it = lower_bound(begin(splits), end(splits),
p);
132 return ((it - splits) & 1);
144 size_t char_count = 0;
161 current_token.resize(0);
173 offset = current_token.size();
177 current_token.resize(0);
180 current_token.resize(0);
183 current_token.erase(0, offset);
190 WordIterator::WordIterator(
const char* ptr,
size_t len)
192 UErrorCode err = U_ZERO_ERROR;
193 UText utext = UTEXT_INITIALIZER;
194 brk = icu::BreakIterator::createWordInstance(0, err);
195 if (
usual(U_SUCCESS(err))) {
196 utext_openUTF8(&utext, ptr, len, &err);
197 if (
usual(U_SUCCESS(err)))
198 brk->setText(&utext, err);
201 if (
rare(U_FAILURE(err)))
203 int32_t first = brk->first();
206 current_token.assign(utf8_ptr + first,
p - first);
210 WordIterator::operator++()
215 current_token.assign(utf8_ptr + first,
p - first);
Iterator returning unigrams and bigrams.
NgramIterator & operator++()
void init()
Call to set current_token at the start.
InternalError indicates a runtime problem of some sort.
virtual bool next()=0
Advance to the next entry in the positionlist.
An iterator which returns Unicode character values from a UTF-8 encoded string.
Hierarchy of classes which Xapian can throw as exceptions.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Various assertion macros.
Unicode and UTF-8 related classes and functions.
bool is_unbroken_script(unsigned p)
bool is_unbroken_wordchar(unsigned p)
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?
Handle text without explicit word breaks.