28 #ifndef XAPIAN_INCLUDED_WORD_BREAKER_H
29 #define XAPIAN_INCLUDED_WORD_BREAKER_H
32 # error config.h must be included first in each C++ source file
42 # pragma GCC diagnostic push
43 # pragma GCC diagnostic ignored "-Wold-style-cast"
44 # pragma GCC diagnostic ignored "-Wundef"
47 # include <unicode/brkiter.h>
48 # include <unicode/unistr.h>
52 # pragma GCC diagnostic pop
114 return !(*
this == other);
120 std::string current_token;
124 const char* utf8_ptr;
/** Sentinel position value, initialised from ICU's UBRK_DONE.
 *
 *  The push/ignored/pop pragmas switch off -Wold-style-cast for this one
 *  declaration only and then restore the previous diagnostic state.
 *  NOTE(review): presumably needed because the UBRK_DONE macro expands to a
 *  C-style cast - confirm against ICU's ubrk.h.
 */
127 #pragma GCC diagnostic push
128 #pragma GCC diagnostic ignored "-Wold-style-cast"
129 static const int32_t done = UBRK_DONE;
130 #pragma GCC diagnostic pop
132 icu::BreakIterator *brk;
/** Construct an iterator from a pointer and a length.
 *
 *  Declaration only; the definition is not in this header section.
 *
 *  @param ptr	Start of the input buffer.
 *		NOTE(review): presumably UTF-8 text, going by the utf8_ptr
 *		member - confirm against the definition.
 *  @param len	Length of the buffer at @a ptr.
 *		NOTE(review): presumably a byte count - confirm.
 */
135 WordIterator(
const char* ptr,
size_t len);
/** Construct an iterator over the contents of a std::string.
 *
 *  Delegates to the (ptr, len) constructor, passing s.data() and s.size().
 *  Marked explicit, so a std::string is never implicitly converted to a
 *  WordIterator.
 *
 *  NOTE(review): whether the iterator keeps pointing into @a s after
 *  construction is not visible here; if it does, @a s must outlive the
 *  iterator - confirm against the (ptr, len) constructor.
 *
 *  @param s	String to iterate over.
 */
137 explicit WordIterator(
const std::string& s)
138 : WordIterator(s.data(), s.size()) { }
141 :
p(done), brk(NULL) { }
/** Destructor: deletes the break iterator pointed to by brk.
 *
 *  Deleting a null pointer is a no-op, so this is also safe for an object
 *  whose brk was initialised to NULL.
 *
 *  NOTE(review): no copy constructor or copy assignment is visible in this
 *  section.  If the implicitly generated ones are in use, two copies would
 *  delete the same brk - confirm that copying is prevented or handled
 *  elsewhere in the class.
 */
143 ~WordIterator() {
delete brk; }
146 return current_token;
/** Pre-increment operator; returns a reference to a WordIterator.
 *
 *  Declaration only; the definition is not in this header section.
 *  NOTE(review): presumably advances to the next token and returns *this -
 *  confirm against the definition.
 */
149 WordIterator& operator++();
151 bool operator==(
const WordIterator& other)
const {
155 bool operator!=(
const WordIterator& other)
const {
156 return !(*
this == other);
Iterator returning unigrams and bigrams.
NgramIterator & operator++()
bool operator!=(const NgramIterator &other) const
NgramIterator(const std::string &s)
const std::string & operator*() const
bool unigram() const
Is this a unigram?
const Xapian::Utf8Iterator & get_utf8iterator() const
unsigned offset
Offset to penultimate Unicode character in current_token.
std::string current_token
bool operator==(const NgramIterator &other) const
void init()
Call to set current_token at the start.
NgramIterator(const Xapian::Utf8Iterator &it_)
An iterator which returns Unicode character values from a UTF-8 encoded string.
bool operator==(const ESetIterator &a, const ESetIterator &b) noexcept
Equality test for ESetIterator objects.
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
bool operator!=(const Xapian::MSet &first, const Xapian::MSet &second)
Unicode and UTF-8 related classes and functions.
bool is_unbroken_wordchar(unsigned codepoint)
bool is_unbroken_script(unsigned codepoint)
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?