sourcedoc/html/word-breaker_8h_source.html

 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
  * Copyright (c) 2011,2019,2023 Olly Betts
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 #ifndef XAPIAN_INCLUDED_WORD_BREAKER_H
 #define XAPIAN_INCLUDED_WORD_BREAKER_H

 #ifndef PACKAGE
 # error config.h must be included first in each C++ source file
 #endif

 #include "xapian/unicode.h"

 #include <string>

 /** Should we use the n-gram code? */
 bool is_ngram_enabled();

 /** Test a single Unicode codepoint.
  *
  *  NOTE(review): judging by the name, this presumably reports whether
  *  @a codepoint belongs to a script written without explicit word breaks;
  *  the definition is not in this header - confirm against word-breaker.cc.
  */
 bool is_unbroken_script(unsigned codepoint);

 /** Operate on a UTF-8 iterator passed by non-const reference.
  *
  *  NOTE(review): since @a it is a mutable reference the function can move
  *  it; presumably it advances past a run of unbroken-script characters -
  *  confirm against the definition in word-breaker.cc.
  */
 void get_unbroken(Xapian::Utf8Iterator& it);

 /** Iterator returning unigrams and bigrams.
  *
  *  Dereferencing yields the current token as a std::string.  Two iterators
  *  compare equal only when both currently hold an empty token, so equality
  *  is only useful for detecting the end of iteration.
  */
 class NgramIterator {
     /// Underlying UTF-8 iterator over the text being processed.
     Xapian::Utf8Iterator it;

     /// Offset to penultimate Unicode character in current_token.
     unsigned offset = 0;

     /// The token handed out by operator*().
     std::string current_token;

     /// Call to set current_token at the start.
     void init();

   public:
     /** Default constructor.
      *
      *  Leaves current_token empty, so the result compares equal to any
      *  other iterator whose token is empty (presumably this is how an end
      *  iterator is obtained - confirm against callers).
      */
     NgramIterator() { }

     /// Construct from a string and call init().
     explicit NgramIterator(const std::string& s)
         : it(s)
     {
         init();
     }

     /// Construct from a copy of an existing Utf8Iterator and call init().
     explicit NgramIterator(const Xapian::Utf8Iterator& it_)
         : it(it_)
     {
         init();
     }

     /// Access the current token.
     const std::string& operator*() const { return current_token; }

     /// Pre-increment; defined out of line.
     NgramIterator& operator++();

     /// Is this a unigram?
     bool unigram() const { return !offset; }

     /// Read-only access to the wrapped Utf8Iterator.
     const Xapian::Utf8Iterator& get_utf8iterator() const { return it; }

     /** Equality test.
      *
      *  The only comparisons that matter are those where at least one side
      *  is an end iterator, so this just checks that both tokens are empty.
      */
     bool operator==(const NgramIterator& other) const {
         if (!current_token.empty()) return false;
         return other.current_token.empty();
     }

     /// Negation of operator==().
     bool operator!=(const NgramIterator& other) const {
         return !operator==(other);
     }
 };

 #endif // XAPIAN_INCLUDED_WORD_BREAKER_H
unicode.h
Unicode and UTF-8 related classes and functions.

NgramIterator::get_utf8iterator
const Xapian::Utf8Iterator & get_utf8iterator() const
Definition: word-breaker.h:86

NgramIterator::NgramIterator
NgramIterator(const std::string &s)
Definition: word-breaker.h:67

NgramIterator
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:52

NgramIterator::operator++
NgramIterator & operator++()
Definition: word-breaker.cc:110

get_unbroken
void get_unbroken(Xapian::Utf8Iterator &it)
Definition: word-breaker.cc:86

NgramIterator::current_token
std::string current_token
Definition: word-breaker.h:61

NgramIterator::operator!=
bool operator!=(const NgramIterator &other) const
Definition: word-breaker.h:94

NgramIterator::operator*
const std::string & operator*() const
Definition: word-breaker.h:77

Xapian::Utf8Iterator
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38

NgramIterator::NgramIterator
NgramIterator()
Definition: word-breaker.h:75

NgramIterator::init
void init()
Call to set current_token at the start.
Definition: word-breaker.cc:96

is_unbroken_script
bool is_unbroken_script(unsigned codepoint)
Definition: word-breaker.cc:71

NgramIterator::it
Xapian::Utf8Iterator it
Definition: word-breaker.h:53

NgramIterator::offset
unsigned offset
Offset to penultimate Unicode character in current_token.
Definition: word-breaker.h:59

NgramIterator::unigram
bool unigram() const
Is this a unigram?
Definition: word-breaker.h:84

NgramIterator::operator==
bool operator==(const NgramIterator &other) const
Definition: word-breaker.h:88

is_ngram_enabled
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:41

NgramIterator::NgramIterator
NgramIterator(const Xapian::Utf8Iterator &it_)
Definition: word-breaker.h:71