xapian-core  1.4.27
word-breaker.h
Go to the documentation of this file.
1 
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7  * Copyright (c) 2011,2019,2023 Olly Betts
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to deal
11  * in the Software without restriction, including without limitation the
12  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13  * sell copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #ifndef XAPIAN_INCLUDED_WORD_BREAKER_H
29 #define XAPIAN_INCLUDED_WORD_BREAKER_H
30 
31 #ifndef PACKAGE
32 # error config.h must be included first in each C++ source file
33 #endif
34 
35 #include "xapian/unicode.h"
36 
37 #include <string>
38 
/** Should we use the n-gram code? */
45 bool is_ngram_enabled();
46 
/** Test a Unicode codepoint.
 *
 *  NOTE(review): presumably reports whether @a codepoint belongs to a script
 *  written without explicit word breaks - confirm against the definition in
 *  word-breaker.cc.
 */
47 bool is_unbroken_script(unsigned codepoint);
48 
50 
54 
/** Offset to penultimate Unicode character in current_token.
 *
 *  unigram() reports true while this is 0.
 */
59  unsigned offset = 0;
60 
/** The token handed out by operator*().
 *
 *  operator==() treats an empty token as the mark of an end iterator.
 */
61  std::string current_token;
62 
/** Call to set current_token at the start.
 *
 *  Both constructors invoke this once the underlying iterator is set up.
 */
64  void init();
65 
66  public:
/** Construct from a string.
 *
 *  The underlying Xapian::Utf8Iterator is initialised from @a s, then init()
 *  is called to set the first token.
 *
 *  @param s  The text to iterate over.
 */
67  explicit NgramIterator(const std::string& s) : it(s) {
68  init();
69  }
70 
/** Construct from an existing Xapian::Utf8Iterator.
 *
 *  A copy of @a it_ is taken as the underlying iterator, then init() is
 *  called to set the first token.
 *
 *  @param it_  Iterator to copy as the underlying iterator.
 */
71  explicit NgramIterator(const Xapian::Utf8Iterator& it_) : it(it_) {
72  init();
73  }
74 
76 
/** Access the current token.
 *
 *  @return  A const reference to current_token (no copy is made).
 */
77  const std::string& operator*() const {
78  return current_token;
79  }
80 
82 
/** Is this a unigram?
 *
 *  @return  true when offset is 0.
 */
84  bool unigram() const { return offset == 0; }
85 
/** Access the underlying Xapian::Utf8Iterator.
 *
 *  @return  A const reference to the member iterator (no copy is made).
 */
86  const Xapian::Utf8Iterator& get_utf8iterator() const { return it; }
87 
/** Equality test, meaningful only for detecting the end of iteration.
 *
 *  Only the emptiness of current_token is examined: the result is true just
 *  when both operands hold an empty token.  As a consequence, two iterators
 *  which both still hold a token compare unequal, even if they are otherwise
 *  identical.
 */
88  bool operator==(const NgramIterator& other) const {
89  // We only really care about comparisons where one or other is an end
90  // iterator.
91  return current_token.empty() && other.current_token.empty();
92  }
93 
/** Inequality test: the exact negation of operator==().
 *
 *  True unless both operands hold an empty current_token.
 */
94  bool operator!=(const NgramIterator& other) const {
95  return !(*this == other);
96  }
97 };
98 
99 #endif // XAPIAN_INCLUDED_WORD_BREAKER_H
Unicode and UTF-8 related classes and functions.
const Xapian::Utf8Iterator & get_utf8iterator() const
Definition: word-breaker.h:86
NgramIterator(const std::string &s)
Definition: word-breaker.h:67
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:52
NgramIterator & operator++()
void get_unbroken(Xapian::Utf8Iterator &it)
Definition: word-breaker.cc:86
std::string current_token
Definition: word-breaker.h:61
bool operator!=(const NgramIterator &other) const
Definition: word-breaker.h:94
const std::string & operator*() const
Definition: word-breaker.h:77
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38
void init()
Call to set current_token at the start.
Definition: word-breaker.cc:96
bool is_unbroken_script(unsigned codepoint)
Definition: word-breaker.cc:71
Xapian::Utf8Iterator it
Definition: word-breaker.h:53
unsigned offset
Offset to penultimate Unicode character in current_token.
Definition: word-breaker.h:59
bool unigram() const
Is this a unigram?
Definition: word-breaker.h:84
bool operator==(const NgramIterator &other) const
Definition: word-breaker.h:88
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:41
NgramIterator(const Xapian::Utf8Iterator &it_)
Definition: word-breaker.h:71