xapian-core  2.0.0
word-breaker.h
Go to the documentation of this file.
1 
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7  * Copyright (c) 2011,2018,2019,2023 Olly Betts
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to
11  * deal in the Software without restriction, including without limitation the
12  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13  * sell copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #ifndef XAPIAN_INCLUDED_WORD_BREAKER_H
29 #define XAPIAN_INCLUDED_WORD_BREAKER_H
30 
31 #ifndef PACKAGE
32 # error config.h must be included first in each C++ source file
33 #endif
34 
35 #include "xapian/unicode.h"
36 
37 #include <string>
38 
39 #ifdef USE_ICU
40 # ifdef __GNUC__
41 // Turn off some warnings for libicu headers.
42 # pragma GCC diagnostic push
43 # pragma GCC diagnostic ignored "-Wold-style-cast"
44 # pragma GCC diagnostic ignored "-Wundef"
45 # endif
46 
47 # include <unicode/brkiter.h>
48 # include <unicode/unistr.h>
49 
50 # ifdef __GNUC__
51 // Restore the original warning state.
52 # pragma GCC diagnostic pop
53 # endif
54 #endif
55 
/// Should we use the n-gram code?
62 bool is_ngram_enabled();
63 
// NOTE(review): the name suggests this reports whether `codepoint` belongs to
// a script that is written without explicit word breaks - confirm against
// the definition in word-breaker.cc.
64 bool is_unbroken_script(unsigned codepoint);
65 
// NOTE(review): presumably reports whether `codepoint` counts as a word
// character within such a script - the definition is not visible here, so
// verify before relying on this.
66 bool is_unbroken_wordchar(unsigned codepoint);
67 
69 
73 
/// Offset to penultimate Unicode character in current_token.
///
/// unigram() reports true exactly when this is zero.
78  unsigned offset = 0;
79 
/// The token handed out by operator*().
///
/// operator==() treats two iterators as equal when both have an empty
/// current_token (the comparison is only meant for end-iterator checks).
80  std::string current_token;
81 
/// Call to set current_token at the start.
///
/// Invoked by both explicit constructors.
83  void init();
84 
85  public:
86  explicit NgramIterator(const std::string& s) : it(s) {
87  init();
88  }
89 
90  explicit NgramIterator(const Xapian::Utf8Iterator& it_) : it(it_) {
91  init();
92  }
93 
95 
96  const std::string& operator*() const {
97  return current_token;
98  }
99 
101 
103  bool unigram() const { return offset == 0; }
104 
105  const Xapian::Utf8Iterator& get_utf8iterator() const { return it; }
106 
107  bool operator==(const NgramIterator& other) const {
108  // We only really care about comparisons where one or other is an end
109  // iterator.
110  return current_token.empty() && other.current_token.empty();
111  }
112 
113  bool operator!=(const NgramIterator& other) const {
114  return !(*this == other);
115  }
116 };
117 
118 #ifdef USE_ICU
119 class WordIterator {
120  std::string current_token;
121 
122  int32_t p;
123 
124  const char* utf8_ptr;
125 
126  // copy UBRK_DONE to avoid GCC old-style cast error
127 #pragma GCC diagnostic push
128 #pragma GCC diagnostic ignored "-Wold-style-cast"
129  static const int32_t done = UBRK_DONE;
130 #pragma GCC diagnostic pop
131 
132  icu::BreakIterator *brk;
133 
134  public:
135  WordIterator(const char* ptr, size_t len);
136 
137  explicit WordIterator(const std::string& s)
138  : WordIterator(s.data(), s.size()) { }
139 
140  WordIterator()
141  : p(done), brk(NULL) { }
142 
143  ~WordIterator() { delete brk; }
144 
145  const std::string& operator*() const {
146  return current_token;
147  }
148 
149  WordIterator& operator++();
150 
151  bool operator==(const WordIterator& other) const {
152  return p == other.p;
153  }
154 
155  bool operator!=(const WordIterator& other) const {
156  return !(*this == other);
157  }
158 };
159 #endif
160 
161 #endif // XAPIAN_INCLUDED_WORD_BREAKER_H
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:71
NgramIterator & operator++()
bool operator!=(const NgramIterator &other) const
Definition: word-breaker.h:113
NgramIterator(const std::string &s)
Definition: word-breaker.h:86
const std::string & operator*() const
Definition: word-breaker.h:96
bool unigram() const
Is this a unigram?
Definition: word-breaker.h:103
const Xapian::Utf8Iterator & get_utf8iterator() const
Definition: word-breaker.h:105
unsigned offset
Offset to penultimate Unicode character in current_token.
Definition: word-breaker.h:78
std::string current_token
Definition: word-breaker.h:80
bool operator==(const NgramIterator &other) const
Definition: word-breaker.h:107
void init()
Call to set current_token at the start.
Xapian::Utf8Iterator it
Definition: word-breaker.h:72
NgramIterator(const Xapian::Utf8Iterator &it_)
Definition: word-breaker.h:90
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
PositionList * p
bool operator==(const ESetIterator &a, const ESetIterator &b) noexcept
Equality test for ESetIterator objects.
Definition: eset.h:271
const Query operator*(double factor, const Query &q)
Scale a Xapian::Query object using OP_SCALE_WEIGHT.
Definition: query.h:827
bool operator!=(const Xapian::MSet &first, const Xapian::MSet &second)
Definition: testutils.h:57
Unicode and UTF-8 related classes and functions.
bool is_unbroken_wordchar(unsigned codepoint)
bool is_unbroken_script(unsigned codepoint)
Definition: word-breaker.cc:51
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:43