xapian-core  2.0.0
word-breaker.cc
Go to the documentation of this file.
1 
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7  * Copyright (c) 2011,2018,2019,2023 Olly Betts
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to
11  * deal in the Software without restriction, including without limitation the
12  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13  * sell copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include <config.h>
29 
30 #include "word-breaker.h"
31 
32 #include "omassert.h"
33 #include "xapian/unicode.h"
34 #include "xapian/error.h"
35 
36 #include <algorithm>
37 #include <cstdlib>
38 #include <string>
39 
40 using namespace std;
41 
/** Should we use the n-gram code?
 *
 *  The decision is taken from the environment variable XAPIAN_CJK_NGRAM the
 *  first time this function is called, and that answer is cached for every
 *  later call.
 *
 *  @return true if XAPIAN_CJK_NGRAM was set to a non-empty value when first
 *          checked.
 */
bool
is_ngram_enabled()
{
    // Function-local static so the environment is only consulted once.
    static const bool result = [] {
        const char* value = std::getenv("XAPIAN_CJK_NGRAM");
        return value != nullptr && *value != '\0';
    }();
    return result;
}
49 
/** Test if a codepoint belongs to a script written without explicit word
 *  breaks.
 *
 *  @param p  Unicode codepoint to test.
 *
 *  @return true if @a p falls inside one of the codepoint ranges listed in
 *          the table below, false otherwise.
 */
bool
is_unbroken_script(unsigned p)
{
    // Array containing the last value in each range of codepoints which
    // are either all in scripts which are written without explicit word
    // breaks, or all not in such scripts.
    //
    // We only include scripts here which ICU has dictionaries for.  The
    // same list is currently also used to decide which languages to do
    // ngrams for, though perhaps that should use a separate list.
    //
    // Entries come in pairs: (start of range - 1, end of range).  The array
    // must stay sorted in ascending order for the binary chop below.
    static const unsigned splits[] = {
        // 0E00..0E7F; Thai, Lanna Tai, Pali
        // 0E80..0EFF; Lao
        0x0E00 - 1, 0x0EFF,
        // 1000..109F; Myanmar (Burmese)
        0x1000 - 1, 0x109F,
        // 1100..11FF; Hangul Jamo
        0x1100 - 1, 0x11FF,
        // 1780..17FF; Khmer
        0x1780 - 1, 0x17FF,
        // 19E0..19FF; Khmer Symbols
        0x19E0 - 1, 0x19FF,
        // 2E80..2EFF; CJK Radicals Supplement
        // 2F00..2FDF; Kangxi Radicals
        // 2FE0..2FFF; Ideographic Description Characters
        // 3000..303F; CJK Symbols and Punctuation
        // 3040..309F; Hiragana
        // 30A0..30FF; Katakana
        // 3100..312F; Bopomofo
        // 3130..318F; Hangul Compatibility Jamo
        // 3190..319F; Kanbun
        // 31A0..31BF; Bopomofo Extended
        // 31C0..31EF; CJK Strokes
        // 31F0..31FF; Katakana Phonetic Extensions
        // 3200..32FF; Enclosed CJK Letters and Months
        // 3300..33FF; CJK Compatibility
        // 3400..4DBF; CJK Unified Ideographs Extension A
        // 4DC0..4DFF; Yijing Hexagram Symbols
        // 4E00..9FFF; CJK Unified Ideographs
        0x2E80 - 1, 0x9FFF,
        // A700..A71F; Modifier Tone Letters
        0xA700 - 1, 0xA71F,
        // A960..A97F; Hangul Jamo Extended-A
        0xA960 - 1, 0xA97F,
        // A9E0..A9FF; Myanmar Extended-B (Burmese)
        0xA9E0 - 1, 0xA9FF,
        // AA60..AA7F; Myanmar Extended-A (Burmese)
        0xAA60 - 1, 0xAA7F,
        // AC00..D7AF; Hangul Syllables
        // D7B0..D7FF; Hangul Jamo Extended-B
        0xAC00 - 1, 0xD7FF,
        // F900..FAFF; CJK Compatibility Ideographs
        0xF900 - 1, 0xFAFF,
        // FE30..FE4F; CJK Compatibility Forms
        0xFE30 - 1, 0xFE4F,
        // FF65..FFDC; Halfwidth Katakana and Hangul
        0xFF65 - 1, 0xFFDC,
        // 1AFF0..1AFFF; Kana Extended-B
        // 1B000..1B0FF; Kana Supplement
        // 1B100..1B12F; Kana Extended-A
        // 1B130..1B16F; Small Kana Extension
        0x1AFF0 - 1, 0x1B16F,
        // 1F200..1F2FF; Enclosed Ideographic Supplement
        0x1F200 - 1, 0x1F2FF,
        // 20000..2A6DF; CJK Unified Ideographs Extension B
        0x20000 - 1, 0x2A6DF,
        // 2A700..2B73F; CJK Unified Ideographs Extension C
        // 2B740..2B81F; CJK Unified Ideographs Extension D
        // 2B820..2CEAF; CJK Unified Ideographs Extension E
        // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
        // 2EBF0..2EE5F; CJK Unified Ideographs Extension I
        0x2A700 - 1, 0x2EE5F,
        // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
        0x2F800 - 1, 0x2FA1F,
        // 30000..3134F; CJK Unified Ideographs Extension G
        // 31350..323AF; CJK Unified Ideographs Extension H
        0x30000 - 1, 0x323AF
    };
    // Binary chop to find the first entry which is >= p.  If it's an odd
    // offset then the codepoint is in a script which needs splitting; if it's
    // an even offset then it's not.  A codepoint beyond the final entry gives
    // the end iterator, whose offset is the (even) array length.
    const unsigned* it = std::lower_bound(std::begin(splits),
                                          std::end(splits), p);
    return ((it - std::begin(splits)) & 1) != 0;
}
134 
135 bool
137 {
139 }
140 
141 size_t
143 {
144  size_t char_count = 0;
145  while (it != Xapian::Utf8Iterator() && is_unbroken_wordchar(*it)) {
146  ++char_count;
147  ++it;
148  }
149  return char_count;
150 }
151 
152 void
154 {
155  if (it != Xapian::Utf8Iterator()) {
156  unsigned ch = *it;
157  if (is_unbroken_wordchar(ch)) {
158  Xapian::Unicode::append_utf8(current_token, ch);
159  ++it;
160  } else {
161  current_token.resize(0);
162  }
163  }
164 }
165 
168 {
169  if (offset == 0) {
170  if (it != Xapian::Utf8Iterator()) {
171  unsigned ch = *it;
172  if (is_unbroken_wordchar(ch)) {
173  offset = current_token.size();
174  Xapian::Unicode::append_utf8(current_token, ch);
175  ++it;
176  } else {
177  current_token.resize(0);
178  }
179  } else {
180  current_token.resize(0);
181  }
182  } else {
183  current_token.erase(0, offset);
184  offset = 0;
185  }
186  return *this;
187 }
188 
189 #ifdef USE_ICU
190 WordIterator::WordIterator(const char* ptr, size_t len)
191 {
192  UErrorCode err = U_ZERO_ERROR;
193  UText utext = UTEXT_INITIALIZER;
194  brk = icu::BreakIterator::createWordInstance(0/*unknown locale*/, err);
195  if (usual(U_SUCCESS(err))) {
196  utext_openUTF8(&utext, ptr, len, &err);
197  if (usual(U_SUCCESS(err)))
198  brk->setText(&utext, err);
199  utext_close(&utext);
200  }
201  if (rare(U_FAILURE(err)))
202  throw Xapian::InternalError(string("ICU error: ") + u_errorName(err));
203  int32_t first = brk->first();
204  p = brk->next();
205  utf8_ptr = ptr;
206  current_token.assign(utf8_ptr + first, p - first);
207 }
208 
209 WordIterator&
210 WordIterator::operator++()
211 {
212  int32_t first = p;
213  p = brk->next();
214  if (usual(p != done)) {
215  current_token.assign(utf8_ptr + first, p - first);
216  }
217  return *this;
218 }
219 #endif
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:71
NgramIterator & operator++()
void init()
Call to set current_token at the start.
InternalError indicates a runtime problem of some sort.
Definition: error.h:749
virtual bool next()=0
Advance to the next entry in the positionlist.
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
PositionList * p
Hierarchy of classes which Xapian can throw as exceptions.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:344
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Definition: unicode.h:355
Various assertion macros.
Unicode and UTF-8 related classes and functions.
bool is_unbroken_script(unsigned p)
Definition: word-breaker.cc:51
bool is_unbroken_wordchar(unsigned p)
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:43
Handle text without explicit word breaks.