xapian-core  1.4.26
word-breaker.cc
Go to the documentation of this file.
1 
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7  * Copyright (c) 2011,2019,2023 Olly Betts
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to deal
11  * in the Software without restriction, including without limitation the
12  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13  * sell copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include <config.h>
29 
30 #include "word-breaker.h"
31 
32 #include "omassert.h"
33 #include "xapian/unicode.h"
34 
35 #include <cstdlib>
36 #include <string>
37 
38 using namespace std;
39 
/** Should we use the n-gram code?
 *
 *  The answer comes from the XAPIAN_CJK_NGRAM environment variable: it is
 *  true when the variable is set to a non-empty value.
 *
 *  The result is held in a function-local static, so the environment is only
 *  consulted when that static is initialised (on the first call); later
 *  changes to the variable do not alter the returned value.
 *
 *  @return true if n-grams are enabled, false otherwise.
 */
bool
is_ngram_enabled()
{
    const char * p;
    // p is only needed while the static below is being initialised: the
    // value is non-NULL when the variable is set, and *p is non-zero when
    // it is set to something other than the empty string.
    static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p);
    return result;
}
47 
48 // 2E80..2EFF; CJK Radicals Supplement
49 // 3000..303F; CJK Symbols and Punctuation
50 // 3040..309F; Hiragana
51 // 30A0..30FF; Katakana
52 // 3100..312F; Bopomofo
53 // 3130..318F; Hangul Compatibility Jamo
54 // 3190..319F; Kanbun
55 // 31A0..31BF; Bopomofo Extended
56 // 31C0..31EF; CJK Strokes
57 // 31F0..31FF; Katakana Phonetic Extensions
58 // 3200..32FF; Enclosed CJK Letters and Months
59 // 3300..33FF; CJK Compatibility
60 // 3400..4DBF; CJK Unified Ideographs Extension A
61 // 4DC0..4DFF; Yijing Hexagram Symbols
62 // 4E00..9FFF; CJK Unified Ideographs
63 // A700..A71F; Modifier Tone Letters
64 // AC00..D7AF; Hangul Syllables
65 // F900..FAFF; CJK Compatibility Ideographs
66 // FE30..FE4F; CJK Compatibility Forms
67 // FF00..FFEF; Halfwidth and Fullwidth Forms
68 // 20000..2A6DF; CJK Unified Ideographs Extension B
69 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
/** Test whether a Unicode code point falls in one of the ranges this file
 *  treats as script written without explicit word breaks.
 *
 *  @param p  The code point to classify.
 *  @return   true if p lies inside any of the inclusive ranges in the table
 *            below, false otherwise.
 */
bool
is_unbroken_script(unsigned p)
{
    // Inclusive { first, last } pairs, kept in ascending order so the scan
    // can stop as soon as p is below the start of the next range.
    static const unsigned ranges[][2] = {
        { 0x2E80, 0x2EFF },
        { 0x3000, 0x9FFF },
        { 0xA700, 0xA71F },
        { 0xAC00, 0xD7AF },
        { 0xF900, 0xFAFF },
        { 0xFE30, 0xFE4F },
        { 0xFF00, 0xFFEF },
        { 0x20000, 0x2A6DF },
        { 0x2F800, 0x2FA1F }
    };
    const unsigned n_ranges = sizeof(ranges) / sizeof(ranges[0]);

    for (unsigned i = 0; i != n_ranges; ++i) {
        const unsigned first = ranges[i][0];
        const unsigned last = ranges[i][1];
        if (p < first) {
            // Below this range and (by ordering) every later one.
            return false;
        }
        if (p <= last) {
            return true;
        }
    }
    // Above the final range.
    return false;
}
84 
// get_unbroken(Xapian::Utf8Iterator &it), per this page's symbol index.
//
// Advances `it` for as long as the loop condition holds.  The visible parts
// of that condition require that `it` differs from a default-constructed
// Xapian::Utf8Iterator (presumably the end-of-input marker - confirm) and
// that is_unbroken_script() accepts the current character.
//
// NOTE(review): this listing skips source line 86 (the declarator) and
// source line 90 (the final operand of the `while` condition, which also
// opens the loop body).  The complete condition cannot be read from here;
// restore both lines from the original file before relying on this code.
85 void
87 {
88  while (it != Xapian::Utf8Iterator() &&
89  is_unbroken_script(*it) &&
91  ++it;
92  }
93 }
94 
// init(), described in this page's symbol index as "Call to set
// current_token at the start".
//
// If `it` differs from a default-constructed Xapian::Utf8Iterator, the
// current character is read into `ch`.  The visible branches then either
// append the UTF-8 encoding of `ch` to current_token and advance `it`, or
// empty current_token.
//
// NOTE(review): this listing skips source line 96 (the declarator) and
// source line 100 (presumably the `if` that chooses between the two
// branches - confirm against the original file).
95 void
97 {
98  if (it != Xapian::Utf8Iterator()) {
99  unsigned ch = *it;
101  Xapian::Unicode::append_utf8(current_token, ch);
102  ++it;
103  } else {
104  current_token.resize(0);
105  }
106  }
107 }
108 
// NgramIterator & operator++(), per this page's symbol index (the class is
// described there as "Iterator returning unigrams and bigrams").
//
// Two cases, selected by `offset`:
//  - offset == 0: if `it` differs from a default-constructed
//    Xapian::Utf8Iterator, read the current character; the visible branches
//    either record the current size of current_token in `offset`, append the
//    character's UTF-8 encoding and advance `it`, or empty current_token.
//    If `it` equals a default-constructed iterator, current_token is emptied.
//  - offset != 0: erase the first `offset` bytes of current_token and reset
//    `offset` to 0.
// Returns *this.
//
// NOTE(review): this listing skips source lines 109-110 (return type and
// declarator) and source line 115 (presumably the `if` guarding the append -
// confirm against the original file).
111 {
112  if (offset == 0) {
113  if (it != Xapian::Utf8Iterator()) {
114  unsigned ch = *it;
// Remember where the newly appended character starts within current_token.
116  offset = current_token.size();
117  Xapian::Unicode::append_utf8(current_token, ch);
118  ++it;
119  } else {
120  current_token.resize(0);
121  }
122  } else {
123  current_token.resize(0);
124  }
125  } else {
// Drop the leading `offset` bytes, keeping only what was appended after them.
126  current_token.erase(0, offset);
127  offset = 0;
128  }
129  return *this;
130 }
Unicode and UTF-8 related classes and functions.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:332
bool is_unbroken_script(unsigned p)
Definition: word-breaker.cc:71
STL namespace.
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:52
NgramIterator & operator++()
Handle text without explicit word breaks.
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
Definition: unicode.h:343
void get_unbroken(Xapian::Utf8Iterator &it)
Definition: word-breaker.cc:86
void init()
Call to set current_token at the start.
Definition: word-breaker.cc:96
Various assertion macros.
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:41