xapian-core  1.4.22
cjk-tokenizer.cc
Go to the documentation of this file.
1 
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7  * Copyright (c) 2011,2019 Olly Betts
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to deal
11  * in the Software without restriction, including without limitation the
12  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13  * sell copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include <config.h>
29 
30 #include "cjk-tokenizer.h"
31 
32 #include "omassert.h"
33 #include "xapian/unicode.h"
34 
35 #include <cstdlib>
36 #include <string>
37 
38 using namespace std;
39 
// NOTE(review): listing line 41 (the declarator) is absent from this listing;
// presumably this is the body of is_cjk_enabled() - confirm against the header.
//
// Returns true when the XAPIAN_CJK_NGRAM environment variable is set to a
// non-empty value, false when it is unset or empty.
40 bool
42 {
43  const char * p;
// The answer is stored in a function-local static, so getenv() is evaluated
// only when that static is first initialised; later changes to the
// environment are not picked up.
44  static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p);
45  return result;
46 }
47 
48 // 2E80..2EFF; CJK Radicals Supplement
49 // 3000..303F; CJK Symbols and Punctuation
50 // 3040..309F; Hiragana
51 // 30A0..30FF; Katakana
52 // 3100..312F; Bopomofo
53 // 3130..318F; Hangul Compatibility Jamo
54 // 3190..319F; Kanbun
55 // 31A0..31BF; Bopomofo Extended
56 // 31C0..31EF; CJK Strokes
57 // 31F0..31FF; Katakana Phonetic Extensions
58 // 3200..32FF; Enclosed CJK Letters and Months
59 // 3300..33FF; CJK Compatibility
60 // 3400..4DBF; CJK Unified Ideographs Extension A
61 // 4DC0..4DFF; Yijing Hexagram Symbols
62 // 4E00..9FFF; CJK Unified Ideographs
63 // A700..A71F; Modifier Tone Letters
64 // AC00..D7AF; Hangul Syllables
65 // F900..FAFF; CJK Compatibility Ideographs
66 // FE30..FE4F; CJK Compatibility Forms
67 // FF00..FFEF; Halfwidth and Fullwidth Forms
68 // 20000..2A6DF; CJK Unified Ideographs Extension B
69 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
// NOTE(review): listing line 71 (the declarator) is absent from this listing;
// presumably this is the body of codepoint_is_cjk(), with `p` being the
// code point under test - confirm against the header.
//
// Returns true if p lies inside any of the inclusive ranges tested below,
// false otherwise.
70 bool
72 {
// Fast rejection: nothing below the lowest range start (0x2E80) can match.
73  if (p < 0x2E80) return false;
74  return ((p >= 0x2E80 && p <= 0x2EFF) ||
// A single test spanning 0x3000..0x9FFF covers a run of adjacent ranges.
75  (p >= 0x3000 && p <= 0x9FFF) ||
76  (p >= 0xA700 && p <= 0xA71F) ||
77  (p >= 0xAC00 && p <= 0xD7AF) ||
78  (p >= 0xF900 && p <= 0xFAFF) ||
79  (p >= 0xFE30 && p <= 0xFE4F) ||
80  (p >= 0xFF00 && p <= 0xFFEF) ||
81  (p >= 0x20000 && p <= 0x2A6DF) ||
82  (p >= 0x2F800 && p <= 0x2FA1F));
83 }
84 
// NOTE(review): listing lines 86 (the declarator) and 90 (the last part of
// the loop condition) are absent from this listing; presumably this is
// get_cjk(), taking the iterator `it` by reference - confirm against the
// original file before relying on the description below.
//
// Advances `it` for as long as it has not reached the end (a
// default-constructed Utf8Iterator), codepoint_is_cjk() accepts the current
// code point, and whatever further condition the missing line 90 imposes
// also holds.
85 void
87 {
88  while (it != Xapian::Utf8Iterator() &&
89  codepoint_is_cjk(*it) &&
91  ++it;
92  }
93 }
94 
// NOTE(review): listing lines 96 (the declarator and opening brace) and 99
// (the condition of the inner if) are absent from this listing; presumably
// this is init() - confirm against the original file.
//
// If any input remains, reads the code point at `it`.  Depending on the
// condition on the missing line 99, either appends that code point's UTF-8
// form to current_token and advances `it`, or empties current_token.
// Nothing is done when `it` is already at the end.
95 void
97  if (it != Xapian::Utf8Iterator()) {
98  unsigned ch = *it;
100  Xapian::Unicode::append_utf8(current_token, ch);
101  ++it;
102  } else {
103  current_token.resize(0);
104  }
105  }
106 }
107 
// NOTE(review): listing lines 108-109 (return type and declarator) and 114
// (the condition of the innermost if) are absent from this listing;
// presumably this is the pre-increment operator - confirm against the
// original file.
//
// Moves to the next token and returns *this.  `offset` selects between two
// steps:
//  - offset == 0: if input remains, read the next code point; depending on
//    the condition on the missing line 114, either remember the current
//    length of current_token in `offset`, append the code point's UTF-8 form
//    and advance `it`, or empty current_token.  If no input remains,
//    current_token is emptied.
//  - offset != 0: drop the first `offset` characters of current_token
//    (leaving what was appended in the previous step) and reset `offset`
//    to 0.
110 {
111  if (offset == 0) {
112  if (it != Xapian::Utf8Iterator()) {
113  unsigned ch = *it;
115  offset = current_token.size();
116  Xapian::Unicode::append_utf8(current_token, ch);
117  ++it;
118  } else {
119  current_token.resize(0);
120  }
121  } else {
122  current_token.resize(0);
123  }
124  } else {
125  current_token.erase(0, offset);
126  offset = 0;
127  }
128  return *this;
129 }
Unicode and UTF-8 related classes and functions.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:332
void init()
Call to set current_token at the start.
bool is_cjk_enabled()
Should we use the CJK n-gram code?
STL namespace.
bool codepoint_is_cjk(unsigned codepoint)
Tokenise CJK text as n-grams.
Iterator returning unigrams and bigrams.
Definition: cjk-tokenizer.h:56
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38
CJKTokenIterator & operator++()
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
Definition: unicode.h:343
void get_cjk(Xapian::Utf8Iterator &it)
Various assertion macros.