00001 00004 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) 00005 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) 00006 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) 00007 * Copyright (c) 2011 Olly Betts 00008 * 00009 * Permission is hereby granted, free of charge, to any person obtaining a copy 00010 * of this software and associated documentation files (the "Software"), to deal 00011 * deal in the Software without restriction, including without limitation the 00012 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 00013 * sell copies of the Software, and to permit persons to whom the Software is 00014 * furnished to do so, subject to the following conditions: 00015 * 00016 * The above copyright notice and this permission notice shall be included in 00017 * all copies or substantial portions of the Software. 00018 * 00019 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00020 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00021 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00022 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00023 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 00024 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 00025 * IN THE SOFTWARE. 00026 */ 00027 00028 #include <config.h> 00029 00030 #include "cjk-tokenizer.h" 00031 00032 #include "omassert.h" 00033 #include "xapian/unicode.h" 00034 00035 #include <cstdlib> 00036 #include <string> 00037 00038 using namespace std; 00039 00040 static unsigned NGRAM_SIZE = 2; 00041 00042 bool 00043 CJK::is_cjk_enabled() 00044 { 00045 const char * p; 00046 static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p); 00047 return result; 00048 } 00049 00050 // 2E80..2EFF; CJK Radicals Supplement 00051 // 3000..303F; CJK Symbols and Punctuation 00052 // 3040..309F; Hiragana 00053 // 30A0..30FF; Katakana 00054 // 3100..312F; Bopomofo 00055 // 3130..318F; Hangul Compatibility Jamo 00056 // 3190..319F; Kanbun 00057 // 31A0..31BF; Bopomofo Extended 00058 // 31C0..31EF; CJK Strokes 00059 // 31F0..31FF; Katakana Phonetic Extensions 00060 // 3200..32FF; Enclosed CJK Letters and Months 00061 // 3300..33FF; CJK Compatibility 00062 // 3400..4DBF; CJK Unified Ideographs Extension A 00063 // 4DC0..4DFF; Yijing Hexagram Symbols 00064 // 4E00..9FFF; CJK Unified Ideographs 00065 // A700..A71F; Modifier Tone Letters 00066 // AC00..D7AF; Hangul Syllables 00067 // F900..FAFF; CJK Compatibility Ideographs 00068 // FE30..FE4F; CJK Compatibility Forms 00069 // FF00..FFEF; Halfwidth and Fullwidth Forms 00070 // 20000..2A6DF; CJK Unified Ideographs Extension B 00071 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 00072 bool 00073 CJK::codepoint_is_cjk(unsigned p) 00074 { 00075 if (p < 0x2E80) return false; 00076 return ((p >= 0x2E80 && p <= 0x2EFF) || 00077 (p >= 0x3000 && p <= 0x9FFF) || 00078 (p >= 0xA700 && p <= 0xA71F) || 00079 (p >= 0xAC00 && p <= 0xD7AF) || 00080 (p >= 0xF900 && p <= 0xFAFF) || 00081 (p >= 0xFE30 && p <= 0xFE4F) || 00082 (p >= 0xFF00 && p <= 0xFFEF) || 00083 (p >= 0x20000 && p <= 0x2A6DF) || 00084 (p >= 0x2F800 && p <= 0x2FA1F)); 00085 } 00086 00087 string 00088 CJK::get_cjk(Xapian::Utf8Iterator &it) 00089 { 00090 string str; 00091 while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) { 00092 Xapian::Unicode::append_utf8(str, *it); 00093 ++it; 00094 } 00095 return str; 00096 } 00097 00098 const string & 00099 CJKTokenIterator::operator*() const 00100 { 00101 if (current_token.empty()) { 00102 Assert(it != Xapian::Utf8Iterator()); 00103 p = it; 00104 Xapian::Unicode::append_utf8(current_token, *p); 00105 ++p; 00106 len = 1; 00107 } 00108 return current_token; 00109 } 00110 00111 CJKTokenIterator & 00112 CJKTokenIterator::operator++() 00113 { 00114 if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) { 00115 Xapian::Unicode::append_utf8(current_token, *p); 00116 ++p; 00117 ++len; 00118 } else { 00119 Assert(it != Xapian::Utf8Iterator()); 00120 ++it; 00121 current_token.resize(0); 00122 } 00123 return *this; 00124 }