/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
 * Copyright (c) 2011 Olly Betts
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
00026 */ 00027 00028 #ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H 00029 #define XAPIAN_INCLUDED_CJK_TOKENIZER_H 00030 00031 #include "xapian/unicode.h" 00032 00033 #include <string> 00034 00035 namespace CJK { 00036 00043 bool is_cjk_enabled(); 00044 00045 bool codepoint_is_cjk(unsigned codepoint); 00046 00047 std::string get_cjk(Xapian::Utf8Iterator &it); 00048 00049 } 00050 00051 class CJKTokenIterator { 00052 Xapian::Utf8Iterator it; 00053 00054 mutable Xapian::Utf8Iterator p; 00055 00056 mutable unsigned len; 00057 00058 mutable std::string current_token; 00059 00060 public: 00061 CJKTokenIterator(const std::string & s) 00062 : it(s) { } 00063 00064 CJKTokenIterator(const Xapian::Utf8Iterator & it_) 00065 : it(it_) { } 00066 00067 CJKTokenIterator() 00068 : it() { } 00069 00070 const std::string & operator*() const; 00071 00072 CJKTokenIterator & operator++(); 00073 00075 unsigned get_length() const { return len; } 00076 00077 friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &); 00078 }; 00079 00080 inline bool 00081 operator==(const CJKTokenIterator & a, const CJKTokenIterator & b) 00082 { 00083 // We only really care about comparisons where one or other is an end 00084 // iterator. 00085 return a.it == b.it; 00086 } 00087 00088 inline bool 00089 operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b) 00090 { 00091 return !(a == b); 00092 } 00093 00094 #endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H