00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "termgenerator_internal.h"
00024
00025 #include <xapian/document.h>
00026 #include <xapian/queryparser.h>
00027 #include <xapian/unicode.h>
00028
00029 #include "stringutils.h"
00030
00031 #include <limits>
00032 #include <string>
00033
00034 #include "cjk-tokenizer.h"
00035
00036 using namespace std;
00037
00038 namespace Xapian {
00039
00040
00041
/// Longest term, measured in bytes of UTF-8, which index_text() will add to
/// a document.  Longer candidate terms are skipped.
static const unsigned MAX_PROB_TERM_LENGTH = 64;
00043
00044
00045
00046
00047
00048 inline bool
00049 U_isupper(unsigned ch) {
00050 return (ch < 128 && C_isupper((unsigned char)ch));
00051 }
00052
00053 inline unsigned check_wordchar(unsigned ch) {
00054 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
00055 return 0;
00056 }
00057
00058 inline bool
00059 should_stem(const std::string & term)
00060 {
00061 const unsigned int SHOULD_STEM_MASK =
00062 (1 << Unicode::LOWERCASE_LETTER) |
00063 (1 << Unicode::TITLECASE_LETTER) |
00064 (1 << Unicode::MODIFIER_LETTER) |
00065 (1 << Unicode::OTHER_LETTER);
00066 Utf8Iterator u(term);
00067 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
00068 }
00069
/// Value returned by check_infix() and check_infix_digit() for a character
/// which should be dropped from the term without ending it.
const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Classify a character found between two word characters.
 *
 *  @param ch	Unicode codepoint found inside a potential term.
 *  @return	The codepoint to append to the term if @a ch is an accepted
 *		infix character; UNICODE_IGNORE if @a ch should be skipped
 *		without ending the term; 0 if @a ch ends the term.
 */
inline unsigned check_infix(unsigned ch)
{
    switch (ch) {
	case '\'':
	case '&':
	case 0xb7:
	case 0x5f4:
	case 0x2027:
	    // Accepted as-is.
	    return ch;
	case 0x2019:
	case 0x201b:
	    // Typographic single quotes are normalised to an ASCII
	    // apostrophe.
	    return '\'';
	case 0x200b:
	case 0x200c:
	case 0x200d:
	case 0x2060:
	case 0xfeff:
	    // Zero-width characters are dropped from the term.
	    return UNICODE_IGNORE;
	default:
	    return 0;
    }
}
00090
00091 inline unsigned check_infix_digit(unsigned ch) {
00092
00093 switch (ch) {
00094 case ',':
00095 case '.':
00096 case ';':
00097 case 0x037e:
00098 case 0x0589:
00099 case 0x060D:
00100 case 0x07F8:
00101 case 0x2044:
00102 case 0xFE10:
00103 case 0xFE13:
00104 case 0xFE14:
00105 return ch;
00106 }
00107 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
00108 return UNICODE_IGNORE;
00109 return 0;
00110 }
00111
00112 inline bool
00113 is_digit(unsigned ch) {
00114 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
00115 }
00116
/** Classify a character which directly follows a word.
 *
 *  @param ch	Unicode codepoint following the last word character.
 *  @return	@a ch if it is '+' or '#' (both may be kept as a suffix of
 *		the term), otherwise 0.
 */
inline unsigned check_suffix(unsigned ch)
{
    switch (ch) {
	case '+':
	case '#':
	    return ch;
	default:
	    return 0;
    }
}
00122
00123
00124 #define STOPWORDS_NONE 0
00125 #define STOPWORDS_IGNORE 1
00126 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
00127
00128 void
00129 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
00130 const string & prefix, bool with_positions)
00131 {
00132 bool cjk_ngram = CJK::is_cjk_enabled();
00133
00134 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
00135
00136 if (!stopper) stop_mode = STOPWORDS_NONE;
00137
00138 while (true) {
00139
00140 unsigned ch;
00141 while (true) {
00142 if (itor == Utf8Iterator()) return;
00143 ch = check_wordchar(*itor);
00144 if (ch) break;
00145 ++itor;
00146 }
00147
00148 string term;
00149
00150
00151 if (U_isupper(*itor)) {
00152 const Utf8Iterator end;
00153 Utf8Iterator p = itor;
00154 do {
00155 Unicode::append_utf8(term, Unicode::tolower(*p++));
00156 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
00157
00158
00159 if (term.size() > 1) {
00160
00161
00162 if (p == end || !Unicode::is_wordchar(*p)) {
00163 itor = p;
00164 goto endofterm;
00165 }
00166 }
00167 term.resize(0);
00168 }
00169
00170 while (true) {
00171 if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
00172 const string & cjk = CJK::get_cjk(itor);
00173 for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
00174 const string & cjk_token = *tk;
00175 if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
00176
00177 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token))
00178 continue;
00179
00180 if (with_positions && tk.get_length() == 1) {
00181 doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc);
00182 } else {
00183 doc.add_term(prefix + cjk_token, wdf_inc);
00184 }
00185 if ((flags & FLAG_SPELLING) && prefix.empty())
00186 db.add_spelling(cjk_token);
00187
00188 if (!stemmer.internal.get()) continue;
00189
00190 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY &&
00191 (*stopper)(cjk_token))
00192 continue;
00193
00194
00195
00196 if (!should_stem(cjk_token)) continue;
00197
00198
00199 string stem("Z");
00200 stem += prefix;
00201 stem += stemmer(cjk_token);
00202 doc.add_term(stem, wdf_inc);
00203 }
00204 while (true) {
00205 if (itor == Utf8Iterator()) return;
00206 ch = check_wordchar(*itor);
00207 if (ch) break;
00208 ++itor;
00209 }
00210 }
00211 unsigned prevch;
00212 do {
00213 Unicode::append_utf8(term, ch);
00214 prevch = ch;
00215 if (++itor == Utf8Iterator() ||
00216 (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
00217 goto endofterm;
00218 ch = check_wordchar(*itor);
00219 } while (ch);
00220
00221 Utf8Iterator next(itor);
00222 ++next;
00223 if (next == Utf8Iterator()) break;
00224 unsigned nextch = check_wordchar(*next);
00225 if (!nextch) break;
00226 unsigned infix_ch = *itor;
00227 if (is_digit(prevch) && is_digit(*next)) {
00228 infix_ch = check_infix_digit(infix_ch);
00229 } else {
00230
00231 infix_ch = check_infix(infix_ch);
00232 }
00233 if (!infix_ch) break;
00234 if (infix_ch != UNICODE_IGNORE)
00235 Unicode::append_utf8(term, infix_ch);
00236 ch = nextch;
00237 itor = next;
00238 }
00239
00240 {
00241 size_t len = term.size();
00242 unsigned count = 0;
00243 while ((ch = check_suffix(*itor))) {
00244 if (++count > 3) {
00245 term.resize(len);
00246 break;
00247 }
00248 Unicode::append_utf8(term, ch);
00249 if (++itor == Utf8Iterator()) goto endofterm;
00250 }
00251
00252 if (Unicode::is_wordchar(*itor))
00253 term.resize(len);
00254 }
00255
00256 endofterm:
00257 if (term.size() > MAX_PROB_TERM_LENGTH) continue;
00258
00259 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
00260
00261 if (with_positions) {
00262 doc.add_posting(prefix + term, ++termpos, wdf_inc);
00263 } else {
00264 doc.add_term(prefix + term, wdf_inc);
00265 }
00266 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
00267
00268 if (!stemmer.internal.get()) continue;
00269
00270 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
00271 continue;
00272
00273
00274
00275 if (!should_stem(term)) continue;
00276
00277
00278 string stem("Z");
00279 stem += prefix;
00280 stem += stemmer(term);
00281 doc.add_term(stem, wdf_inc);
00282 }
00283 }
00284
00285 }