00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "termgenerator_internal.h"
00024
00025 #include <xapian/document.h>
00026 #include <xapian/queryparser.h>
00027 #include <xapian/unicode.h>
00028
00029 #include "stringutils.h"
00030
00031 #include <string>
00032
00033 using namespace std;
00034
00035 namespace Xapian {
00036
00037
00038
// Upper bound on term.size() for a term to be indexed: index_text() skips
// any term whose length exceeds this value.
static const unsigned MAX_PROB_TERM_LENGTH = 64;
00040
00041
00042
00043
00044
00045 inline bool
00046 U_isupper(unsigned ch) {
00047 return (ch < 128 && C_isupper((unsigned char)ch));
00048 }
00049
00050 inline unsigned check_wordchar(unsigned ch) {
00051 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
00052 return 0;
00053 }
00054
00055 inline bool
00056 should_stem(const std::string & term)
00057 {
00058 const unsigned int SHOULD_STEM_MASK =
00059 (1 << Unicode::LOWERCASE_LETTER) |
00060 (1 << Unicode::TITLECASE_LETTER) |
00061 (1 << Unicode::MODIFIER_LETTER) |
00062 (1 << Unicode::OTHER_LETTER);
00063 Utf8Iterator u(term);
00064 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
00065 }
00066
/** Check whether a code point may join two word characters into one term.
 *
 *  @return The character to append to the term, or 0 if @a ch is not an
 *          accepted infix character.
 */
inline unsigned check_infix(unsigned ch) {
    switch (ch) {
        case '\'':
        case '&':
        case 0xb7:   // U+00B7 MIDDLE DOT
        case 0x5f4:  // U+05F4 HEBREW PUNCTUATION GERSHAYIM
        case 0x2027: // U+2027 HYPHENATION POINT
            // Accepted and kept as-is.
            return ch;
        case 0x2019: // U+2019 RIGHT SINGLE QUOTATION MARK
        case 0x201b: // U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
            // Accepted, but folded to an ASCII apostrophe.
            return '\'';
        default:
            return 0;
    }
}
00080
/** Check whether a code point is accepted as an infix between two digits.
 *
 *  @return @a ch itself if it is one of the accepted separators,
 *          otherwise 0.
 */
inline unsigned check_infix_digit(unsigned ch) {
    static const unsigned separators[] = {
        ',',
        '.',
        ';',
        0x037e, // GREEK QUESTION MARK
        0x0589, // ARMENIAN FULL STOP
        0x060D, // ARABIC DATE SEPARATOR
        0x07F8, // NKO COMMA
        0x2044, // FRACTION SLASH
        0xFE10, // PRESENTATION FORM FOR VERTICAL COMMA
        0xFE13, // PRESENTATION FORM FOR VERTICAL COLON
        0xFE14  // PRESENTATION FORM FOR VERTICAL SEMICOLON
    };
    const unsigned count = sizeof(separators) / sizeof(separators[0]);
    for (unsigned i = 0; i != count; ++i) {
        if (separators[i] == ch) return ch;
    }
    return 0;
}
00099
00100 inline bool
00101 is_digit(unsigned ch) {
00102 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
00103 }
00104
/** Check whether a code point may trail a term as a suffix character.
 *
 *  @return @a ch for '+' and '#', otherwise 0.
 */
inline unsigned check_suffix(unsigned ch) {
    return (ch == '+' || ch == '#') ? ch : 0;
}
00110
00111
// Stopword handling modes compared against `stop_mode` in index_text().
// Declared as enumerators rather than object-like macros so the names obey
// namespace scoping instead of being substituted by the preprocessor
// everywhere after this point.  The numeric values are unchanged.
enum {
    // No stopword handling.
    STOPWORDS_NONE = 0,
    // Terms matched by the stopper are not indexed at all.
    STOPWORDS_IGNORE = 1,
    // Terms matched by the stopper are indexed, but no stemmed form is added.
    STOPWORDS_INDEX_UNSTEMMED_ONLY = 2
};
00115
00116 void
00117 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
00118 const string & prefix, bool with_positions)
00119 {
00120 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
00121
00122 if (!stopper) stop_mode = STOPWORDS_NONE;
00123
00124 while (true) {
00125
00126 unsigned ch;
00127 while (true) {
00128 if (itor == Utf8Iterator()) return;
00129 ch = check_wordchar(*itor);
00130 if (ch) break;
00131 ++itor;
00132 }
00133
00134 string term;
00135
00136
00137 if (U_isupper(*itor)) {
00138 const Utf8Iterator end;
00139 Utf8Iterator p = itor;
00140 do {
00141 Unicode::append_utf8(term, Unicode::tolower(*p++));
00142 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
00143
00144
00145 if (term.size() > 1) {
00146
00147
00148 if (p == end || !Unicode::is_wordchar(*p)) {
00149 itor = p;
00150 goto endofterm;
00151 }
00152 }
00153 term.resize(0);
00154 }
00155
00156 while (true) {
00157 unsigned prevch;
00158 do {
00159 Unicode::append_utf8(term, ch);
00160 prevch = ch;
00161 if (++itor == Utf8Iterator()) goto endofterm;
00162 ch = check_wordchar(*itor);
00163 } while (ch);
00164
00165 Utf8Iterator next(itor);
00166 ++next;
00167 if (next == Utf8Iterator()) break;
00168 unsigned nextch = check_wordchar(*next);
00169 if (!nextch) break;
00170 unsigned infix_ch = *itor;
00171 if (is_digit(prevch) && is_digit(*next)) {
00172 infix_ch = check_infix_digit(infix_ch);
00173 } else {
00174
00175 infix_ch = check_infix(infix_ch);
00176 }
00177 if (!infix_ch) break;
00178 Unicode::append_utf8(term, infix_ch);
00179 ch = nextch;
00180 itor = next;
00181 }
00182
00183 {
00184 size_t len = term.size();
00185 unsigned count = 0;
00186 while ((ch = check_suffix(*itor))) {
00187 if (++count > 3) {
00188 term.resize(len);
00189 break;
00190 }
00191 Unicode::append_utf8(term, ch);
00192 if (++itor == Utf8Iterator()) goto endofterm;
00193 }
00194
00195 if (Unicode::is_wordchar(*itor))
00196 term.resize(len);
00197 }
00198
00199 endofterm:
00200 if (term.size() > MAX_PROB_TERM_LENGTH) continue;
00201
00202 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
00203
00204 if (with_positions) {
00205 doc.add_posting(prefix + term, ++termpos, weight);
00206 } else {
00207 doc.add_term(prefix + term, weight);
00208 }
00209 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
00210
00211 if (!stemmer.internal.get()) continue;
00212
00213 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
00214 continue;
00215
00216
00217
00218 if (!should_stem(term)) continue;
00219
00220
00221 string stem("Z");
00222 stem += prefix;
00223 stem += stemmer(term);
00224 doc.add_term(stem, weight);
00225 }
00226 }
00227
00228 }