00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055 #include <config.h>
00056
00057 #include "steminternal.h"
00058
00059 #include <xapian/error.h>
00060
00061 #include "omassert.h"
00062
00063 #include <cstdlib>
00064 #include <cstring>
00065
00066 #include <string>
00067
00068 using namespace std;
00069
00070 #define CREATE_SIZE 16
00071
00072 extern symbol * create_s() {
00073 void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
00074 if (mem == NULL) throw std::bad_alloc();
00075 symbol * p = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00076 SET_CAPACITY(p, CREATE_SIZE);
00077 SET_SIZE(p, CREATE_SIZE);
00078 return p;
00079 }
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089 extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
00090 if (n >= 0) {
00091 for (; n > 0; n--) {
00092 if (c >= l) return -1;
00093 if (p[c++] >= 0xC0) {
00094 while (c < l) {
00095
00096 if (p[c] >> 6 != 2) break;
00097 c++;
00098 }
00099 }
00100 }
00101 } else {
00102 for (; n < 0; n++) {
00103 if (c <= lb) return -1;
00104 if (p[--c] >= 0x80) {
00105 while (c > lb) {
00106 if (p[c] >= 0xC0) break;
00107 c--;
00108 }
00109 }
00110 }
00111 }
00112 return c;
00113 }
00114
00115
00116
00117
00118
00119 static symbol * increase_size(symbol * p, int n) {
00120 int new_size = n + 20;
00121 void * mem = realloc(reinterpret_cast<char *>(p) - HEAD,
00122 HEAD + (new_size + 1) * sizeof(symbol));
00123 if (mem == NULL) {
00124 throw std::bad_alloc();
00125 }
00126 symbol * q = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00127 SET_CAPACITY(q, new_size);
00128 return q;
00129 }
00130
00131 namespace Xapian {
00132
00133 StemImplementation::~StemImplementation() { }
00134
00135 SnowballStemImplementation::~SnowballStemImplementation()
00136 {
00137 lose_s(p);
00138 }
00139
00140 string
00141 SnowballStemImplementation::operator()(const string & word)
00142 {
00143 const symbol * s = reinterpret_cast<const symbol *>(word.data());
00144 replace_s(0, l, word.size(), s);
00145 c = 0;
00146 if (stem() < 0) {
00147
00148 throw Xapian::InternalError("stemming exception!");
00149 }
00150 return string(reinterpret_cast<const char *>(p), l);
00151 }
00152
00153
00154
00155 int SnowballStemImplementation::get_utf8(int * slot) {
00156 int b0, b1;
00157 int tmp = c;
00158 if (tmp >= l) return 0;
00159 b0 = p[tmp++];
00160 if (b0 < 0xC0 || tmp == l) {
00161 * slot = b0; return 1;
00162 }
00163 b1 = p[tmp++];
00164 if (b0 < 0xE0 || tmp == l) {
00165 * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
00166 }
00167 * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
00168 }
00169
00170 int SnowballStemImplementation::get_b_utf8(int * slot) {
00171 int b0, b1;
00172 int tmp = c;
00173 if (tmp <= lb) return 0;
00174 b0 = p[--tmp];
00175 if (b0 < 0x80 || tmp == lb) {
00176 * slot = b0; return 1;
00177 }
00178 b1 = p[--tmp];
00179 if (b1 >= 0xC0 || tmp == lb) {
00180 * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
00181 }
00182 * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
00183 }
00184
00185 int
00186 SnowballStemImplementation::in_grouping_U(const unsigned char * s, int min,
00187 int max, int repeat)
00188 {
00189 do {
00190 int ch;
00191 int w = get_utf8(&ch);
00192 if (!w) return -1;
00193 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00194 return w;
00195 c += w;
00196 } while (repeat);
00197 return 0;
00198 }
00199
00200 int
00201 SnowballStemImplementation::in_grouping_b_U(const unsigned char * s, int min,
00202 int max, int repeat)
00203 {
00204 do {
00205 int ch;
00206 int w = get_b_utf8(&ch);
00207 if (!w) return -1;
00208 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00209 return w;
00210 c -= w;
00211 } while (repeat);
00212 return 0;
00213 }
00214
00215 int
00216 SnowballStemImplementation::out_grouping_U(const unsigned char * s, int min,
00217 int max, int repeat)
00218 {
00219 do {
00220 int ch;
00221 int w = get_utf8(&ch);
00222 if (!w) return -1;
00223 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00224 return w;
00225 c += w;
00226 } while (repeat);
00227 return 0;
00228 }
00229
00230 int
00231 SnowballStemImplementation::out_grouping_b_U(const unsigned char * s, int min,
00232 int max, int repeat)
00233 {
00234 do {
00235 int ch;
00236 int w = get_b_utf8(&ch);
00237 if (!w) return -1;
00238 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00239 return w;
00240 c -= w;
00241 } while (repeat);
00242 return 0;
00243 }
00244
00245 int SnowballStemImplementation::eq_s(int s_size, const symbol * s) {
00246 if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
00247 return 0;
00248 c += s_size;
00249 return 1;
00250 }
00251
00252 int SnowballStemImplementation::eq_s_b(int s_size, const symbol * s) {
00253 if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
00254 return 0;
00255 c -= s_size;
00256 return 1;
00257 }
00258
00259 int
00260 SnowballStemImplementation::find_among(const symbol * pool,
00261 const struct among * v, int v_size,
00262 const unsigned char * fnum,
00263 const among_function * f)
00264 {
00265 int i = 0;
00266 int j = v_size;
00267
00268 const symbol * q = p + c;
00269 int c_orig = c;
00270
00271 int common_i = 0;
00272 int common_j = 0;
00273
00274 int first_key_inspected = 0;
00275
00276 while (1) {
00277 int k = i + ((j - i) >> 1);
00278 int diff = 0;
00279 int common = common_i < common_j ? common_i : common_j;
00280 const struct among * w = v + k;
00281 for (int x = common; x < w->s_size; x++) {
00282 if (c_orig + common == l) { diff = -1; break; }
00283 diff = q[common] - (pool + w->s)[x];
00284 if (diff != 0) break;
00285 common++;
00286 }
00287 if (diff < 0) { j = k; common_j = common; }
00288 else { i = k; common_i = common; }
00289 if (j - i <= 1) {
00290 if (i > 0) break;
00291 if (j == i) break;
00292
00293
00294
00295
00296
00297 if (first_key_inspected) break;
00298 first_key_inspected = 1;
00299 }
00300 }
00301 while (1) {
00302 const struct among * w = v + i;
00303 if (common_i >= w->s_size) {
00304 c = c_orig + w->s_size;
00305 if (!fnum || !fnum[i]) return w->result;
00306 {
00307 int res = f[fnum[i] - 1](this);
00308 c = c_orig + w->s_size;
00309 if (res) return w->result;
00310 }
00311 }
00312 i = w->substring_i;
00313 if (i < 0) return 0;
00314 }
00315 }
00316
00317
00318 int
00319 SnowballStemImplementation::find_among_b(const symbol * pool,
00320 const struct among * v, int v_size,
00321 const unsigned char * fnum,
00322 const among_function * f)
00323 {
00324 int i = 0;
00325 int j = v_size;
00326
00327 const symbol * q = p + c - 1;
00328 int c_orig = c;
00329
00330 int common_i = 0;
00331 int common_j = 0;
00332
00333 int first_key_inspected = 0;
00334
00335 while (1) {
00336 int k = i + ((j - i) >> 1);
00337 int diff = 0;
00338 int common = common_i < common_j ? common_i : common_j;
00339 const struct among * w = v + k;
00340 for (int x = w->s_size - 1 - common; x >= 0; x--) {
00341 if (c_orig - common == lb) { diff = -1; break; }
00342 diff = q[- common] - (pool + w->s)[x];
00343 if (diff != 0) break;
00344 common++;
00345 }
00346 if (diff < 0) { j = k; common_j = common; }
00347 else { i = k; common_i = common; }
00348 if (j - i <= 1) {
00349 if (i > 0) break;
00350 if (j == i) break;
00351 if (first_key_inspected) break;
00352 first_key_inspected = 1;
00353 }
00354 }
00355 while (1) {
00356 const struct among * w = v + i;
00357 if (common_i >= w->s_size) {
00358 c = c_orig - w->s_size;
00359 if (!fnum || !fnum[i]) return w->result;
00360 {
00361 int res = f[fnum[i] - 1](this);
00362 c = c_orig - w->s_size;
00363 if (res) return w->result;
00364 }
00365 }
00366 i = w->substring_i;
00367 if (i < 0) return 0;
00368 }
00369 }
00370
00371 int
00372 SnowballStemImplementation::replace_s(int c_bra, int c_ket, int s_size,
00373 const symbol * s)
00374 {
00375 int adjustment;
00376 int len;
00377 Assert(p);
00378 adjustment = s_size - (c_ket - c_bra);
00379 len = SIZE(p);
00380 if (adjustment != 0) {
00381 if (adjustment + len > CAPACITY(p)) {
00382 p = increase_size(p, adjustment + len);
00383 }
00384 memmove(p + c_ket + adjustment,
00385 p + c_ket,
00386 (len - c_ket) * sizeof(symbol));
00387 SET_SIZE(p, adjustment + len);
00388 l += adjustment;
00389 if (c >= c_ket)
00390 c += adjustment;
00391 else
00392 if (c > c_bra)
00393 c = c_bra;
00394 }
00395 if (s_size != 0) memmove(p + c_bra, s, s_size * sizeof(symbol));
00396 return adjustment;
00397 }
00398
00399 int SnowballStemImplementation::slice_check() {
00400 Assert(p);
00401 if (bra < 0 || bra > ket || ket > l) {
00402 #if 0
00403 fprintf(stderr, "faulty slice operation:\n");
00404 debug(z, -1, 0);
00405 #endif
00406 return -1;
00407 }
00408 return 0;
00409 }
00410
00411 int SnowballStemImplementation::slice_from_s(int s_size, const symbol * s) {
00412 if (slice_check()) return -1;
00413 replace_s(bra, ket, s_size, s);
00414 return 0;
00415 }
00416
00417 void
00418 SnowballStemImplementation::insert_s(int c_bra, int c_ket, int s_size,
00419 const symbol * s)
00420 {
00421 int adjustment = replace_s(c_bra, c_ket, s_size, s);
00422 if (c_bra <= bra) bra += adjustment;
00423 if (c_bra <= ket) ket += adjustment;
00424 }
00425
00426 symbol * SnowballStemImplementation::slice_to(symbol * v) {
00427 if (slice_check()) return NULL;
00428 {
00429 int len = ket - bra;
00430 if (CAPACITY(v) < len) {
00431 v = increase_size(v, len);
00432 }
00433 memmove(v, p + bra, len * sizeof(symbol));
00434 SET_SIZE(v, len);
00435 }
00436 return v;
00437 }
00438
00439 symbol * SnowballStemImplementation::assign_to(symbol * v) {
00440 int len = l;
00441 if (CAPACITY(v) < len) {
00442 v = increase_size(v, len);
00443 }
00444 memmove(v, p, len * sizeof(symbol));
00445 SET_SIZE(v, len);
00446 return v;
00447 }
00448
#if 0
/** Print the buffer with the positions of lb, bra, c, ket and l marked.
 *
 *  Markers are written as '{', '[', '|', ']' and '}' respectively, and zero
 *  symbols are shown as '#'.  A heading is printed first when number >= 0.
 *  (This code is compiled out.)
 */
void SnowballStemImplementation::debug(int number, int line_count) {
    const int limit = SIZE(p);

    if (number >= 0) {
        printf("%3d (line %4d): [%d]'", number, line_count,limit);
    }
    for (int pos = 0; pos <= limit; ++pos) {
        if (pos == lb) printf("{");
        if (pos == bra) printf("[");
        if (pos == c) printf("|");
        if (pos == ket) printf("]");
        if (pos == l) printf("}");
        if (pos < limit) {
            int ch = p[pos];
            if (ch == 0) ch = '#';
            printf("%c", ch);
        }
    }
    printf("'\n");
}
#endif
00470 }