00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054 #include <config.h>
00055
00056 #include <xapian/error.h>
00057
00058 #include "omassert.h"
00059 #include "steminternal.h"
00060
00061 #if 0
00062 #include <stdio.h>
00063 #endif
00064 #include <stdlib.h>
00065 #include <string.h>
00066
00067 #include <string>
00068
00069 using namespace std;
00070
00071 #define CREATE_SIZE 16
00072
00073 extern symbol * create_s() {
00074 void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
00075 if (mem == NULL) throw std::bad_alloc();
00076 symbol * p = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00077 SET_CAPACITY(p, CREATE_SIZE);
00078 SET_SIZE(p, CREATE_SIZE);
00079 return p;
00080 }
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090 extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
00091 if (n >= 0) {
00092 for (; n > 0; n--) {
00093 if (c >= l) return -1;
00094 if (p[c++] >= 0xC0) {
00095 while (c < l) {
00096
00097 if (p[c] >> 6 != 2) break;
00098 c++;
00099 }
00100 }
00101 }
00102 } else {
00103 for (; n < 0; n++) {
00104 if (c <= lb) return -1;
00105 if (p[--c] >= 0x80) {
00106 while (c > lb) {
00107 if (p[c] >= 0xC0) break;
00108 c--;
00109 }
00110 }
00111 }
00112 }
00113 return c;
00114 }
00115
00116
00117
00118
00119
00120 static symbol * increase_size(symbol * p, int n) {
00121 int new_size = n + 20;
00122 void * mem = realloc(reinterpret_cast<char *>(p) - HEAD,
00123 HEAD + (new_size + 1) * sizeof(symbol));
00124 if (mem == NULL) {
00125 throw std::bad_alloc();
00126 }
00127 symbol * q = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00128 SET_CAPACITY(q, new_size);
00129 return q;
00130 }
00131
00132 namespace Xapian {
00133
00134 Stem::Internal::~Internal()
00135 {
00136 lose_s(p);
00137 }
00138
00139 string
00140 Stem::Internal::operator()(const string & word)
00141 {
00142 const symbol * s = reinterpret_cast<const symbol *>(word.data());
00143 replace_s(0, l, word.size(), s);
00144 c = 0;
00145 if (stem() < 0) {
00146
00147 throw Xapian::InternalError("stemming exception!");
00148 }
00149 return string(reinterpret_cast<const char *>(p), l);
00150 }
00151
00152
00153
00154 int Stem::Internal::get_utf8(int * slot) {
00155 int b0, b1;
00156 int tmp = c;
00157 if (tmp >= l) return 0;
00158 b0 = p[tmp++];
00159 if (b0 < 0xC0 || tmp == l) {
00160 * slot = b0; return 1;
00161 }
00162 b1 = p[tmp++];
00163 if (b0 < 0xE0 || tmp == l) {
00164 * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
00165 }
00166 * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
00167 }
00168
00169 int Stem::Internal::get_b_utf8(int * slot) {
00170 int b0, b1;
00171 int tmp = c;
00172 if (tmp <= lb) return 0;
00173 b0 = p[--tmp];
00174 if (b0 < 0x80 || tmp == lb) {
00175 * slot = b0; return 1;
00176 }
00177 b1 = p[--tmp];
00178 if (b1 >= 0xC0 || tmp == lb) {
00179 * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
00180 }
00181 * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
00182 }
00183
00184 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00185 do {
00186 int ch;
00187 int w = get_utf8(&ch);
00188 if (!w) return -1;
00189 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00190 return w;
00191 c += w;
00192 } while (repeat);
00193 return 0;
00194 }
00195
00196 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00197 do {
00198 int ch;
00199 int w = get_b_utf8(&ch);
00200 if (!w) return -1;
00201 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00202 return w;
00203 c -= w;
00204 } while (repeat);
00205 return 0;
00206 }
00207
00208 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00209 do {
00210 int ch;
00211 int w = get_utf8(&ch);
00212 if (!w) return -1;
00213 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00214 return w;
00215 c += w;
00216 } while (repeat);
00217 return 0;
00218 }
00219
00220 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00221 do {
00222 int ch;
00223 int w = get_b_utf8(&ch);
00224 if (!w) return -1;
00225 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00226 return w;
00227 c -= w;
00228 } while (repeat);
00229 return 0;
00230 }
00231
00232 int Stem::Internal::eq_s(int s_size, const symbol * s) {
00233 if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
00234 return 0;
00235 c += s_size;
00236 return 1;
00237 }
00238
00239 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
00240 if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
00241 return 0;
00242 c -= s_size;
00243 return 1;
00244 }
00245
00246 int
00247 Stem::Internal::find_among(const symbol * pool, const struct among * v,
00248 int v_size, const unsigned char * fnum,
00249 const among_function * f)
00250 {
00251 int i = 0;
00252 int j = v_size;
00253
00254 const symbol * q = p + c;
00255 int c_orig = c;
00256
00257 int common_i = 0;
00258 int common_j = 0;
00259
00260 int first_key_inspected = 0;
00261
00262 while (1) {
00263 int k = i + ((j - i) >> 1);
00264 int diff = 0;
00265 int common = common_i < common_j ? common_i : common_j;
00266 const struct among * w = v + k;
00267 for (int x = common; x < w->s_size; x++) {
00268 if (c_orig + common == l) { diff = -1; break; }
00269 diff = q[common] - (pool + w->s)[x];
00270 if (diff != 0) break;
00271 common++;
00272 }
00273 if (diff < 0) { j = k; common_j = common; }
00274 else { i = k; common_i = common; }
00275 if (j - i <= 1) {
00276 if (i > 0) break;
00277 if (j == i) break;
00278
00279
00280
00281
00282
00283 if (first_key_inspected) break;
00284 first_key_inspected = 1;
00285 }
00286 }
00287 while (1) {
00288 const struct among * w = v + i;
00289 if (common_i >= w->s_size) {
00290 c = c_orig + w->s_size;
00291 if (!fnum || !fnum[i]) return w->result;
00292 {
00293 int res = f[fnum[i] - 1](this);
00294 c = c_orig + w->s_size;
00295 if (res) return w->result;
00296 }
00297 }
00298 i = w->substring_i;
00299 if (i < 0) return 0;
00300 }
00301 }
00302
00303
00304 int
00305 Stem::Internal::find_among_b(const symbol * pool, const struct among * v,
00306 int v_size, const unsigned char * fnum,
00307 const among_function * f)
00308 {
00309 int i = 0;
00310 int j = v_size;
00311
00312 const symbol * q = p + c - 1;
00313 int c_orig = c;
00314
00315 int common_i = 0;
00316 int common_j = 0;
00317
00318 int first_key_inspected = 0;
00319
00320 while (1) {
00321 int k = i + ((j - i) >> 1);
00322 int diff = 0;
00323 int common = common_i < common_j ? common_i : common_j;
00324 const struct among * w = v + k;
00325 for (int x = w->s_size - 1 - common; x >= 0; x--) {
00326 if (c_orig - common == lb) { diff = -1; break; }
00327 diff = q[- common] - (pool + w->s)[x];
00328 if (diff != 0) break;
00329 common++;
00330 }
00331 if (diff < 0) { j = k; common_j = common; }
00332 else { i = k; common_i = common; }
00333 if (j - i <= 1) {
00334 if (i > 0) break;
00335 if (j == i) break;
00336 if (first_key_inspected) break;
00337 first_key_inspected = 1;
00338 }
00339 }
00340 while (1) {
00341 const struct among * w = v + i;
00342 if (common_i >= w->s_size) {
00343 c = c_orig - w->s_size;
00344 if (!fnum || !fnum[i]) return w->result;
00345 {
00346 int res = f[fnum[i] - 1](this);
00347 c = c_orig - w->s_size;
00348 if (res) return w->result;
00349 }
00350 }
00351 i = w->substring_i;
00352 if (i < 0) return 0;
00353 }
00354 }
00355
00356 int
00357 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
00358 {
00359 int adjustment;
00360 int len;
00361 Assert(p);
00362 adjustment = s_size - (c_ket - c_bra);
00363 len = SIZE(p);
00364 if (adjustment != 0) {
00365 if (adjustment + len > CAPACITY(p)) {
00366 p = increase_size(p, adjustment + len);
00367 }
00368 memmove(p + c_ket + adjustment,
00369 p + c_ket,
00370 (len - c_ket) * sizeof(symbol));
00371 SET_SIZE(p, adjustment + len);
00372 l += adjustment;
00373 if (c >= c_ket)
00374 c += adjustment;
00375 else
00376 if (c > c_bra)
00377 c = c_bra;
00378 }
00379 if (s_size != 0) memmove(p + c_bra, s, s_size * sizeof(symbol));
00380 return adjustment;
00381 }
00382
00383 int Stem::Internal::slice_check() {
00384 Assert(p);
00385 if (bra < 0 || bra > ket || ket > l) {
00386 #if 0
00387 fprintf(stderr, "faulty slice operation:\n");
00388 debug(z, -1, 0);
00389 #endif
00390 return -1;
00391 }
00392 return 0;
00393 }
00394
00395 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
00396 if (slice_check()) return -1;
00397 replace_s(bra, ket, s_size, s);
00398 return 0;
00399 }
00400
00401 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
00402 int adjustment = replace_s(c_bra, c_ket, s_size, s);
00403 if (c_bra <= bra) bra += adjustment;
00404 if (c_bra <= ket) ket += adjustment;
00405 }
00406
00407 symbol * Stem::Internal::slice_to(symbol * v) {
00408 if (slice_check()) return NULL;
00409 {
00410 int len = ket - bra;
00411 if (CAPACITY(v) < len) {
00412 v = increase_size(v, len);
00413 }
00414 memmove(v, p + bra, len * sizeof(symbol));
00415 SET_SIZE(v, len);
00416 }
00417 return v;
00418 }
00419
00420 symbol * Stem::Internal::assign_to(symbol * v) {
00421 int len = l;
00422 if (CAPACITY(v) < len) {
00423 v = increase_size(v, len);
00424 }
00425 memmove(v, p, len * sizeof(symbol));
00426 SET_SIZE(v, len);
00427 return v;
00428 }
00429
00430 #if 0
00431 void Stem::Internal::debug(int number, int line_count) {
00432 int i;
00433 int limit = SIZE(p);
00434
00435 if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
00436 for (i = 0; i <= limit; i++) {
00437 if (lb == i) printf("{");
00438 if (bra == i) printf("[");
00439 if (c == i) printf("|");
00440 if (ket == i) printf("]");
00441 if (l == i) printf("}");
00442 if (i < limit)
00443 { int ch = p[i];
00444 if (ch == 0) ch = '#';
00445 printf("%c", ch);
00446 }
00447 }
00448 printf("'\n");
00449 }
00450 #endif
00451 }