xapian-core  1.4.21
steminternal.cc
Go to the documentation of this file.
1 
4 /* Derived from snowball's runtime/api.c:
5  *
6  * Copyright (c) 2001, Dr Martin Porter
7  * Copyright (c) 2004,2005, Richard Boulton
8  * Copyright (c) 2006,2007,2008,2009,2016 Olly Betts
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions are met:
13  *
14  * * Redistributions of source code must retain the above copyright notice,
15  * this list of conditions and the following disclaimer.
16  *
17  * * Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in the
19  * documentation and/or other materials provided with the distribution.
20  *
21  * * Neither the name of the <ORGANIZATION> nor the names of its contributors
22  * may be used to endorse or promote products derived from this software
23  * without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 /* Copyright (C) 2007,2010 Olly Betts
38  * Copyright (C) 2010 Evgeny Sizikov
39  *
40  * This program is free software; you can redistribute it and/or
41  * modify it under the terms of the GNU General Public License as
42  * published by the Free Software Foundation; either version 2 of the
43  * License, or (at your option) any later version.
44  *
45  * This program is distributed in the hope that it will be useful,
46  * but WITHOUT ANY WARRANTY; without even the implied warranty of
47  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
48  * GNU General Public License for more details.
49  *
50  * You should have received a copy of the GNU General Public License
51  * along with this program; if not, write to the Free Software
52  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
53  */
54 
55 #include <config.h>
56 
57 #include "steminternal.h"
58 
59 #include <xapian/error.h>
60 
61 #include "omassert.h"
62 
63 #include <cstdlib>
64 #include <cstring>
65 
66 #include <string>
67 
68 using namespace std;
69 
70 namespace Xapian {
71 
72 #define CREATE_SIZE 16
73 
74 symbol *
75 SnowballStemImplementation::create_s()
76 {
77  void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
78  if (mem == NULL) throw std::bad_alloc();
79  symbol * p = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
80  SET_CAPACITY(p, CREATE_SIZE);
81  SET_SIZE(p, 0);
82  return p;
83 }
84 
85 /*
86  new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
87  if n +ve, or n characters backwards from p + c - 1 if n -ve. new_c is the new
88  value of the cursor c, or -1 on failure.
89 
90  -- used to implement hop and next in the utf8 case.
91 */
92 
93 int
94 SnowballStemImplementation::skip_utf8(const symbol * p, int c, int lb, int l, int n)
95 {
96  if (n >= 0) {
97  for (; n > 0; --n) {
98  if (c >= l) return -1;
99  if (p[c++] >= 0xC0) { /* 1100 0000 */
100  while (c < l) {
101  /* break unless p[c] is 10------ */
102  if (p[c] >> 6 != 2) break;
103  c++;
104  }
105  }
106  }
107  } else {
108  for (; n < 0; ++n) {
109  if (c <= lb) return -1;
110  if (p[--c] >= 0x80) { /* 1000 0000 */
111  while (c > lb) {
112  if (p[c] >= 0xC0) break; /* 1100 0000 */
113  c--;
114  }
115  }
116  }
117  }
118  return c;
119 }
120 
121 
122 /* Increase the size of the buffer pointed to by p to at least n symbols.
123  * If insufficient memory, throw std::bad_alloc().
124  */
125 symbol *
126 SnowballStemImplementation::increase_size(symbol * p, int n)
127 {
128  int new_size = n + 20;
129  void * mem = realloc(reinterpret_cast<char *>(p) - HEAD,
130  HEAD + (new_size + 1) * sizeof(symbol));
131  if (mem == NULL) {
132  throw std::bad_alloc();
133  }
134  symbol * q = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
135  SET_CAPACITY(q, new_size);
136  return q;
137 }
138 
139 
140 StemImplementation::~StemImplementation() { }
141 
142 SnowballStemImplementation::~SnowballStemImplementation()
143 {
144  lose_s(p);
145 }
146 
147 string
148 SnowballStemImplementation::operator()(const string & word)
149 {
150  const symbol * s = reinterpret_cast<const symbol *>(word.data());
151  replace_s(0, l, word.size(), s);
152  c = 0;
153  if (stem() < 0) {
154  // FIXME: Is there a better choice of exception class?
155  throw Xapian::InternalError("stemming exception!");
156  }
157  return string(reinterpret_cast<const char *>(p), l);
158 }
159 
160 /* Code for character groupings: utf8 cases */
161 
163  int b0, b1, b2;
164  int tmp = c;
165  if (tmp >= l) return 0;
166  b0 = p[tmp++];
167  if (b0 < 0xC0 || tmp == l) { /* 1100 0000 */
168  *slot = b0;
169  return 1;
170  }
171  b1 = p[tmp++] & 0x3F;
172  if (b0 < 0xE0 || tmp == l) { /* 1110 0000 */
173  *slot = (b0 & 0x1F) << 6 | b1;
174  return 2;
175  }
176  b2 = p[tmp++] & 0x3F;
177  if (b0 < 0xF0 || tmp == l) { /* 1111 0000 */
178  *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
179  return 3;
180  }
181  *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[tmp] & 0x3F);
182  return 4;
183 }
184 
185 int SnowballStemImplementation::get_b_utf8(int * slot) {
186  int a, b;
187  int tmp = c;
188  if (tmp <= lb) return 0;
189  b = p[--tmp];
190  if (b < 0x80 || tmp == lb) { /* 1000 0000 */
191  *slot = b;
192  return 1;
193  }
194  a = b & 0x3F;
195  b = p[--tmp];
196  if (b >= 0xC0 || tmp == lb) { /* 1100 0000 */
197  *slot = (b & 0x1F) << 6 | a;
198  return 2;
199  }
200  a |= (b & 0x3F) << 6;
201  b = p[--tmp];
202  if (b >= 0xE0 || tmp == lb) { /* 1110 0000 */
203  *slot = (b & 0xF) << 12 | a;
204  return 3;
205  }
206  *slot = (p[--tmp] & 0xE) << 18 | (b & 0x3F) << 12 | a;
207  return 4;
208 }
209 
210 int
211 SnowballStemImplementation::in_grouping_U(const unsigned char * s, int min,
212  int max, int repeat)
213 {
214  do {
215  int ch;
216  int w = get_utf8(&ch);
217  if (!w) return -1;
218  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
219  return w;
220  c += w;
221  } while (repeat);
222  return 0;
223 }
224 
225 int
226 SnowballStemImplementation::in_grouping_b_U(const unsigned char * s, int min,
227  int max, int repeat)
228 {
229  do {
230  int ch;
231  int w = get_b_utf8(&ch);
232  if (!w) return -1;
233  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
234  return w;
235  c -= w;
236  } while (repeat);
237  return 0;
238 }
239 
240 int
241 SnowballStemImplementation::out_grouping_U(const unsigned char * s, int min,
242  int max, int repeat)
243 {
244  do {
245  int ch;
246  int w = get_utf8(&ch);
247  if (!w) return -1;
248  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
249  /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w;
250  c += w;
251  } while (repeat);
252  return 0;
253 }
254 
255 int
256 SnowballStemImplementation::out_grouping_b_U(const unsigned char * s, int min,
257  int max, int repeat)
258 {
259  do {
260  int ch;
261  int w = get_b_utf8(&ch);
262  if (!w) return -1;
263  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
264  return w;
265  c -= w;
266  } while (repeat);
267  return 0;
268 }
269 
270 int SnowballStemImplementation::eq_s(int s_size, const symbol * s) {
271  if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
272  return 0;
273  c += s_size;
274  return 1;
275 }
276 
277 int SnowballStemImplementation::eq_s_b(int s_size, const symbol * s) {
278  if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
279  return 0;
280  c -= s_size;
281  return 1;
282 }
283 
284 int
285 SnowballStemImplementation::find_among(const symbol * pool,
286  const struct among * v, int v_size,
287  const unsigned char * fnum,
288  const among_function * f)
289 {
290  int i = 0;
291  int j = v_size;
292 
293  const symbol * q = p + c;
294  int c_orig = c;
295 
296  int common_i = 0;
297  int common_j = 0;
298 
299  int first_key_inspected = 0;
300 
301  while (1) {
302  int k = i + ((j - i) >> 1);
303  int diff = 0;
304  int common = common_i < common_j ? common_i : common_j; /* smaller */
305  const struct among * w = v + k;
306  for (int x = common; x < w->s_size; ++x) {
307  if (c_orig + common == l) { diff = -1; break; }
308  diff = q[common] - (pool + w->s)[x];
309  if (diff != 0) break;
310  ++common;
311  }
312  if (diff < 0) {
313  j = k;
314  common_j = common;
315  } else {
316  i = k;
317  common_i = common;
318  }
319  if (j - i <= 1) {
320  if (i > 0) break; /* v->s has been inspected */
321  if (j == i) break; /* only one item in v */
322 
323  /* - but now we need to go round once more to get
324  v->s inspected. This looks messy, but is actually
325  the optimal approach. */
326 
327  if (first_key_inspected) break;
328  first_key_inspected = 1;
329  }
330  }
331  while (1) {
332  const struct among * w = v + i;
333  if (common_i >= w->s_size) {
334  c = c_orig + w->s_size;
335  if (!fnum || !fnum[i]) return w->result;
336  {
337  int res = f[fnum[i] - 1](this);
338  c = c_orig + w->s_size;
339  if (res) return w->result;
340  }
341  }
342  i = w->substring_i;
343  if (i < 0) return 0;
344  }
345 }
346 
347 /* find_among_b is for backwards processing. Same comments apply */
348 int
349 SnowballStemImplementation::find_among_b(const symbol * pool,
350  const struct among * v, int v_size,
351  const unsigned char * fnum,
352  const among_function * f)
353 {
354  int i = 0;
355  int j = v_size;
356 
357  const symbol * q = p + c - 1;
358  int c_orig = c;
359 
360  int common_i = 0;
361  int common_j = 0;
362 
363  int first_key_inspected = 0;
364 
365  while (1) {
366  int k = i + ((j - i) >> 1);
367  int diff = 0;
368  int common = common_i < common_j ? common_i : common_j;
369  const struct among * w = v + k;
370  for (int x = w->s_size - 1 - common; x >= 0; --x) {
371  if (c_orig - common == lb) { diff = -1; break; }
372  diff = q[- common] - (pool + w->s)[x];
373  if (diff != 0) break;
374  ++common;
375  }
376  if (diff < 0) { j = k; common_j = common; }
377  else { i = k; common_i = common; }
378  if (j - i <= 1) {
379  if (i > 0) break;
380  if (j == i) break;
381  if (first_key_inspected) break;
382  first_key_inspected = 1;
383  }
384  }
385  while (1) {
386  const struct among * w = v + i;
387  if (common_i >= w->s_size) {
388  c = c_orig - w->s_size;
389  if (!fnum || !fnum[i]) return w->result;
390  {
391  int res = f[fnum[i] - 1](this);
392  c = c_orig - w->s_size;
393  if (res) return w->result;
394  }
395  }
396  i = w->substring_i;
397  if (i < 0) return 0;
398  }
399 }
400 
401 int
402 SnowballStemImplementation::replace_s(int c_bra, int c_ket, int s_size,
403  const symbol * s)
404 {
405  int adjustment;
406  int len;
407  Assert(p);
408  adjustment = s_size - (c_ket - c_bra);
409  len = SIZE(p);
410  if (adjustment != 0) {
411  if (adjustment + len > CAPACITY(p)) {
412  p = increase_size(p, adjustment + len);
413  }
414  memmove(p + c_ket + adjustment,
415  p + c_ket,
416  (len - c_ket) * sizeof(symbol));
417  SET_SIZE(p, adjustment + len);
418  l += adjustment;
419  if (c >= c_ket)
420  c += adjustment;
421  else if (c > c_bra)
422  c = c_bra;
423  }
424  if (s_size) memmove(p + c_bra, s, s_size * sizeof(symbol));
425  return adjustment;
426 }
427 
428 int SnowballStemImplementation::slice_check() {
429  Assert(p);
430  if (bra < 0 || bra > ket || ket > l) {
431 #if 0
432  fprintf(stderr, "faulty slice operation:\n");
433  debug(z, -1, 0);
434 #endif
435  return -1;
436  }
437  return 0;
438 }
439 
440 int SnowballStemImplementation::slice_from_s(int s_size, const symbol * s) {
441  if (slice_check()) return -1;
442  replace_s(bra, ket, s_size, s);
443  return 0;
444 }
445 
446 void
447 SnowballStemImplementation::insert_s(int c_bra, int c_ket, int s_size,
448  const symbol * s)
449 {
450  int adjustment = replace_s(c_bra, c_ket, s_size, s);
451  if (c_bra <= bra) bra += adjustment;
452  if (c_bra <= ket) ket += adjustment;
453 }
454 
455 symbol * SnowballStemImplementation::slice_to(symbol * v) {
456  if (slice_check()) return NULL;
457  {
458  int len = ket - bra;
459  if (CAPACITY(v) < len) {
460  v = increase_size(v, len);
461  }
462  memmove(v, p + bra, len * sizeof(symbol));
463  SET_SIZE(v, len);
464  }
465  return v;
466 }
467 
468 symbol * SnowballStemImplementation::assign_to(symbol * v) {
469  int len = l;
470  if (CAPACITY(v) < len) {
471  v = increase_size(v, len);
472  }
473  memmove(v, p, len * sizeof(symbol));
474  SET_SIZE(v, len);
475  return v;
476 }
477 
478 int SnowballStemImplementation::len_utf8(const symbol * v) {
479  int size = SIZE(v);
480  int len = 0;
481  while (size--) {
482  symbol b = *v++;
483  if (static_cast<signed char>(b) >= static_cast<signed char>(0xc0))
484  ++len;
485  }
486  return len;
487 }
488 
489 #if 0
490 void SnowballStemImplementation::debug(int number, int line_count) {
491  int i;
492  int limit = SIZE(p);
493  if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count, limit);
494  for (i = 0; i <= limit; ++i) {
495  if (lb == i) printf("{");
496  if (bra == i) printf("[");
497  if (c == i) printf("|");
498  if (ket == i) printf("]");
499  if (l == i) printf("}");
500  if (i < limit) {
501  int ch = p[i];
502  if (ch == 0) ch = '#';
503  printf("%c", ch);
504  }
505  }
506  printf("'\n");
507 }
508 #endif
509 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
int s_size
Definition: steminternal.h:39
#define Assert(COND)
Definition: omassert.h:122
int substring_i
Definition: steminternal.h:41
#define CREATE_SIZE
Definition: steminternal.cc:72
int number
Definition: header.h:195
STL namespace.
#define HEAD
Definition: steminternal.h:34
Hierarchy of classes which Xapian can throw as exceptions.
Base class for implementations of stemming algorithms.
unsigned short symbol
Definition: header.h:6
Definition: header.h:86
int(* among_function)(Xapian::StemImplementation *)
Definition: steminternal.h:36
Definition: header.h:90
Definition: header.h:191
int result
Definition: steminternal.h:42
InternalError indicates a runtime problem of some sort.
Definition: error.h:761
#define CAPACITY(p)
Definition: header.h:18
int get_utf8(const symbol *p, int *slot)
#define SIZE(p)
Definition: header.h:17
Various assertion macros.
struct amongvec * b
Definition: header.h:194
unsigned s
Definition: steminternal.h:40
void lose_s(symbol *p)
Definition: steminternal.h:45