xapian-core  2.0.0
utilities.cc
Go to the documentation of this file.
1 
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #define SNOWBALL_RUNTIME_THROW_EXCEPTIONS
7 
8 #include "snowball_runtime.h"
9 
10 #ifdef SNOWBALL_RUNTIME_THROW_EXCEPTIONS
11 # include <new>
12 # include <stdexcept>
13 # define SNOWBALL_RETURN_OK return
14 # define SNOWBALL_RETURN_OR_THROW(R, E) throw E
15 # define SNOWBALL_PROPAGATE_ERR(F) F
16 #else
17 # define SNOWBALL_RETURN_OK return 0
18 # define SNOWBALL_RETURN_OR_THROW(R, E) return R
19 # define SNOWBALL_PROPAGATE_ERR(F) do { \
20  int snowball_err = F; \
21  if (snowball_err < 0) return snowball_err; \
22  } while (0)
23 #endif
24 
25 #define CREATE_SIZE 1
26 
27 extern symbol * create_s(void) {
28  symbol * p;
29  void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
30  if (mem == NULL)
31  SNOWBALL_RETURN_OR_THROW(NULL, std::bad_alloc());
32  p = (symbol *) (HEAD + (char *) mem);
34  SET_SIZE(p, 0);
35  return p;
36 }
37 
38 extern void lose_s(symbol * p) {
39  if (p == NULL) return;
40  free((char *) p - HEAD);
41 }
42 
43 /*
44  new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c.
45  new_p is the new position, or -1 on failure.
46 
47  -- used to implement hop and next in the utf8 case.
48 */
49 
50 extern int skip_utf8(const symbol * p, int c, int limit, int n) {
51  int b;
52  if (n < 0) return -1;
53  for (; n > 0; n--) {
54  if (c >= limit) return -1;
55  b = p[c++];
56  if (b >= 0xC0) { /* 1100 0000 */
57  while (c < limit) {
58  b = p[c];
59  if (b >= 0xC0 || b < 0x80) break;
60  /* break unless b is 10------ */
61  c++;
62  }
63  }
64  }
65  return c;
66 }
67 
68 /*
69  new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1
70  new_p is the new position, or -1 on failure.
71 
72  -- used to implement hop and next in the utf8 case.
73 */
74 
75 extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
76  int b;
77  if (n < 0) return -1;
78  for (; n > 0; n--) {
79  if (c <= limit) return -1;
80  b = p[--c];
81  if (b >= 0x80) { /* 1000 0000 */
82  while (c > limit) {
83  b = p[c];
84  if (b >= 0xC0) break; /* 1100 0000 */
85  c--;
86  }
87  }
88  }
89  return c;
90 }
91 
92 /* Code for character groupings: utf8 cases */
93 
94 static int get_utf8(const symbol * p, int c, int l, int * slot) {
95  int b0, b1, b2;
96  if (c >= l) return 0;
97  b0 = p[c++];
98  if (b0 < 0xC0 || c == l) { /* 1100 0000 */
99  *slot = b0;
100  return 1;
101  }
102  b1 = p[c++] & 0x3F;
103  if (b0 < 0xE0 || c == l) { /* 1110 0000 */
104  *slot = (b0 & 0x1F) << 6 | b1;
105  return 2;
106  }
107  b2 = p[c++] & 0x3F;
108  if (b0 < 0xF0 || c == l) { /* 1111 0000 */
109  *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
110  return 3;
111  }
112  *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
113  return 4;
114 }
115 
116 static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
117  int a, b;
118  if (c <= lb) return 0;
119  b = p[--c];
120  if (b < 0x80 || c == lb) { /* 1000 0000 */
121  *slot = b;
122  return 1;
123  }
124  a = b & 0x3F;
125  b = p[--c];
126  if (b >= 0xC0 || c == lb) { /* 1100 0000 */
127  *slot = (b & 0x1F) << 6 | a;
128  return 2;
129  }
130  a |= (b & 0x3F) << 6;
131  b = p[--c];
132  if (b >= 0xE0 || c == lb) { /* 1110 0000 */
133  *slot = (b & 0xF) << 12 | a;
134  return 3;
135  }
136  *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
137  return 4;
138 }
139 
140 extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
141  do {
142  int ch;
143  int w = get_utf8(z->p, z->c, z->l, & ch);
144  if (!w) return -1;
145  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
146  return w;
147  z->c += w;
148  } while (repeat);
149  return 0;
150 }
151 
152 extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
153  do {
154  int ch;
155  int w = get_b_utf8(z->p, z->c, z->lb, & ch);
156  if (!w) return -1;
157  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
158  return w;
159  z->c -= w;
160  } while (repeat);
161  return 0;
162 }
163 
164 extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
165  do {
166  int ch;
167  int w = get_utf8(z->p, z->c, z->l, & ch);
168  if (!w) return -1;
169  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
170  return w;
171  z->c += w;
172  } while (repeat);
173  return 0;
174 }
175 
176 extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
177  do {
178  int ch;
179  int w = get_b_utf8(z->p, z->c, z->lb, & ch);
180  if (!w) return -1;
181  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
182  return w;
183  z->c -= w;
184  } while (repeat);
185  return 0;
186 }
187 
188 /* Code for character groupings: non-utf8 cases */
189 
190 extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
191  do {
192  int ch;
193  if (z->c >= z->l) return -1;
194  ch = z->p[z->c];
195  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
196  return 1;
197  z->c++;
198  } while (repeat);
199  return 0;
200 }
201 
202 extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
203  do {
204  int ch;
205  if (z->c <= z->lb) return -1;
206  ch = z->p[z->c - 1];
207  if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
208  return 1;
209  z->c--;
210  } while (repeat);
211  return 0;
212 }
213 
214 extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
215  do {
216  int ch;
217  if (z->c >= z->l) return -1;
218  ch = z->p[z->c];
219  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
220  return 1;
221  z->c++;
222  } while (repeat);
223  return 0;
224 }
225 
226 extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
227  do {
228  int ch;
229  if (z->c <= z->lb) return -1;
230  ch = z->p[z->c - 1];
231  if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
232  return 1;
233  z->c--;
234  } while (repeat);
235  return 0;
236 }
237 
238 extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
239  if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
240  z->c += s_size; return 1;
241 }
242 
243 extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
244  if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
245  z->c -= s_size; return 1;
246 }
247 
248 extern int eq_v(struct SN_env * z, const symbol * p) {
249  return eq_s(z, SIZE(p), p);
250 }
251 
252 extern int eq_v_b(struct SN_env * z, const symbol * p) {
253  return eq_s_b(z, SIZE(p), p);
254 }
255 
256 extern int find_among(struct SN_env * z, const struct among * v, int v_size,
257  int (*call_among_func)(struct SN_env*)) {
258 
259  int i = 0;
260  int j = v_size;
261 
262  int c = z->c; int l = z->l;
263  const symbol * q = z->p + c;
264 
265  const struct among * w;
266 
267  int common_i = 0;
268  int common_j = 0;
269 
270  int first_key_inspected = 0;
271 
272  while (1) {
273  int k = i + ((j - i) >> 1);
274  int diff = 0;
275  int common = common_i < common_j ? common_i : common_j; /* smaller */
276  w = v + k;
277  {
278  int i2; for (i2 = common; i2 < w->s_size; i2++) {
279  if (c + common == l) { diff = -1; break; }
280  diff = q[common] - w->s[i2];
281  if (diff != 0) break;
282  common++;
283  }
284  }
285  if (diff < 0) {
286  j = k;
287  common_j = common;
288  } else {
289  i = k;
290  common_i = common;
291  }
292  if (j - i <= 1) {
293  if (i > 0) break; /* v->s has been inspected */
294  if (j == i) break; /* only one item in v */
295 
296  /* - but now we need to go round once more to get
297  v->s inspected. This looks messy, but is actually
298  the optimal approach. */
299 
300  if (first_key_inspected) break;
301  first_key_inspected = 1;
302  }
303  }
304  w = v + i;
305  while (1) {
306  if (common_i >= w->s_size) {
307  z->c = c + w->s_size;
308  if (!w->function) return w->result;
309  z->af = w->function;
310  if (call_among_func(z)) {
311  z->c = c + w->s_size;
312  return w->result;
313  }
314  }
315  if (!w->substring_i) return 0;
316  w += w->substring_i;
317  }
318 }
319 
320 /* find_among_b is for backwards processing. Same comments apply */
321 
322 extern int find_among_b(struct SN_env * z, const struct among * v, int v_size,
323  int (*call_among_func)(struct SN_env*)) {
324 
325  int i = 0;
326  int j = v_size;
327 
328  int c = z->c; int lb = z->lb;
329  const symbol * q = z->p + c - 1;
330 
331  const struct among * w;
332 
333  int common_i = 0;
334  int common_j = 0;
335 
336  int first_key_inspected = 0;
337 
338  while (1) {
339  int k = i + ((j - i) >> 1);
340  int diff = 0;
341  int common = common_i < common_j ? common_i : common_j;
342  w = v + k;
343  {
344  int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
345  if (c - common == lb) { diff = -1; break; }
346  diff = q[- common] - w->s[i2];
347  if (diff != 0) break;
348  common++;
349  }
350  }
351  if (diff < 0) { j = k; common_j = common; }
352  else { i = k; common_i = common; }
353  if (j - i <= 1) {
354  if (i > 0) break;
355  if (j == i) break;
356  if (first_key_inspected) break;
357  first_key_inspected = 1;
358  }
359  }
360  w = v + i;
361  while (1) {
362  if (common_i >= w->s_size) {
363  z->c = c - w->s_size;
364  if (!w->function) return w->result;
365  z->af = w->function;
366  if (call_among_func(z)) {
367  z->c = c - w->s_size;
368  return w->result;
369  }
370  }
371  if (!w->substring_i) return 0;
372  w += w->substring_i;
373  }
374 }
375 
376 
377 /* Increase the size of the buffer pointed to by p to at least n symbols.
378  * On success, returns 0. If insufficient memory, returns -1.
379  */
380 static int increase_size(symbol ** p, int n) {
381  int new_size = n + 20;
382  void * mem = realloc((char *) *p - HEAD,
383  HEAD + (new_size + 1) * sizeof(symbol));
384  symbol * q;
385  if (mem == NULL) return -1;
386  q = (symbol *) (HEAD + (char *)mem);
387  CAPACITY(q) = new_size;
388  *p = q;
389  return 0;
390 }
391 
392 /* to replace symbols between c_bra and c_ket in z->p by the
393  s_size symbols at s.
394  Returns 0 on success, -1 on error.
395 */
396 extern SNOWBALL_ERR replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s)
397 {
398  int adjustment = s_size - (c_ket - c_bra);
399  if (adjustment != 0) {
400  int len = SIZE(z->p);
401  if (adjustment + len > CAPACITY(z->p)) {
402  SNOWBALL_PROPAGATE_ERR(increase_size(&z->p, adjustment + len));
403  }
404  memmove(z->p + c_ket + adjustment,
405  z->p + c_ket,
406  (len - c_ket) * sizeof(symbol));
407  SET_SIZE(z->p, adjustment + len);
408  z->l += adjustment;
409  if (z->c >= c_ket)
410  z->c += adjustment;
411  else if (z->c > c_bra)
412  z->c = c_bra;
413  }
414  if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
416 }
417 
418 # define REPLACE_S(Z, B, K, SIZE, S) \
419  SNOWBALL_PROPAGATE_ERR(replace_s(Z, B, K, SIZE, S))
420 
421 static SNOWBALL_ERR slice_check(struct SN_env * z) {
422 
423  if (z->bra < 0 ||
424  z->bra > z->ket ||
425  z->ket > z->l ||
426  z->l > SIZE(z->p)) /* this line could be removed */
427  {
428 #if 0
429  fprintf(stderr, "faulty slice operation:\n");
430  debug(z, -1, 0);
431 #endif
432  SNOWBALL_RETURN_OR_THROW(-1, std::logic_error("Snowball slice invalid"));
433  }
435 }
436 
437 # define SLICE_CHECK(Z) SNOWBALL_PROPAGATE_ERR(slice_check(Z))
438 
439 extern SNOWBALL_ERR slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
440  SLICE_CHECK(z);
441  REPLACE_S(z, z->bra, z->ket, s_size, s);
442  z->ket = z->bra + s_size;
444 }
445 
446 extern SNOWBALL_ERR slice_from_v(struct SN_env * z, const symbol * p) {
447  return slice_from_s(z, SIZE(p), p);
448 }
449 
450 extern SNOWBALL_ERR slice_del(struct SN_env * z) {
451  SLICE_CHECK(z);
452  {
453  int slice_size = z->ket - z->bra;
454  if (slice_size != 0) {
455  int len = SIZE(z->p);
456  memmove(z->p + z->bra,
457  z->p + z->ket,
458  (len - z->ket) * sizeof(symbol));
459  SET_SIZE(z->p, len - slice_size);
460  z->l -= slice_size;
461  if (z->c >= z->ket)
462  z->c -= slice_size;
463  else if (z->c > z->bra)
464  z->c = z->bra;
465  }
466  }
467  z->ket = z->bra;
469 }
470 
471 extern SNOWBALL_ERR insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
472  REPLACE_S(z, bra, ket, s_size, s);
473  if (bra <= z->ket) {
474  int adjustment = s_size - (ket - bra);
475  z->ket += adjustment;
476  if (bra <= z->bra) z->bra += adjustment;
477  }
479 }
480 
481 extern SNOWBALL_ERR insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
482  return insert_s(z, bra, ket, SIZE(p), p);
483 }
484 
485 extern SNOWBALL_ERR slice_to(struct SN_env * z, symbol ** p) {
486  SLICE_CHECK(z);
487  {
488  int len = z->ket - z->bra;
489  if (CAPACITY(*p) < len) {
491  }
492  memmove(*p, z->p + z->bra, len * sizeof(symbol));
493  SET_SIZE(*p, len);
494  }
496 }
497 
498 extern SNOWBALL_ERR assign_to(struct SN_env * z, symbol ** p) {
499  int len = z->l;
500  if (CAPACITY(*p) < len) {
502  }
503  memmove(*p, z->p, len * sizeof(symbol));
504  SET_SIZE(*p, len);
506 }
507 
508 extern int len_utf8(const symbol * p) {
509  int size = SIZE(p);
510  int len = 0;
511  while (size--) {
512  symbol b = *p++;
513  if (b >= 0xC0 || b < 0x80) ++len;
514  }
515  return len;
516 }
unsigned char symbol
Definition: api.h:4
PositionList * p
#define SIZE(p)
Definition: header.h:21
@ c_ket
Definition: header.h:121
@ c_bra
Definition: header.h:117
#define CAPACITY(p)
Definition: header.h:24
#define SET_SIZE(p, n)
Definition: header.h:22
#define HEAD
#define SNOWBALL_ERR
Definition: api.h:15
int af
Definition: api.h:18
int lb
Definition: api.h:17
symbol * p
Definition: api.h:16
int ket
Definition: api.h:17
int c
Definition: api.h:17
int bra
Definition: api.h:17
int l
Definition: api.h:17
Definition: header.h:256
struct amongvec * b
Definition: header.h:258
int substring_i
const symbol * s
int function
#define REPLACE_S(Z, B, K, SIZE, S)
Definition: utilities.cc:418
#define SNOWBALL_RETURN_OR_THROW(R, E)
Definition: utilities.cc:14
#define SLICE_CHECK(Z)
Definition: utilities.cc:437
int out_grouping_U(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:164
#define CREATE_SIZE
Definition: utilities.cc:25
int in_grouping_U(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:140
SNOWBALL_ERR insert_s(struct SN_env *z, int bra, int ket, int s_size, const symbol *s)
Definition: utilities.cc:471
int eq_v_b(struct SN_env *z, const symbol *p)
Definition: utilities.cc:252
SNOWBALL_ERR slice_from_s(struct SN_env *z, int s_size, const symbol *s)
Definition: utilities.cc:439
int in_grouping(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:190
SNOWBALL_ERR slice_to(struct SN_env *z, symbol **p)
Definition: utilities.cc:485
SNOWBALL_ERR assign_to(struct SN_env *z, symbol **p)
Definition: utilities.cc:498
SNOWBALL_ERR slice_from_v(struct SN_env *z, const symbol *p)
Definition: utilities.cc:446
int eq_s(struct SN_env *z, int s_size, const symbol *s)
Definition: utilities.cc:238
SNOWBALL_ERR insert_v(struct SN_env *z, int bra, int ket, const symbol *p)
Definition: utilities.cc:481
#define SNOWBALL_PROPAGATE_ERR(F)
Definition: utilities.cc:15
static SNOWBALL_ERR slice_check(struct SN_env *z)
Definition: utilities.cc:421
int eq_v(struct SN_env *z, const symbol *p)
Definition: utilities.cc:248
SNOWBALL_ERR replace_s(struct SN_env *z, int c_bra, int c_ket, int s_size, const symbol *s)
Definition: utilities.cc:396
void lose_s(symbol *p)
Definition: utilities.cc:38
static int get_utf8(const symbol *p, int c, int l, int *slot)
Definition: utilities.cc:94
symbol * create_s(void)
Definition: utilities.cc:27
int out_grouping_b(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:226
int find_among_b(struct SN_env *z, const struct among *v, int v_size, int(*call_among_func)(struct SN_env *))
Definition: utilities.cc:322
#define SNOWBALL_RETURN_OK
Definition: utilities.cc:13
int in_grouping_b_U(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:152
int skip_b_utf8(const symbol *p, int c, int limit, int n)
Definition: utilities.cc:75
int eq_s_b(struct SN_env *z, int s_size, const symbol *s)
Definition: utilities.cc:243
int out_grouping(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:214
SNOWBALL_ERR slice_del(struct SN_env *z)
Definition: utilities.cc:450
int out_grouping_b_U(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:176
int skip_utf8(const symbol *p, int c, int limit, int n)
Definition: utilities.cc:50
int in_grouping_b(struct SN_env *z, const unsigned char *s, int min, int max, int repeat)
Definition: utilities.cc:202
int len_utf8(const symbol *p)
Definition: utilities.cc:508
static int get_b_utf8(const symbol *p, int c, int lb, int *slot)
Definition: utilities.cc:116
static int increase_size(symbol **p, int n)
Definition: utilities.cc:380
int find_among(struct SN_env *z, const struct among *v, int v_size, int(*call_among_func)(struct SN_env *))
Definition: utilities.cc:256