languages/steminternal.cc

Go to the documentation of this file.
00001 
00004 /* Derived from snowball's runtime/api.c:
00005  *
00006  * Copyright (c) 2001, Dr Martin Porter
00007  * Copyright (c) 2004,2005, Richard Boulton
00008  * Copyright (c) 2006,2007,2009 Olly Betts
00009  * All rights reserved.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions are met:
00013  *
00014  *   * Redistributions of source code must retain the above copyright notice,
00015  *     this list of conditions and the following disclaimer.
00016  *
00017  *   * Redistributions in binary form must reproduce the above copyright
00018  *     notice, this list of conditions and the following disclaimer in the
00019  *     documentation and/or other materials provided with the distribution.
00020  *
00021  *   * Neither the name of the <ORGANIZATION> nor the names of its contributors
00022  *     may be used to endorse or promote products derived from this software
00023  *     without specific prior written permission.
00024  *
00025  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00026  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00028  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00029  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00030  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00031  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00032  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00033  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00034  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00035  * POSSIBILITY OF SUCH DAMAGE.
00036  */
00037 /* Copyright (C) 2007 Olly Betts
00038  *
00039  * This program is free software; you can redistribute it and/or
00040  * modify it under the terms of the GNU General Public License as
00041  * published by the Free Software Foundation; either version 2 of the
00042  * License, or (at your option) any later version.
00043  *
00044  * This program is distributed in the hope that it will be useful,
00045  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00046  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00047  * GNU General Public License for more details.
00048  *
00049  * You should have received a copy of the GNU General Public License
00050  * along with this program; if not, write to the Free Software
00051  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00052  */
00053 
00054 #include <config.h>
00055 
00056 #include <xapian/error.h>
00057 
00058 #include "omassert.h"
00059 #include "steminternal.h"
00060 
00061 #if 0
00062 #include <stdio.h>
00063 #endif
00064 #include <stdlib.h>
00065 #include <string.h>
00066 
00067 #include <string>
00068 
00069 using namespace std;
00070 
00071 #define CREATE_SIZE 16
00072 
00073 extern symbol * create_s() {
00074     void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
00075     if (mem == NULL) throw std::bad_alloc();
00076     symbol * p = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00077     SET_CAPACITY(p, CREATE_SIZE);
00078     SET_SIZE(p, CREATE_SIZE);
00079     return p;
00080 }
00081 
00082 /*
00083    new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
00084    if n +ve, or n characters backwards from p + c - 1 if n -ve. new_c is the new
00085    value of the cursor c, or -1 on failure.
00086 
00087    -- used to implement hop and next in the utf8 case.
00088 */
00089 
00090 extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
00091     if (n >= 0) {
00092         for (; n > 0; n--) {
00093             if (c >= l) return -1;
00094             if (p[c++] >= 0xC0) {   /* 1100 0000 */
00095                 while (c < l) {
00096                     /* break unless p[c] is 10------ */
00097                     if (p[c] >> 6 != 2) break;
00098                     c++;
00099                 }
00100             }
00101         }
00102     } else {
00103         for (; n < 0; n++) {
00104             if (c <= lb) return -1;
00105             if (p[--c] >= 0x80) {   /* 1000 0000 */
00106                 while (c > lb) {
00107                     if (p[c] >= 0xC0) break; /* 1100 0000 */
00108                     c--;
00109                 }
00110             }
00111         }
00112     }
00113     return c;
00114 }
00115 
00116 
00117 /* Increase the size of the buffer pointed to by p to at least n symbols.
00118  * If insufficient memory, throw std::bad_alloc().
00119  */
00120 static symbol * increase_size(symbol * p, int n) {
00121     int new_size = n + 20;
00122     void * mem = realloc(reinterpret_cast<char *>(p) - HEAD,
00123                          HEAD + (new_size + 1) * sizeof(symbol));
00124     if (mem == NULL) {
00125         throw std::bad_alloc();
00126     }
00127     symbol * q = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
00128     SET_CAPACITY(q, new_size);
00129     return q;
00130 }
00131 
00132 namespace Xapian {
00133 
00134 Stem::Internal::~Internal()
00135 {
00136     lose_s(p);
00137 }
00138 
00139 string
00140 Stem::Internal::operator()(const string & word)
00141 {
00142     const symbol * s = reinterpret_cast<const symbol *>(word.data());
00143     replace_s(0, l, word.size(), s);
00144     c = 0;
00145     if (stem() < 0) {
00146         // FIXME: Is there a better choice of exception class?
00147         throw Xapian::InternalError("stemming exception!");
00148     }
00149     return string(reinterpret_cast<const char *>(p), l);
00150 }
00151 
00152 /* Code for character groupings: utf8 cases */
00153 
00154 int Stem::Internal::get_utf8(int * slot) {
00155     int b0, b1;
00156     int tmp = c;
00157     if (tmp >= l) return 0;
00158     b0 = p[tmp++];
00159     if (b0 < 0xC0 || tmp == l) {   /* 1100 0000 */
00160         * slot = b0; return 1;
00161     }
00162     b1 = p[tmp++];
00163     if (b0 < 0xE0 || tmp == l) {   /* 1110 0000 */
00164         * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
00165     }
00166     * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
00167 }
00168 
00169 int Stem::Internal::get_b_utf8(int * slot) {
00170     int b0, b1;
00171     int tmp = c;
00172     if (tmp <= lb) return 0;
00173     b0 = p[--tmp];
00174     if (b0 < 0x80 || tmp == lb) {   /* 1000 0000 */
00175         * slot = b0; return 1;
00176     }
00177     b1 = p[--tmp];
00178     if (b1 >= 0xC0 || tmp == lb) {   /* 1100 0000 */
00179         * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
00180     }
00181     * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
00182 }
00183 
00184 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00185     do {
00186         int ch;
00187         int w = get_utf8(&ch);
00188         if (!w) return -1;
00189         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00190             return w;
00191         c += w;
00192     } while (repeat);
00193     return 0;
00194 }
00195 
00196 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00197     do {
00198         int ch;
00199         int w = get_b_utf8(&ch);
00200         if (!w) return -1;
00201         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00202             return w;
00203         c -= w;
00204     } while (repeat);
00205     return 0;
00206 }
00207 
00208 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00209     do {
00210         int ch;
00211         int w = get_utf8(&ch);
00212         if (!w) return -1;
00213         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00214             /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w;
00215         c += w;
00216     } while (repeat);
00217     return 0;
00218 }
00219 
00220 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00221     do {
00222         int ch;
00223         int w = get_b_utf8(&ch);
00224         if (!w) return -1;
00225         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00226             return w;
00227         c -= w;
00228     } while (repeat);
00229     return 0;
00230 }
00231 
00232 int Stem::Internal::eq_s(int s_size, const symbol * s) {
00233     if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
00234         return 0;
00235     c += s_size;
00236     return 1;
00237 }
00238 
00239 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
00240     if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
00241         return 0;
00242     c -= s_size;
00243     return 1;
00244 }
00245 
00246 int
00247 Stem::Internal::find_among(const symbol * pool, const struct among * v,
00248                            int v_size, const unsigned char * fnum,
00249                            const among_function * f)
00250 {
00251     int i = 0;
00252     int j = v_size;
00253 
00254     const symbol * q = p + c;
00255     int c_orig = c;
00256 
00257     int common_i = 0;
00258     int common_j = 0;
00259 
00260     int first_key_inspected = 0;
00261 
00262     while (1) {
00263         int k = i + ((j - i) >> 1);
00264         int diff = 0;
00265         int common = common_i < common_j ? common_i : common_j; /* smaller */
00266         const struct among * w = v + k;
00267         for (int x = common; x < w->s_size; x++) {
00268             if (c_orig + common == l) { diff = -1; break; }
00269             diff = q[common] - (pool + w->s)[x];
00270             if (diff != 0) break;
00271             common++;
00272         }
00273         if (diff < 0) { j = k; common_j = common; }
00274                  else { i = k; common_i = common; }
00275         if (j - i <= 1) {
00276             if (i > 0) break; /* v->s has been inspected */
00277             if (j == i) break; /* only one item in v */
00278 
00279             /* - but now we need to go round once more to get
00280                v->s inspected. This looks messy, but is actually
00281                the optimal approach.  */
00282 
00283             if (first_key_inspected) break;
00284             first_key_inspected = 1;
00285         }
00286     }
00287     while (1) {
00288         const struct among * w = v + i;
00289         if (common_i >= w->s_size) {
00290             c = c_orig + w->s_size;
00291             if (!fnum || !fnum[i]) return w->result;
00292             {
00293                 int res = f[fnum[i] - 1](this);
00294                 c = c_orig + w->s_size;
00295                 if (res) return w->result;
00296             }
00297         }
00298         i = w->substring_i;
00299         if (i < 0) return 0;
00300     }
00301 }
00302 
00303 /* find_among_b is for backwards processing. Same comments apply */
00304 int
00305 Stem::Internal::find_among_b(const symbol * pool, const struct among * v,
00306                              int v_size, const unsigned char * fnum,
00307                              const among_function * f)
00308 {
00309     int i = 0;
00310     int j = v_size;
00311 
00312     const symbol * q = p + c - 1;
00313     int c_orig = c;
00314 
00315     int common_i = 0;
00316     int common_j = 0;
00317 
00318     int first_key_inspected = 0;
00319 
00320     while (1) {
00321         int k = i + ((j - i) >> 1);
00322         int diff = 0;
00323         int common = common_i < common_j ? common_i : common_j;
00324         const struct among * w = v + k;
00325         for (int x = w->s_size - 1 - common; x >= 0; x--) {
00326             if (c_orig - common == lb) { diff = -1; break; }
00327             diff = q[- common] - (pool + w->s)[x];
00328             if (diff != 0) break;
00329             common++;
00330         }
00331         if (diff < 0) { j = k; common_j = common; }
00332                  else { i = k; common_i = common; }
00333         if (j - i <= 1) {
00334             if (i > 0) break;
00335             if (j == i) break;
00336             if (first_key_inspected) break;
00337             first_key_inspected = 1;
00338         }
00339     }
00340     while (1) {
00341         const struct among * w = v + i;
00342         if (common_i >= w->s_size) {
00343             c = c_orig - w->s_size;
00344             if (!fnum || !fnum[i]) return w->result;
00345             {
00346                 int res = f[fnum[i] - 1](this);
00347                 c = c_orig - w->s_size;
00348                 if (res) return w->result;
00349             }
00350         }
00351         i = w->substring_i;
00352         if (i < 0) return 0;
00353     }
00354 }
00355 
00356 int
00357 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
00358 {
00359     int adjustment;
00360     int len;
00361     Assert(p);
00362     adjustment = s_size - (c_ket - c_bra);
00363     len = SIZE(p);
00364     if (adjustment != 0) {
00365         if (adjustment + len > CAPACITY(p)) {
00366             p = increase_size(p, adjustment + len);
00367         }
00368         memmove(p + c_ket + adjustment,
00369                 p + c_ket,
00370                 (len - c_ket) * sizeof(symbol));
00371         SET_SIZE(p, adjustment + len);
00372         l += adjustment;
00373         if (c >= c_ket)
00374             c += adjustment;
00375         else
00376             if (c > c_bra)
00377                 c = c_bra;
00378     }
00379     if (s_size != 0) memmove(p + c_bra, s, s_size * sizeof(symbol));
00380     return adjustment;
00381 }
00382 
00383 int Stem::Internal::slice_check() {
00384     Assert(p);
00385     if (bra < 0 || bra > ket || ket > l) {
00386 #if 0
00387         fprintf(stderr, "faulty slice operation:\n");
00388         debug(z, -1, 0);
00389 #endif
00390         return -1;
00391     }
00392     return 0;
00393 }
00394 
00395 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
00396     if (slice_check()) return -1;
00397     replace_s(bra, ket, s_size, s);
00398     return 0;
00399 }
00400 
00401 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
00402     int adjustment = replace_s(c_bra, c_ket, s_size, s);
00403     if (c_bra <= bra) bra += adjustment;
00404     if (c_bra <= ket) ket += adjustment;
00405 }
00406 
00407 symbol * Stem::Internal::slice_to(symbol * v) {
00408     if (slice_check()) return NULL;
00409     {
00410         int len = ket - bra;
00411         if (CAPACITY(v) < len) {
00412             v = increase_size(v, len);
00413         }
00414         memmove(v, p + bra, len * sizeof(symbol));
00415         SET_SIZE(v, len);
00416     }
00417     return v;
00418 }
00419 
00420 symbol * Stem::Internal::assign_to(symbol * v) {
00421     int len = l;
00422     if (CAPACITY(v) < len) {
00423         v = increase_size(v, len);
00424     }
00425     memmove(v, p, len * sizeof(symbol));
00426     SET_SIZE(v, len);
00427     return v;
00428 }
00429 
00430 #if 0
00431 void Stem::Internal::debug(int number, int line_count) {
00432     int i;
00433     int limit = SIZE(p);
00434     /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
00435     if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
00436     for (i = 0; i <= limit; i++) {
00437         if (lb == i) printf("{");
00438         if (bra == i) printf("[");
00439         if (c == i) printf("|");
00440         if (ket == i) printf("]");
00441         if (l == i) printf("}");
00442         if (i < limit)
00443         {   int ch = p[i];
00444             if (ch == 0) ch = '#';
00445             printf("%c", ch);
00446         }
00447     }
00448     printf("'\n");
00449 }
00450 #endif
00451 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.