queryparser/termgenerator_internal.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include "termgenerator_internal.h"
00024 
00025 #include <xapian/document.h>
00026 #include <xapian/queryparser.h>
00027 #include <xapian/unicode.h>
00028 
00029 #include "stringutils.h"
00030 
00031 #include <string>
00032 
00033 using namespace std;
00034 
00035 namespace Xapian {
00036 
// Longest term we are prepared to index.  Capping the length helps to stop
// useless junk terms from bloating the index.
//
// FIXME: the limit is measured in bytes of the UTF-8 representation rather
// than in Unicode characters - which of the two actually makes most sense?
static const unsigned MAX_PROB_TERM_LENGTH = 64;
00042 
00043 // FIXME: Add API to allow control of how stemming is used?
00044 
00045 inline bool
00046 U_isupper(unsigned ch) {
00047     return (ch < 128 && C_isupper((unsigned char)ch));
00048 }
00049 
00050 inline unsigned check_wordchar(unsigned ch) {
00051     if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
00052     return 0;
00053 }
00054 
00055 inline bool
00056 should_stem(const std::string & term)
00057 {
00058     const unsigned int SHOULD_STEM_MASK =
00059         (1 << Unicode::LOWERCASE_LETTER) |
00060         (1 << Unicode::TITLECASE_LETTER) |
00061         (1 << Unicode::MODIFIER_LETTER) |
00062         (1 << Unicode::OTHER_LETTER);
00063     Utf8Iterator u(term);
00064     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
00065 }
00066 
// Check whether ch may appear inside a term between two word characters.
//
// Returns the character to append to the term (which may differ from ch), or
// 0 if ch is not an acceptable infix character.
inline unsigned check_infix(unsigned ch) {
    switch (ch) {
        // Unicode includes all of these except '&' in its word boundary
        // rules, as well as 0x2019 (handled below) and ':' (for Swedish
        // apparently, but we ignore this for now as it's problematic in
        // real world cases).
        case '\'':
        case '&':
        case 0xb7:
        case 0x5f4:
        case 0x2027:
            return ch;
        // 0x2019 is Unicode apostrophe and single closing quote.
        // 0x201b is Unicode single opening quote with the tail rising.
        // Both are folded to a plain ASCII apostrophe.
        case 0x2019:
        case 0x201b:
            return '\'';
        default:
            return 0;
    }
}
00080 
// Check whether ch may appear inside a term between two digits.
//
// The accepted characters come from Unicode's word identifying algorithm.
// Returns ch itself if it is acceptable, otherwise 0.
inline unsigned check_infix_digit(unsigned ch) {
    // Deal with the ASCII candidates first.
    if (ch < 0x80) {
        return (ch == ',' || ch == '.' || ch == ';') ? ch : 0;
    }
    switch (ch) {
        case 0x037e: // GREEK QUESTION MARK
        case 0x0589: // ARMENIAN FULL STOP
        case 0x060D: // ARABIC DATE SEPARATOR
        case 0x07F8: // NKO COMMA
        case 0x2044: // FRACTION SLASH
        case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
        case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
        case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
            return ch;
        default:
            return 0;
    }
}
00099 
00100 inline bool
00101 is_digit(unsigned ch) {
00102     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
00103 }
00104 
// Check whether ch may be attached to the end of a term.
//
// Returns ch itself for '+' and '#', otherwise 0.
inline unsigned check_suffix(unsigned ch) {
    switch (ch) {
        case '+':
        case '#':
            return ch;
        default:
            // FIXME: what about '-'?
            return 0;
    }
}
00110 
// How stopwords are treated while indexing.  These are enumerators rather
// than preprocessor macros so that the names obey normal scoping rules and
// are visible to a debugger.
//
// FIXME: add API for this:
enum {
    // Stopwords are not consulted at all.
    STOPWORDS_NONE = 0,
    // Stopwords are skipped entirely.
    STOPWORDS_IGNORE = 1,
    // Stopwords are indexed unstemmed, but no stemmed form is added.
    STOPWORDS_INDEX_UNSTEMMED_ONLY = 2
};
00115 
00116 void
00117 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
00118                                     const string & prefix, bool with_positions)
00119 {
00120     int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
00121 
00122     if (!stopper) stop_mode = STOPWORDS_NONE;
00123 
00124     while (true) {
00125         // Advance to the start of the next term.
00126         unsigned ch;
00127         while (true) {
00128             if (itor == Utf8Iterator()) return;
00129             ch = check_wordchar(*itor);
00130             if (ch) break;
00131             ++itor;
00132         }
00133 
00134         string term;
00135         // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
00136         // Don't worry if there's a trailing '.' or not.
00137         if (U_isupper(*itor)) {
00138             const Utf8Iterator end;
00139             Utf8Iterator p = itor;
00140             do {
00141                 Unicode::append_utf8(term, Unicode::tolower(*p++));
00142             } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
00143             // One letter does not make an acronym!  If we handled a single
00144             // uppercase letter here, we wouldn't catch M&S below.
00145             if (term.size() > 1) {
00146                 // Check there's not a (lower case) letter or digit
00147                 // immediately after it.
00148                 if (p == end || !Unicode::is_wordchar(*p)) {
00149                     itor = p;
00150                     goto endofterm;
00151                 }
00152             }
00153             term.resize(0);
00154         }
00155 
00156         while (true) {
00157             unsigned prevch;
00158             do {
00159                 Unicode::append_utf8(term, ch);
00160                 prevch = ch;
00161                 if (++itor == Utf8Iterator()) goto endofterm;
00162                 ch = check_wordchar(*itor);
00163             } while (ch);
00164 
00165             Utf8Iterator next(itor);
00166             ++next;
00167             if (next == Utf8Iterator()) break;
00168             unsigned nextch = check_wordchar(*next);
00169             if (!nextch) break;
00170             unsigned infix_ch = *itor;
00171             if (is_digit(prevch) && is_digit(*next)) {
00172                 infix_ch = check_infix_digit(infix_ch);
00173             } else {
00174                 // Handle things like '&' in AT&T, apostrophes, etc.
00175                 infix_ch = check_infix(infix_ch);
00176             }
00177             if (!infix_ch) break;
00178             Unicode::append_utf8(term, infix_ch);
00179             ch = nextch;
00180             itor = next;
00181         }
00182 
00183         {
00184             size_t len = term.size();
00185             unsigned count = 0;
00186             while ((ch = check_suffix(*itor))) {
00187                 if (++count > 3) {
00188                     term.resize(len);
00189                     break;
00190                 }
00191                 Unicode::append_utf8(term, ch);
00192                 if (++itor == Utf8Iterator()) goto endofterm;
00193             }
00194             // Don't index fish+chips as fish+ chips.
00195             if (Unicode::is_wordchar(*itor))
00196                 term.resize(len);
00197         }
00198 
00199 endofterm:
00200         if (term.size() > MAX_PROB_TERM_LENGTH) continue;
00201 
00202         if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
00203 
00204         if (with_positions) {
00205             doc.add_posting(prefix + term, ++termpos, weight);
00206         } else {
00207             doc.add_term(prefix + term, weight);
00208         }
00209         if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
00210 
00211         if (!stemmer.internal.get()) continue;
00212 
00213         if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
00214             continue;
00215 
00216         // Note, this uses the lowercased term, but that's OK as we only
00217         // want to avoid stemming terms starting with a digit.
00218         if (!should_stem(term)) continue;
00219 
00220         // Add stemmed form without positional information.
00221         string stem("Z");
00222         stem += prefix;
00223         stem += stemmer(term);
00224         doc.add_term(stem, weight);
00225     }
00226 }
00227 
00228 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.