sourcedoc/html/termgenerator__internal_8cc_source.html

 /* Copyright (C) 2007-2023 Olly Betts
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */

 #include <config.h>

 #include "termgenerator_internal.h"

 #include "api/omenquireinternal.h"
 #include "api/queryinternal.h"

 #include <xapian/document.h>
 #include <xapian/queryparser.h>
 #include <xapian/stem.h>
 #include <xapian/unicode.h>

 #include "stringutils.h"

 #include <algorithm>
 #include <cmath>
 #include <deque>
 #include <limits>
 #include <list>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include "word-breaker.h"

 using namespace std;

 namespace Xapian {

 /// Test a Unicode code point with C_isupper(), treating everything outside
 /// the ASCII range as not upper case.
 ///
 /// The range check comes first so the narrowing cast below never loses
 /// information.
 static inline bool
 U_isupper(unsigned ch)
 {
     if (ch >= 128) return false;
     const unsigned char ascii = static_cast<unsigned char>(ch);
     return C_isupper(ascii);
 }

 /// Map a word character to its lower case form.
 ///
 /// @return Unicode::tolower(ch) if Unicode::is_wordchar() accepts @a ch,
 ///         otherwise 0.
 static inline unsigned
 check_wordchar(unsigned ch)
 {
     return Unicode::is_wordchar(ch) ? Unicode::tolower(ch) : 0;
 }

 static inline bool
 should_stem(const std::string & term)
 {
     const unsigned int SHOULD_STEM_MASK =
         (1 << Unicode::LOWERCASE_LETTER) |
         (1 << Unicode::TITLECASE_LETTER) |
         (1 << Unicode::MODIFIER_LETTER) |
         (1 << Unicode::OTHER_LETTER);
     Utf8Iterator u(term);
     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
 }

 // Sentinel returned by check_infix() and check_infix_digit() for characters
 // which parse_terms() skips over without appending them to the term and
 // without ending the term.
 static const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();

 /// Classify a character found between two word characters.
 ///
 /// @return The character to insert into the term, UNICODE_IGNORE if the
 ///         character should be skipped without ending the term, or 0 if it
 ///         isn't an infix character.
 static inline unsigned
 check_infix(unsigned ch)
 {
     switch (ch) {
         // Unicode includes all these except '&' in its word boundary rules,
         // as well as 0x2019 (which we handle below) and ':' (for Swedish
         // apparently, but we ignore this for now as it's problematic in
         // real world cases).
         case '\'':
         case '&':
         case 0xb7:
         case 0x5f4:
         case 0x2027:
             return ch;

         // 0x2019 is Unicode apostrophe and single closing quote.
         // 0x201b is Unicode single opening quote with the tail rising.
         // Both are normalised to an ASCII apostrophe.
         case 0x2019:
         case 0x201b:
             return '\'';

         // These code points are dropped from the term.
         case 0x200b:
         case 0x200c:
         case 0x200d:
         case 0x2060:
         case 0xfeff:
             return UNICODE_IGNORE;
     }
     return 0;
 }

 /// Classify a character found between two decimal digits.
 ///
 /// @return @a ch if it may join the digits either side of it into a single
 ///         term, UNICODE_IGNORE if it should be skipped without ending the
 ///         term, or 0 otherwise.
 static inline unsigned
 check_infix_digit(unsigned ch)
 {
     // This list of characters comes from Unicode's word identifying algorithm.
     static const unsigned digit_infixes[] = {
         ',',
         '.',
         ';',
         0x037e, // GREEK QUESTION MARK
         0x0589, // ARMENIAN FULL STOP
         0x060D, // ARABIC DATE SEPARATOR
         0x07F8, // NKO COMMA
         0x2044, // FRACTION SLASH
         0xFE10, // PRESENTATION FORM FOR VERTICAL COMMA
         0xFE13, // PRESENTATION FORM FOR VERTICAL COLON
         0xFE14  // PRESENTATION FORM FOR VERTICAL SEMICOLON
     };
     for (unsigned candidate : digit_infixes) {
         if (ch == candidate) return ch;
     }

     const bool ignorable =
         (ch >= 0x200b && ch <= 0x200d) || ch == 0x2060 || ch == 0xfeff;
     return ignorable ? UNICODE_IGNORE : 0;
 }

 /// Test whether @a ch is in the Unicode::DECIMAL_DIGIT_NUMBER category.
 static inline bool
 is_digit(unsigned ch)
 {
     const auto category = Unicode::get_category(ch);
     return category == Unicode::DECIMAL_DIGIT_NUMBER;
 }

 /// Test whether @a ch may be kept as a suffix on a term.
 ///
 /// @return @a ch for '+' and '#', otherwise 0.
 static inline unsigned
 check_suffix(unsigned ch)
 {
     switch (ch) {
         case '+':
         case '#':
             return ch;
         default:
             // FIXME: what about '-'?
             return 0;
     }
 }

 /** Split UTF-8 text into terms and hand each one to a callback.
  *
  *  @param itor            Iterator over the text to split.
  *  @param try_word_break  If true, text for which is_unbroken_script()
  *                         returns true is split using NgramIterator rather
  *                         than by the word rules used for other text.
  *  @param with_positions  Passed to @a action as its second argument for
  *                         terms produced by the word rules; for n-grams it
  *                         is and-ed with NgramIterator::unigram().
  *  @param action          Callable invoked as action(term, positional, it)
  *                         where @a it is the parse position reached once
  *                         the term has been read.  Parsing stops as soon as
  *                         it returns false.
  */
 template<typename ACTION>
 static void
 parse_terms(Utf8Iterator itor, bool try_word_break, bool with_positions,
             ACTION action)
 {
     while (true) {
         // Advance to the start of the next term.
         unsigned ch;
         while (true) {
             if (itor == Utf8Iterator()) return;
             ch = check_wordchar(*itor);
             if (ch) break;
             ++itor;
         }

         string term;
         // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
         // Don't worry if there's a trailing '.' or not.
         if (U_isupper(*itor)) {
             const Utf8Iterator end;
             Utf8Iterator p = itor;
             // Scan ahead with a copy so `itor` is untouched if this turns out
             // not to be an acronym.
             do {
                 Unicode::append_utf8(term, Unicode::tolower(*p++));
             } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
             // One letter does not make an acronym!  If we handled a single
             // uppercase letter here, we wouldn't catch M&S below.
             if (term.size() > 1) {
                 // Check there's not a (lower case) letter or digit
                 // immediately after it.
                 if (p == end || !Unicode::is_wordchar(*p)) {
                     itor = p;
                     goto endofterm;
                 }
             }
             // Not an acronym - discard what we collected and parse normally.
             term.resize(0);
         }

         while (true) {
             if (try_word_break &&
                 is_unbroken_script(*itor) &&
                 Unicode::is_wordchar(*itor)) {
                 // Emit every n-gram NgramIterator produces for this span.
                 NgramIterator tk(itor);
                 while (tk != NgramIterator()) {
                     const string& ngram = *tk;
                     if (!action(ngram, with_positions && tk.unigram(),
                                 tk.get_utf8iterator()))
                         return;
                     ++tk;
                 }
                 // Update itor to end of CJK text span.
                 itor = tk.get_utf8iterator();
                 // Skip to the next word character, as at the top of the
                 // outer loop.
                 while (true) {
                     if (itor == Utf8Iterator()) return;
                     ch = check_wordchar(*itor);
                     if (ch) break;
                     ++itor;
                 }
                 continue;
             }
             // Append consecutive word characters to the term, stopping at
             // the end of the text or at the start of unbroken-script text.
             unsigned prevch;
             do {
                 Unicode::append_utf8(term, ch);
                 prevch = ch;
                 if (++itor == Utf8Iterator() ||
                     (try_word_break && is_unbroken_script(*itor)))
                     goto endofterm;
                 ch = check_wordchar(*itor);
             } while (ch);

             // `itor` is at a non-word character.  It only continues the term
             // if it is an infix character and a word character follows it.
             Utf8Iterator next(itor);
             ++next;
             if (next == Utf8Iterator()) break;
             unsigned nextch = check_wordchar(*next);
             if (!nextch) break;
             unsigned infix_ch = *itor;
             if (is_digit(prevch) && is_digit(*next)) {
                 infix_ch = check_infix_digit(infix_ch);
             } else {
                 // Handle things like '&' in AT&T, apostrophes, etc.
                 infix_ch = check_infix(infix_ch);
             }
             if (!infix_ch) break;
             // UNICODE_IGNORE means skip the character but carry on with the
             // same term.
             if (infix_ch != UNICODE_IGNORE)
                 Unicode::append_utf8(term, infix_ch);
             ch = nextch;
             itor = next;
         }

         {
             // Keep up to three suffix characters accepted by check_suffix();
             // if there are more than three, none are kept.
             size_t len = term.size();
             unsigned count = 0;
             while ((ch = check_suffix(*itor))) {
                 if (++count > 3) {
                     term.resize(len);
                     break;
                 }
                 Unicode::append_utf8(term, ch);
                 if (++itor == Utf8Iterator()) goto endofterm;
             }
             // Don't index fish+chips as fish+ chips.
             if (Unicode::is_wordchar(*itor))
                 term.resize(len);
         }

 endofterm:
         if (!action(term, with_positions, itor))
             return;
     }
 }

 /** Index text into the current document.
  *
  *  @param itor            Iterator over the UTF-8 text to index.
  *  @param wdf_inc         wdf increment passed on for every term added.
  *  @param prefix          Term prefix to put in front of each term.
  *  @param with_positions  If true, add positional postings rather than
  *                         plain terms where the stemming strategy allows.
  */
 void
 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
                                     const string & prefix, bool with_positions)
 {
     const bool try_word_break = (flags & FLAG_NGRAMS) || is_ngram_enabled();

     // Without a stopper there's nothing to stop, whatever stop_mode says.
     const stop_strategy current_stop_mode =
         stopper.get() ? stop_mode : TermGenerator::STOP_NONE;

     parse_terms(itor, try_word_break, with_positions,
         [=
 #if __cplusplus >= 201907L
 // C++20 no longer supports implicit `this` in lambdas but older C++ versions
 // don't allow `this` here.
         , this
 #endif
         ](const string & term, bool positional, const Utf8Iterator &) {
             // Over-long terms are skipped, but parsing carries on.
             if (term.size() > max_word_length) return true;

             if (current_stop_mode == TermGenerator::STOP_ALL &&
                 (*stopper)(term)) {
                 return true;
             }

             const bool stem_some =
                 (strategy == TermGenerator::STEM_SOME ||
                  strategy == TermGenerator::STEM_SOME_FULL_POS);

             // These strategies index the unstemmed form.
             if (stem_some || strategy == TermGenerator::STEM_NONE) {
                 const string unstemmed_term = prefix + term;
                 if (positional) {
                     doc.add_posting(unstemmed_term, ++cur_pos, wdf_inc);
                 } else {
                     doc.add_term(unstemmed_term, wdf_inc);
                 }
             }

             // MSVC seems to need "this->" on member variables in this
             // situation.
             if ((this->flags & FLAG_SPELLING) && prefix.empty())
                 db.add_spelling(term);

             if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
                 return true;

             if (stem_some) {
                 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
                     (*stopper)(term))
                     return true;

                 // Note, this uses the lowercased term, but that's OK as we
                 // only want to avoid stemming terms starting with a digit.
                 if (!should_stem(term)) return true;
             }

             // Build the stemmed term: "Z" marker (except for STEM_ALL), then
             // the prefix, then the stem.
             const string& stem = stemmer(term);
             if (rare(stem.empty())) return true;
             string stemmed_term;
             if (strategy != TermGenerator::STEM_ALL) {
                 stemmed_term += "Z";
             }
             stemmed_term += prefix;
             stemmed_term += stem;

             const bool stem_gets_position =
                 (strategy != TermGenerator::STEM_SOME && with_positions);
             if (!stem_gets_position) {
                 doc.add_term(stemmed_term, wdf_inc);
                 return true;
             }

             // STEM_SOME_FULL_POS reuses the position already given to the
             // unstemmed form; other strategies advance to a new position.
             if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
             doc.add_posting(stemmed_term, cur_pos, wdf_inc);
             return true;
         });
 }

 // One term's entry in a SnipPipe window.
 struct Sniplet {
     Sniplet(double* r, size_t t, size_t h)
         : relevance{r}, term_end{t}, highlight{h} { }

     // Pointer to the relevance counted for this entry, or NULL if none.
     double* relevance;

     // Offset in the text of the end of this term.
     size_t term_end;

     // Number of terms to highlight starting at this one (0 for none).
     size_t highlight;
 };

 // Sliding window over the terms of a text which tracks the highest scoring
 // window seen, and then writes that window out as a snippet.
 class SnipPipe {
     // Terms in the window currently being scored.
     deque<Sniplet> pipe;

     // Terms of the best window which have already been shifted out of
     // `pipe`.
     deque<Sniplet> best_pipe;

     // Requested length for snippet.
     size_t length;

     // Position in text of start of current pipe contents.
     size_t begin{0};

     // Rolling sum of the current pipe contents.
     double sum{0.0};

     // Number of terms of the phrase being highlighted which drain() has
     // still to output (0 when not inside a highlight).
     size_t phrase_len{0};

   public:
     // Offset in the text where the best window starts.
     size_t best_begin{0};

     // Offset in the text where the best window ends.
     size_t best_end{0};

     // Score of the best window.
     double best_sum{0.0};

     // Add one to length to allow for inter-word space.
     // FIXME: We ought to correctly allow for multiple spaces.
     explicit SnipPipe(size_t length_) : length{length_ + 1} { }

     // Feed the next term into the window; returns false once no more terms
     // are needed.
     bool pump(double* r, size_t t, size_t h, unsigned flags);

     // Call after the last pump() and before the first drain().
     void done();

     // Append the next piece of the snippet to `output`; returns false once
     // the snippet is complete.
     bool drain(const string & input,
                const string & hi_start,
                const string & hi_end,
                const string & omit,
                string & output);
 };

 // Factor by which SnipPipe::pump() divides a term's relevance after counting
 // it towards the window sum, and multiplies it by again when the term leaves
 // the window.
 #define DECAY 2.0

 /** Feed the next term into the sliding window.
  *
  *  @param r      Pointer to the term's relevance, or NULL if it has none.
  *                The pointed-to value is divided by DECAY once it has been
  *                added to the window sum, and multiplied by DECAY again
  *                when the term leaves the window.
  *  @param t      Offset in the text of the end of the term.
  *  @param h      Highlight length: 0 for no highlight, 1 for a single term,
  *                greater than 1 when this term is the last of a phrase of
  *                @a h terms.
  *  @param flags  Snippet flags; only Xapian::MSet::SNIPPET_EXHAUSTIVE is
  *                checked here.
  *
  *  @return false if a best window has been found and the current window
  *          has moved past it (never when SNIPPET_EXHAUSTIVE is set);
  *          true otherwise.
  */
 inline bool
 SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
 {
     if (h > 1) {
         if (pipe.size() >= h - 1) {
             // The final term of a phrase is entering the window.  Peg the
             // phrase's relevance onto the first term of the phrase, so it'll
             // be removed from `sum` when the phrase starts to leave the
             // window.
             auto & phrase_start = pipe[pipe.size() - (h - 1)];
             if (phrase_start.relevance) {
                 // Undo the contribution the first term made on its own.
                 *phrase_start.relevance *= DECAY;
                 sum -= *phrase_start.relevance;
             }
             sum += *r;
             phrase_start.relevance = r;
             phrase_start.highlight = h;
             *r /= DECAY;
         }
         // The entry for this final term carries no relevance or highlight
         // of its own.
         r = NULL;
         h = 0;
     }
     pipe.emplace_back(r, t, h);
     if (r) {
         sum += *r;
         *r /= DECAY;
     }

     // If necessary, discard words from the start of the pipe until it has the
     // desired length.
     // FIXME: Also shrink the window past words with relevance < 0?
     while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
         const Sniplet& word = pipe.front();
         if (word.relevance) {
             *word.relevance *= DECAY;
             sum -= *word.relevance;
         }
         begin = word.term_end;
         // Keep the word if it may still be part of the best window.
         if (best_end >= begin)
             best_pipe.push_back(word);
         pipe.pop_front();
         // E.g. can happen if the current term is longer than the requested
         // length!
         if (rare(pipe.empty())) break;
     }

     // Using > here doesn't work well, as we don't extend a snippet over terms
     // with 0 weight.
     if (sum >= best_sum) {
         // Discard any part of `best_pipe` which is before `begin`.
         if (begin >= best_end) {
             best_pipe.clear();
         } else {
             while (!best_pipe.empty() &&
                    best_pipe.front().term_end <= begin) {
                 best_pipe.pop_front();
             }
         }
         best_sum = sum;
         best_begin = begin;
         best_end = t;
     } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
         if (best_sum > 0 && best_end < begin) {
             // We found something, and we aren't still looking near it.
             // FIXME: Benchmark this and adjust if necessary.
             return false;
         }
     }
     return true;
 }

 // Called once all terms have been pumped: trims `pipe` so that it holds
 // nothing which lies after `best_end`.
 inline void
 SnipPipe::done()
 {
     // The current window starts at or after the end of the best window, so
     // nothing in `pipe` is wanted.
     if (begin >= best_end) {
         pipe.clear();
         return;
     }

     // Drop terms from the back which end after `best_end`.  We should never
     // empty the pipe here (as that case should be handled above).
     while (rare(!pipe.empty())) {
         if (pipe.back().term_end <= best_end) break;
         pipe.pop_back();
     }
 }

 // Check if a non-word character should be included at the start of the
 // snippet.  We want to include certain leading non-word characters, but not
 // others.
 static inline bool
 snippet_check_leading_nonwordchar(unsigned ch)
 {
     if (Unicode::is_currency(ch)) return true;

     // Opening brackets and opening quotes belong with the word after them.
     const auto category = Unicode::get_category(ch);
     if (category == Unicode::OPEN_PUNCTUATION ||
         category == Unicode::INITIAL_QUOTE_PUNCTUATION) {
         return true;
     }

     switch (ch) {
         case '"':
         case '#':
         case '%':
         case '&':
         case '\'':
         case '+':
         case '-':
         case '/':
         case '<':
         case '@':
         case '\\':
         case '`':
         case '~':
         case 0x00A1: // INVERTED EXCLAMATION MARK
         case 0x00A7: // SECTION SIGN
         case 0x00BF: // INVERTED QUESTION MARK
             return true;
         default:
             return false;
     }
 }

 // Check if a non-word character should be included at the end of the
 // snippet.  We want to include certain trailing non-word characters, but not
 // others.
 static inline bool
 snippet_check_trailing_nonwordchar(unsigned ch)
 {
     if (Unicode::is_currency(ch)) return true;

     // Closing brackets and closing quotes belong with the word before them.
     const auto category = Unicode::get_category(ch);
     if (category == Unicode::CLOSE_PUNCTUATION ||
         category == Unicode::FINAL_QUOTE_PUNCTUATION) {
         return true;
     }

     switch (ch) {
         case '"':
         case '%':
         case '\'':
         case '+':
         case '-':
         case '/':
         case '>':
         case '@':
         case '\\':
         case '`':
         case '~':
             return true;
         default:
             return false;
     }
 }

 /** Append the bytes in [p, end) to @a output, replacing '&', '<' and '>'
  *  with the entities "&amp;", "&lt;" and "&gt;".
  *
  *  Bytes which need no escaping are appended a run at a time.
  */
 static inline void
 append_escaping_xml(const char* p, const char* end, string& output)
 {
     // Start of the run of literal bytes not yet appended.
     const char* run = p;
     for (; p != end; ++p) {
         const char* entity;
         switch (*p) {
             case '&':
                 entity = "&amp;";
                 break;
             case '<':
                 entity = "&lt;";
                 break;
             case '>':
                 entity = "&gt;";
                 break;
             default:
                 // Ordinary byte - leave it in the pending run.
                 continue;
         }
         output.append(run, p);
         output += entity;
         run = p + 1;
     }
     output.append(run, end);
 }

 /** Append the next piece of the best snippet to @a output.
  *
  *  Intended to be called repeatedly until it returns false.  Each call
  *  which returns true has appended one term from the best window (together
  *  with the text in front of it).  The call which finds no terms left
  *  appends any trailing punctuation and/or @a omit and returns false.
  *
  *  @param input     The text the snippet is being taken from.
  *  @param hi_start  Inserted before each highlighted term or phrase.
  *  @param hi_end    Inserted after each highlighted term or phrase.
  *  @param omit      Inserted where the snippet doesn't appear to start or
  *                   end at a sentence boundary.
  *  @param output    The snippet is appended to this.
  *
  *  @return true if a term was appended, false once the snippet is complete.
  */
 inline bool
 SnipPipe::drain(const string & input,
                 const string & hi_start,
                 const string & hi_end,
                 const string & omit,
                 string & output)
 {
     // Once `best_pipe` has been used up, carry on with what's in `pipe`.
     if (best_pipe.empty() && !pipe.empty()) {
         swap(best_pipe, pipe);
     }

     if (best_pipe.empty()) {
         // No terms left - work out how to finish the snippet off.
         size_t tail_len = input.size() - best_end;
         if (tail_len == 0) return false;

         // See if this is the end of a sentence.
         // FIXME: This is quite simplistic - look at the Unicode rules:
         // https://unicode.org/reports/tr29/#Sentence_Boundaries
         bool sentence_end = false;
         Utf8Iterator i(input.data() + best_end, tail_len);
         while (i != Utf8Iterator()) {
             unsigned ch = *i;
             if (sentence_end && Unicode::is_whitespace(ch)) break;

             // Allow "...", "!!", "!?!", etc...
             sentence_end = (ch == '.' || ch == '?' || ch == '!');

             if (Unicode::is_wordchar(ch)) break;
             ++i;
         }

         if (sentence_end) {
             // Include end of sentence punctuation.
             append_escaping_xml(input.data() + best_end, i.raw(), output);
             return false;
         }

         // Include trailing punctuation which includes meaning or context.
         i.assign(input.data() + best_end, tail_len);
         int trailing_punc = 0;
         while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
             // But limit how much trailing punctuation we include.
             if (++trailing_punc > 4) {
                 // Too much - include none of it.
                 trailing_punc = 0;
                 break;
             }
             ++i;
         }
         if (trailing_punc) {
             append_escaping_xml(input.data() + best_end, i.raw(), output);
             // The punctuation ran to the end of the text, so nothing has
             // been omitted.
             if (i == Utf8Iterator()) return false;
         }

         // Append "..." or equivalent as this doesn't seem to be the start
         // of a sentence.
         output += omit;

         return false;
     }

     const Sniplet & word = best_pipe.front();

     if (output.empty()) {
         // Start of the snippet.
         enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

         // Scan the text in front of the first word, tracking whether it
         // contains sentence-ending punctuation followed by whitespace.
         Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
         while (i != Utf8Iterator()) {
             unsigned ch = *i;
             switch (sentence_boundary) {
                 case NO:
                     if (ch == '.' || ch == '?' || ch == '!') {
                         sentence_boundary = PUNC;
                     }
                     break;
                 case PUNC:
                     if (Unicode::is_whitespace(ch)) {
                         sentence_boundary = YES;
                     } else if (ch == '.' || ch == '?' || ch == '!') {
                         // Allow "...", "!!", "!?!", etc...
                     } else {
                         sentence_boundary = NO;
                     }
                     break;
                 case YES:
                     break;
             }

             // Start the snippet at the start of the first word, but include
             // certain punctuation too.
             if (Unicode::is_wordchar(ch)) {
                 // But limit how much leading punctuation we include.
                 size_t word_begin = i.raw() - input.data();
                 if (word_begin - best_begin > 4) {
                     best_begin = word_begin;
                 }
                 break;
             }
             ++i;
             // A character we don't want to lead with moves the start of the
             // snippet to just after it.
             if (!snippet_check_leading_nonwordchar(ch)) {
                 best_begin = i.raw() - input.data();
             }
         }

         // Add "..." or equivalent if this doesn't seem to be the start of a
         // sentence.
         if (sentence_boundary != YES) {
             output += omit;
         }
     }

     if (word.highlight) {
         // Don't include inter-word characters in the highlight.
         Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
         while (i != Utf8Iterator()) {
             unsigned ch = *i;
             if (Unicode::is_wordchar(ch)) {
                 append_escaping_xml(input.data() + best_begin, i.raw(), output);
                 best_begin = i.raw() - input.data();
                 break;
             }
             ++i;
         }
     }

     // Open a highlight unless we're already inside one.
     if (!phrase_len) {
         phrase_len = word.highlight;
         if (phrase_len) output += hi_start;
     }

     const char* p = input.data();
     append_escaping_xml(p + best_begin, p + word.term_end, output);
     best_begin = word.term_end;

     // Close the highlight after its last term.
     if (phrase_len && --phrase_len == 0) output += hi_end;

     best_pipe.pop_front();
     return true;
 }

 /** Walk a query tree collecting what snippet generation needs to match.
  *
  *  @param query           (Sub)query to examine.
  *  @param exact_phrases   Gains the list of terms of each tight OP_PHRASE
  *                         whose subqueries are all terms.
  *  @param loose_terms     Gains each leaf term, mapped to 0 if not already
  *                         present.
  *  @param wildcards       Gains the pattern of each OP_WILDCARD.
  *  @param longest_phrase  Raised to the length of the longest phrase added
  *                         to @a exact_phrases.
  */
 static void
 check_query(const Xapian::Query & query,
             list<vector<string>> & exact_phrases,
             unordered_map<string, double> & loose_terms,
             list<string> & wildcards,
             size_t & longest_phrase)
 {
     // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
     const size_t n_subqs = query.get_num_subqueries();
     const Xapian::Query::op op = query.get_type();

     if (op == query.LEAF_TERM) {
         const auto * qt =
             static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
         loose_terms.insert(make_pair(qt->get_term(), 0));
     } else if (op == query.OP_WILDCARD) {
         const auto * qw =
             static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
         wildcards.push_back(qw->get_pattern());
     } else if (op == query.OP_PHRASE) {
         const auto * phrase =
             static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());

         // Only a tight phrase made up entirely of terms is handled as an
         // exact phrase.
         bool tight_term_phrase = (phrase->get_window() == n_subqs);
         for (size_t i = 0; tight_term_phrase && i != n_subqs; ++i) {
             tight_term_phrase =
                 (query.get_subquery(i).get_type() == query.LEAF_TERM);
         }

         if (tight_term_phrase) {
             exact_phrases.emplace_back();
             vector<string> & terms = exact_phrases.back();
             terms.reserve(n_subqs);
             for (size_t i = 0; i != n_subqs; ++i) {
                 Xapian::Query q = query.get_subquery(i);
                 const auto * qt =
                     static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
                 terms.push_back(qt->get_term());
             }
             longest_phrase = max(longest_phrase, n_subqs);
             return;
         }
     }

     // Everything else: look inside any subqueries.
     for (size_t i = 0; i != n_subqs; ++i) {
         check_query(query.get_subquery(i), exact_phrases, loose_terms,
                     wildcards, longest_phrase);
     }
 }

 /** Look up the relevance of a term from the query.
  *
  *  The relevance is worked out from the term's weight the first time the
  *  term is seen and stored in @a loose_terms for later calls.
  *
  *  @param loose_terms  Query terms mapped to their relevance (0 meaning
  *                      not yet worked out).
  *  @param stats        Source of term weights.
  *  @param term         The term to look up.
  *  @param max_tw       Added to the term's weight to give its relevance.
  *
  *  @return Pointer to the relevance stored in @a loose_terms, or NULL if
  *          @a term isn't a query term or no weight is available for it (in
  *          which case it is also removed from @a loose_terms).
  */
 static double*
 check_term(unordered_map<string, double> & loose_terms,
            const Xapian::Weight::Internal * stats,
            const string & term,
            double max_tw)
 {
     const auto entry = loose_terms.find(term);
     if (entry == loose_terms.end()) return NULL;

     double & cached = entry->second;
     if (cached != 0.0) return &cached;

     double termweight;
     if (stats->get_termweight(term, termweight)) {
         cached = termweight + max_tw;
         return &cached;
     }

     // FIXME: Assert?
     loose_terms.erase(entry);
     return NULL;
 }

 /** Generate a snippet from @a text showing where the query matched.
  *
  *  The text is split into terms with parse_terms(), each term is scored
  *  against the query behind this MSet, and SnipPipe picks the best scoring
  *  window of roughly @a length bytes.
  *
  *  @param text      The text to take the snippet from.
  *  @param length    Requested snippet length.
  *  @param stemmer   Stemmer used to match terms in the text against stemmed
  *                   ("Z"-prefixed) query terms.
  *  @param flags     Bitwise-or of MSet::SNIPPET_* flags.  Those looked at
  *                   here are SNIPPET_CJK_NGRAM, SNIPPET_BACKGROUND_MODEL,
  *                   SNIPPET_EXHAUSTIVE (via SnipPipe::pump()) and
  *                   SNIPPET_EMPTY_WITHOUT_MATCH.
  *  @param hi_start  Inserted before each highlighted term or phrase.
  *  @param hi_end    Inserted after each highlighted term or phrase.
  *  @param omit      Inserted where text has been left out.
  *
  *  @return The snippet, or @a text unchanged if no highlighting is wanted
  *          and it already fits within @a length.
  */
 string
 MSet::Internal::snippet(const string & text,
                         size_t length,
                         const Xapian::Stem & stemmer,
                         unsigned flags,
                         const string & hi_start,
                         const string & hi_end,
                         const string & omit) const
 {
     if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
         // Too easy!
         return text;
     }

     // Honour the flag, otherwise fall back to is_ngram_enabled() in the same
     // way as TermGenerator::Internal::index_text() does.  (Falling back to
     // an unconditional `true` would make the flag meaningless.)
     const bool try_word_break =
         (flags & MSet::SNIPPET_CJK_NGRAM) || is_ngram_enabled();

     size_t term_start = 0;
     double min_tw = 0, max_tw = 0;
     if (stats) stats->get_max_termweight(min_tw, max_tw);
     if (max_tw == 0.0) {
         max_tw = 1.0;
     } else {
         // Scale up by (1 + 1/64) so that highlighting works better for terms
         // with termweight 0 (which happens for terms not in the database, and
         // also with some weighting schemes for terms which occur in almost all
         // documents.
         max_tw *= 1.015625;
     }

     Xapian::Query query;
     if (enquire.get()) {
         query = enquire->query;
     }
     SnipPipe snip(length);

     list<vector<string>> exact_phrases;
     unordered_map<string, double> loose_terms;
     list<string> wildcards;
     size_t longest_phrase = 0;
     check_query(query, exact_phrases, loose_terms,
                 wildcards, longest_phrase);

     // These vectors must be fully built before parsing starts, as pointers
     // to their elements are handed to SnipPipe below.
     vector<double> exact_phrases_relevance;
     exact_phrases_relevance.reserve(exact_phrases.size());
     for (auto&& terms : exact_phrases) {
         // FIXME: What relevance to use?
         exact_phrases_relevance.push_back(max_tw * terms.size());
     }

     // One entry per wildcard pattern, all with the same relevance.
     // FIXME: What relevance to use?
     vector<double> wildcards_relevance(wildcards.size(), max_tw + min_tw);

     // Background relevance is the same for a given MSet, so cache it
     // between calls to MSet::snippet() on the same object.
     unordered_map<string, double>& background = snippet_bg_relevance;

     // Ring buffer of the previous (longest_phrase - 1) terms, with the
     // oldest at index `phrase_next`.  Only needed if some phrase has more
     // than one term.
     vector<string> phrase;
     if (longest_phrase > 1) phrase.resize(longest_phrase - 1);
     size_t phrase_next = 0;
     bool matchfound = false;
     parse_terms(Utf8Iterator(text), try_word_break, true,
         [&](const string & term, bool positional, const Utf8Iterator & it) {
             // FIXME: Don't hardcode this here.
             const size_t max_word_length = 64;

             if (!positional) return true;
             if (term.size() > max_word_length) return true;

             // We get segments with any "inter-word" characters in front of
             // each word, e.g.:
             // [The][ cat][ sat][ on][ the][ mat]
             size_t term_end = text.size() - it.left();

             double* relevance = 0;
             size_t highlight = 0;
             if (stats) {
                 size_t i = 0;
                 for (auto&& terms : exact_phrases) {
                     if (term == terms.back()) {
                         // The terms before this one must match the most
                         // recent entries in the ring buffer, so for a phrase
                         // shorter than the longest one skip the oldest
                         // entries.
                         const size_t skip = longest_phrase - terms.size();
                         size_t n = terms.size() - 1;
                         bool match = true;
                         while (n--) {
                             const size_t slot =
                                 (n + skip + phrase_next) % (longest_phrase - 1);
                             if (terms[n] != phrase[slot]) {
                                 match = false;
                                 break;
                             }
                         }
                         if (match) {
                             // FIXME: Sort phrases, highest score first!
                             relevance = &exact_phrases_relevance[i];
                             highlight = terms.size();
                             goto relevance_done;
                         }
                     }
                     ++i;
                 }

                 relevance = check_term(loose_terms, stats, term, max_tw);
                 if (relevance) {
                     // Matched unstemmed term.
                     highlight = 1;
                     goto relevance_done;
                 }

                 string stem = "Z";
                 stem += stemmer(term);
                 relevance = check_term(loose_terms, stats, stem, max_tw);
                 if (relevance) {
                     // Matched stemmed term.
                     highlight = 1;
                     goto relevance_done;
                 }

                 // Check wildcards.
                 // FIXME: Sort wildcards, shortest pattern first or something?
                 i = 0;
                 for (auto&& pattern : wildcards) {
                     if (startswith(term, pattern)) {
                         relevance = &wildcards_relevance[i];
                         highlight = 1;
                         goto relevance_done;
                     }
                     ++i;
                 }

                 if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
                     // Background document model.
                     auto bgit = background.find(term);
                     if (bgit == background.end()) bgit = background.find(stem);
                     if (bgit == background.end()) {
                         Xapian::doccount tf = enquire->db.get_termfreq(term);
                         if (!tf) {
                             tf = enquire->db.get_termfreq(stem);
                         } else {
                             stem = term;
                         }
                         double r = 0.0;
                         if (tf) {
                             // Add one to avoid log(0) when a term indexes all
                             // documents.
                             Xapian::doccount num_docs = stats->collection_size + 1;
                             r = max_tw * log((num_docs - tf) / double(tf));
                             r /= (length + 1) * log(double(num_docs));
 #if 0
                             if (r <= 0) {
                                 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
                                 while (i != Utf8Iterator()) {
                                     if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
                                         r = max_tw * 0.05;
                                     }
                                 }
                             }
 #endif
                         }
                         bgit = background.emplace(make_pair(stem, r)).first;
                     }
                     relevance = &bgit->second;
                 }
             } else {
 #if 0
                 // In the absence of weight information, assume longer terms
                 // are more relevant, and that unstemmed matches are a bit more
                 // relevant than stemmed matches.
                 if (queryterms.find(term) != queryterms.end()) {
                     relevance = term.size() * 3;
                 } else {
                     string stem = "Z";
                     stem += stemmer(term);
                     if (queryterms.find(stem) != queryterms.end()) {
                         relevance = term.size() * 2;
                     }
                 }
 #endif
             }

             // FIXME: Allow Enquire without a DB set or an empty MSet() to be
             // used if you don't want the collection model?

 #if 0
             // FIXME: Punctuation should somehow be included in the model, but this
             // approach is problematic - we don't want the first word of a sentence
             // to be favoured when it's at the end of the window.

             // Give first word in each sentence a relevance boost.
             if (term_start == 0) {
                 relevance += 10;
             } else {
                 for (size_t i = term_start; i + term.size() < term_end; ++i) {
                     if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
                         relevance += 10;
                         break;
                     }
                 }
             }
 #endif

 relevance_done:
             // Record this term in the ring buffer.  With no multi-term
             // phrase the buffer is empty, and the modulus would be zero.
             if (longest_phrase > 1) {
                 phrase[phrase_next] = term;
                 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
             }

             if (highlight) matchfound = true;

             // pump() returns false once it has all it needs, which stops
             // the parse.
             if (!snip.pump(relevance, term_end, highlight, flags)) return false;

             term_start = term_end;
             return true;
         });

     snip.done();

     // Put together the snippet.
     string result;
     if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
         while (snip.drain(text, hi_start, hi_end, omit, result)) { }
     }

     return result;
 }

 }
unicode.h
Unicode and UTF-8 related classes and functions.

Xapian::Stem::is_none
bool is_none() const
Return true if this is a no-op stemmer.
Definition: stem.h:166

Xapian
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80

Xapian::Unicode::append_utf8
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:332

Xapian::Query::OP_WILDCARD
Wildcard expansion.
Definition: query.h:255

Xapian::MSet::SNIPPET_EXHAUSTIVE
Exhaustively evaluate candidate snippets in MSet::snippet().
Definition: mset.h:179

Xapian::Unicode::MODIFIER_LETTER
Letter, modifier (Lm)
Definition: unicode.h:225

Xapian::Query::get_subquery
const Query get_subquery(size_t n) const
Read a top level subquery.
Definition: query.cc:226

Xapian::check_suffix
static unsigned check_suffix(unsigned ch)
Definition: termgenerator_internal.cc:127

Xapian::Utf8Iterator::raw
const char * raw() const
Return the raw const char* pointer for the current position.
Definition: unicode.h:54

NgramIterator::get_utf8iterator
const Xapian::Utf8Iterator & get_utf8iterator() const
Definition: word-breaker.h:86

Xapian::SnipPipe::done
void done()
Definition: termgenerator_internal.cc:448

Xapian::Unicode::tolower
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Definition: unicode.h:376

Xapian::SnipPipe::drain
bool drain(const string &input, const string &hi_start, const string &hi_end, const string &omit, string &output)
Definition: termgenerator_internal.cc:544

Xapian::Unicode::OTHER_LETTER
Letter, other (Lo)
Definition: unicode.h:226

queryinternal.h
Xapian::Query internals.

Xapian::check_term
static double * check_term(unordered_map< string, double > &loose_terms, const Xapian::Weight::Internal *stats, const string &term, double max_tw)
Definition: termgenerator_internal.cc:732

Xapian::Utf8Iterator::left
size_t left() const
Return the number of bytes left in the iterator's buffer.
Definition: unicode.h:59

is_digit
bool is_digit(unsigned ch)
Definition: queryparser_internal.cc:114

Xapian::Stem
Class representing a stemming algorithm.
Definition: stem.h:62

U_isupper
bool U_isupper(unsigned ch)
Definition: queryparser_internal.cc:85

Xapian::Query::op
op
Query operators.
Definition: query.h:78

Xapian::Unicode::is_currency
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:371

Xapian::Unicode::DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
Definition: unicode.h:230

is_unbroken_script
bool is_unbroken_script(unsigned p)
Definition: word-breaker.cc:71

input
Definition: header.h:63

config.h

std
STL namespace.

Xapian::Internal::QueryPhrase
Definition: queryinternal.h:349

Xapian::Internal::QueryWildcard
Definition: queryinternal.h:406

Xapian::SnipPipe::best_pipe
deque< Sniplet > best_pipe
Definition: termgenerator_internal.cc:339

Xapian::Query::internal
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: query.h:49

stemmer
static Xapian::Stem stemmer
Definition: stemtest.cc:41

DECAY
#define DECAY
Definition: termgenerator_internal.cc:374

rare
#define rare(COND)
Definition: config.h:565

Xapian::check_query
static void check_query(const Xapian::Query &query, list< vector< string >> &exact_phrases, unordered_map< string, double > &loose_terms, list< string > &wildcards, size_t &longest_phrase)
Definition: termgenerator_internal.cc:684

NgramIterator
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:52

omenquireinternal.h
Internals.

Xapian::Unicode::LOWERCASE_LETTER
Letter, lowercase (Ll)
Definition: unicode.h:223

Xapian::check_wordchar
static unsigned check_wordchar(unsigned ch)
Definition: termgenerator_internal.cc:57

termgenerator_internal.h
TermGenerator class internals.

Xapian::termcount
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72

Xapian::Internal::QueryWildcard::get_pattern
const std::string & get_pattern() const
Definition: queryinternal.h:430

Xapian::Internal::QueryTerm::get_term
const std::string & get_term() const
Definition: queryinternal.h:55

Xapian::TermGenerator::stop_strategy
stop_strategy
Stopper strategies, for use with set_stopper_strategy().
Definition: termgenerator.h:145

Xapian::Sniplet::relevance
double * relevance
Definition: termgenerator_internal.cc:327

Xapian::Unicode::TITLECASE_LETTER
Letter, titlecase (Lt)
Definition: unicode.h:224

Xapian::append_escaping_xml
static void append_escaping_xml(const char *p, const char *end, string &output)
Definition: termgenerator_internal.cc:523

Xapian::Query::OP_PHRASE
Match only documents where all subqueries match near and in order.
Definition: query.h:152

Xapian::SnipPipe
Definition: termgenerator_internal.cc:337

UNICODE_IGNORE
const unsigned UNICODE_IGNORE
Value representing "ignore this" when returned by check_infix() or check_infix_digit().
Definition: queryparser_internal.cc:599

Xapian::Weight::Internal
Class to hold statistics for a given collection.
Definition: weightinternal.h:70

Xapian::Unicode::UPPERCASE_LETTER
Letter, uppercase (Lu)
Definition: unicode.h:222

Xapian::Internal::QueryTerm
Definition: queryinternal.h:37

Xapian::Internal::QueryWindowed::get_window
size_t get_window() const
Definition: queryinternal.h:330

Xapian::Unicode::CLOSE_PUNCTUATION
Punctuation, close (Pe)
Definition: unicode.h:243

Xapian::Unicode::OPEN_PUNCTUATION
Punctuation, open (Ps)
Definition: unicode.h:242

Xapian::MSet::SNIPPET_BACKGROUND_MODEL
Model the relevancy of non-query terms in MSet::snippet().
Definition: mset.h:172

Xapian::SnipPipe::pipe
deque< Sniplet > pipe
Definition: termgenerator_internal.cc:338

Xapian::snippet_check_leading_nonwordchar
static bool snippet_check_leading_nonwordchar(unsigned ch)
Definition: termgenerator_internal.cc:467

startswith
bool startswith(const std::string &s, char pfx)
Definition: stringutils.h:51

Xapian::TermGenerator::flags
int flags
For backward compatibility with Xapian 1.2.
Definition: termgenerator.h:98

Xapian::SnipPipe::pump
bool pump(double *r, size_t t, size_t h, unsigned flags)
Definition: termgenerator_internal.cc:377

Xapian::Weight::Internal::get_termweight
bool get_termweight(const std::string &term, double &termweight) const
Get the termweight.
Definition: weightinternal.h:176

check_infix_digit
unsigned check_infix_digit(unsigned ch)
Definition: queryparser_internal.cc:620

word-breaker.h
Handle text without explicit word breaks.

Xapian::SnipPipe::length
size_t length
Definition: termgenerator_internal.cc:342

should_stem
bool should_stem(const string &term)
Definition: queryparser_internal.cc:585

Xapian::Utf8Iterator
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:38

query
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:63

Xapian::SnipPipe::SnipPipe
SnipPipe(size_t length_)
Definition: termgenerator_internal.cc:361

Xapian::snippet_check_trailing_nonwordchar
static bool snippet_check_trailing_nonwordchar(unsigned ch)
Definition: termgenerator_internal.cc:499

Xapian::Sniplet::term_end
size_t term_end
Definition: termgenerator_internal.cc:329

Xapian::Unicode::is_wordchar
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
Definition: unicode.h:343

Xapian::Query::get_num_subqueries
size_t get_num_subqueries() const
Get the number of subqueries of the top level query.
Definition: query.cc:220

Xapian::doccount
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38

Xapian::parse_terms
static void parse_terms(Utf8Iterator itor, bool try_word_break, bool with_positions, ACTION action)
Templated framework for processing terms.
Definition: termgenerator_internal.cc:142

Xapian::Query::LEAF_TERM
Value returned by get_type() for a term.
Definition: query.h:266

stringutils.h
Various handy helpers which std::string really should provide.

Xapian::Unicode::is_whitespace
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:361

Xapian::Unicode::INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
Definition: unicode.h:244

Xapian::Sniplet::Sniplet
Sniplet(double *r, size_t t, size_t h)
Definition: termgenerator_internal.cc:333

Xapian::Query::get_type
op get_type() const
Get the type of the top level of the query.
Definition: query.cc:212

Xapian::Unicode::Internal::get_category
category get_category(int info)
Definition: unicode.h:271

Xapian::Unicode::FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
Definition: unicode.h:245

is_ngram_enabled
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:41

Xapian::Query
Class representing a query.
Definition: query.h:46

Xapian::Sniplet::highlight
size_t highlight
Definition: termgenerator_internal.cc:331

document.h
API for working with documents.

check_infix
unsigned check_infix(unsigned ch)
Definition: queryparser_internal.cc:601

stem.h
stemming algorithms

NgramIterator::unigram
bool unigram() const
Is this a unigram?
Definition: word-breaker.h:84

Xapian::Sniplet
Definition: termgenerator_internal.cc:326

queryparser.h
parsing a user query string to build a Xapian::Query object