xapian-core  2.0.0
termgenerator_internal.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "termgenerator_internal.h"
24 
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
27 
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
32 
33 #include "stringutils.h"
34 
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <string_view>
42 #include <unordered_map>
43 #include <vector>
44 
45 #include "word-breaker.h"
46 
47 using namespace std;
48 
49 namespace Xapian {
50 
51 static inline bool
52 U_isupper(unsigned ch)
53 {
54  return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
55 }
56 
57 static inline unsigned
58 check_wordchar(unsigned ch)
59 {
60  if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
61  return 0;
62 }
63 
64 static inline bool
65 should_stem(const std::string & term)
66 {
67  const unsigned int SHOULD_STEM_MASK =
71  (1 << Unicode::OTHER_LETTER);
72  Utf8Iterator u(term);
73  return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
74 }
75 
79 static const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
80 
81 static inline unsigned
82 check_infix(unsigned ch)
83 {
84  if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
85  // Unicode includes all these except '&' in its word boundary rules,
86  // as well as 0x2019 (which we handle below) and ':' (for Swedish
87  // apparently, but we ignore this for now as it's problematic in
88  // real world cases).
89  return ch;
90  }
91  // 0x2019 is Unicode apostrophe and single closing quote.
92  // 0x201b is Unicode single opening quote with the tail rising.
93  if (ch == 0x2019 || ch == 0x201b) return '\'';
94  // 0x200c and 0x200d are zero width non-joiner and joiner respectively.
95  // 0x2060 and 0xfeff are word joiners (0xfeff deprecated since Unicode 3.2).
96  if (ch >= 0x200c && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
97  return UNICODE_IGNORE;
98  // 0xad is SOFT HYPHEN which marks a potential hyphenation point in a word.
99  if (ch == 0xad)
100  return UNICODE_IGNORE;
101  return 0;
102 }
103 
104 static inline unsigned
105 check_infix_digit(unsigned ch)
106 {
107  // This list of characters comes from Unicode's word identifying algorithm.
108  switch (ch) {
109  case ',':
110  case '.':
111  case ';':
112  case 0x037e: // GREEK QUESTION MARK
113  case 0x0589: // ARMENIAN FULL STOP
114  case 0x060D: // ARABIC DATE SEPARATOR
115  case 0x07F8: // NKO COMMA
116  case 0x2044: // FRACTION SLASH
117  case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
118  case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
119  case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
120  return ch;
121  }
122  if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
123  return UNICODE_IGNORE;
124  return 0;
125 }
126 
127 static inline bool
128 is_digit(unsigned ch) {
130 }
131 
132 static inline unsigned
133 check_suffix(unsigned ch)
134 {
135  if (ch == '+' || ch == '#') return ch;
136  // FIXME: what about '-'?
137  return 0;
138 }
139 
140 static_assert(int(MSet::SNIPPET_WORD_BREAKS) == TermGenerator::FLAG_WORD_BREAKS,
141  "WORD_BREAKS flags have same value");
142 
143 template<typename ACTION>
144 static bool
145 break_words(Utf8Iterator& itor, unsigned break_flags, bool with_positions,
146  ACTION action)
147 {
148 #ifdef USE_ICU
149  if (break_flags & MSet::SNIPPET_WORD_BREAKS) {
150  const char* start = itor.raw();
151  // get_unbroken() returns the number of codepoints, which we aren't
152  // interested in here.
153  (void)get_unbroken(itor);
154  size_t left = itor.raw() - start;
155  for (WordIterator tk(start, left); tk != WordIterator(); ++tk) {
156  const string& token = *tk;
157  left -= token.length();
158  if (!action(token, with_positions, itor.left() + left))
159  return false;
160  }
161  return true;
162  }
163 #else
164  (void)break_flags;
165 #endif
166 
167  NgramIterator tk(itor);
168  while (tk != NgramIterator()) {
169  const string& token = *tk;
170  // FLAG_NGRAMS only sets positions for tokens of length 1.
171  bool with_pos = with_positions && tk.unigram();
172  if (!action(token, with_pos, tk.get_utf8iterator().left()))
173  return false;
174  ++tk;
175  }
176  // Update itor to point the end of the span of text in an unbroken script.
177  itor = tk.get_utf8iterator();
178  return true;
179 }
180 
187 template<typename ACTION>
188 static void
189 parse_terms(Utf8Iterator itor, unsigned break_flags, bool with_positions,
190  ACTION action)
191 {
192  while (true) {
193  // Advance to the start of the next term.
194  unsigned ch;
195  while (true) {
196  if (itor == Utf8Iterator()) return;
197  ch = check_wordchar(*itor);
198  if (ch) break;
199  ++itor;
200  }
201 
202  string term;
203  // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
204  // Don't worry if there's a trailing '.' or not.
205  if (U_isupper(*itor)) {
206  const Utf8Iterator end;
207  Utf8Iterator p = itor;
208  do {
210  } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
211  // One letter does not make an acronym! If we handled a single
212  // uppercase letter here, we wouldn't catch M&S below.
213  if (term.size() > 1) {
214  // Check there's not a (lower case) letter or digit
215  // immediately after it.
216  if (p == end || !Unicode::is_wordchar(*p)) {
217  itor = p;
218  goto endofterm;
219  }
220  }
221  term.resize(0);
222  }
223 
224  while (true) {
225  if (break_flags && is_unbroken_wordchar(*itor)) {
226  if (!break_words(itor, break_flags, with_positions, action))
227  return;
228  while (true) {
229  if (itor == Utf8Iterator()) return;
230  ch = check_wordchar(*itor);
231  if (ch) break;
232  ++itor;
233  }
234  continue;
235  }
236  unsigned prevch;
237  do {
239  prevch = ch;
240  if (++itor == Utf8Iterator() ||
241  (break_flags && is_unbroken_script(*itor)))
242  goto endofterm;
243  ch = check_wordchar(*itor);
244  } while (ch);
245 
246  Utf8Iterator next(itor);
247  ++next;
248  if (next == Utf8Iterator()) break;
249  unsigned nextch = check_wordchar(*next);
250  if (!nextch) break;
251  unsigned infix_ch = *itor;
252  if (is_digit(prevch) && is_digit(*next)) {
253  infix_ch = check_infix_digit(infix_ch);
254  } else {
255  // Handle things like '&' in AT&T, apostrophes, etc.
256  infix_ch = check_infix(infix_ch);
257  }
258  if (!infix_ch) break;
259  if (infix_ch != UNICODE_IGNORE)
260  Unicode::append_utf8(term, infix_ch);
261  ch = nextch;
262  itor = next;
263  }
264 
265  {
266  size_t len = term.size();
267  unsigned count = 0;
268  while ((ch = check_suffix(*itor))) {
269  if (++count > 3) {
270  term.resize(len);
271  break;
272  }
274  if (++itor == Utf8Iterator()) goto endofterm;
275  }
276  // Don't index fish+chips as fish+ chips.
277  if (Unicode::is_wordchar(*itor))
278  term.resize(len);
279  }
280 
281 endofterm:
282  if (!action(term, with_positions, itor.left()))
283  return;
284  }
285 }
286 
287 void
288 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
289  string_view prefix, bool with_positions)
290 {
291 #ifndef USE_ICU
292  if (flags & FLAG_WORD_BREAKS) {
293  throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
294  "building Xapian to use ICU");
295  }
296 #endif
297  unsigned break_flags = flags & (FLAG_NGRAMS | FLAG_WORD_BREAKS);
298  if (break_flags == 0 && is_ngram_enabled()) {
299  break_flags = FLAG_NGRAMS;
300  }
301 
302  stop_strategy current_stop_mode;
303  if (!stopper) {
304  current_stop_mode = TermGenerator::STOP_NONE;
305  } else {
306  current_stop_mode = stop_mode;
307  }
308 
309  // Create two std::string objects which we effectively use as buffers to
310  // build the prefixed terms we generate (one for non-stemmed and one for
311  // stemmed terms). This is a simple way to avoid creating a temporary
312  // std::string object each time we generate a prefixed term.
313  string prefixed_term;
314  auto prefix_size = prefix.size();
315  prefixed_term.reserve(prefix_size + max_word_length);
316  prefixed_term.assign(prefix);
317 
318  string prefixed_stemmed_term;
319  int add_z = (strategy != TermGenerator::STEM_ALL);
320  prefixed_stemmed_term.reserve(add_z + prefix_size + max_word_length);
321  if (add_z) prefixed_stemmed_term.assign(1, 'Z');
322  prefixed_stemmed_term.append(prefix);
323  auto prefixed_stemmed_size = prefixed_stemmed_term.size();
324 
325  parse_terms(itor, break_flags, with_positions,
326  [=, &prefixed_term, &prefixed_stemmed_term
327 #if __cplusplus >= 201907L
328 // C++20 no longer supports implicit `this` in lambdas but older C++ versions
329 // don't allow `this` here.
330  , this
331 #endif
332  ](const string & term, bool positional, size_t) {
333  if (term.size() > max_word_length) return true;
334 
335  if (current_stop_mode == TermGenerator::STOP_ALL &&
336  (*stopper)(term)) {
337  return true;
338  }
339 
340  if (strategy == TermGenerator::STEM_SOME ||
341  strategy == TermGenerator::STEM_NONE ||
342  strategy == TermGenerator::STEM_SOME_FULL_POS) {
343  prefixed_term.append(term);
344  if (positional) {
345  if (rare(cur_pos >= pos_limit))
346  throw Xapian::RangeError("termpos limit exceeded");
347  doc.add_posting(prefixed_term, ++cur_pos, wdf_inc);
348  } else {
349  doc.add_term(prefixed_term, wdf_inc);
350  }
351  prefixed_term.resize(prefix_size);
352  }
353 
354  // MSVC seems to need "this->" on member variables in this
355  // situation.
356  if ((this->flags & FLAG_SPELLING) && prefix_size == 0)
357  db.add_spelling(term);
358 
359  if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
360  return true;
361 
362  if (strategy == TermGenerator::STEM_SOME ||
363  strategy == TermGenerator::STEM_SOME_FULL_POS) {
364  if (current_stop_mode == TermGenerator::STOP_STEMMED &&
365  (*stopper)(term))
366  return true;
367 
368  // Note, this uses the lowercased term, but that's OK as we
369  // only want to avoid stemming terms starting with a digit.
370  if (!should_stem(term)) return true;
371  }
372 
373  // Add stemmed form without positional information.
374  const string& stem = stemmer(term);
375  if (rare(stem.empty())) return true;
376  prefixed_stemmed_term.append(stem);
377  if (strategy != TermGenerator::STEM_SOME && positional) {
378  if (strategy != TermGenerator::STEM_SOME_FULL_POS) {
379  if (rare(cur_pos >= pos_limit))
380  throw Xapian::RangeError("termpos limit exceeded");
381  ++cur_pos;
382  }
383  doc.add_posting(prefixed_stemmed_term, cur_pos, wdf_inc);
384  } else {
385  doc.add_term(prefixed_stemmed_term, wdf_inc);
386  }
387  prefixed_stemmed_term.resize(prefixed_stemmed_size);
388 
389  return true;
390  });
391 }
392 
393 struct Sniplet {
394  double* relevance;
395 
396  size_t term_end;
397 
398  size_t highlight;
399 
400  Sniplet(double* r, size_t t, size_t h)
401  : relevance(r), term_end(t), highlight(h) { }
402 };
403 
404 class SnipPipe {
405  deque<Sniplet> pipe;
406  deque<Sniplet> best_pipe;
407 
408  // Requested length for snippet.
409  size_t length;
410 
411  // Position in text of start of current pipe contents.
412  size_t begin = 0;
413 
414  // Rolling sum of the current pipe contents.
415  double sum = 0;
416 
417  size_t phrase_len = 0;
418 
419  public:
420  size_t best_begin = 0;
421 
422  size_t best_end = 0;
423 
424  double best_sum = 0;
425 
426  // Add one to length to allow for inter-word space.
427  // FIXME: We ought to correctly allow for multiple spaces.
428  explicit SnipPipe(size_t length_) : length(length_ + 1) { }
429 
430  bool pump(double* r, size_t t, size_t h, unsigned flags);
431 
432  void done();
433 
434  bool drain(string_view input,
435  string_view hi_start,
436  string_view hi_end,
437  string_view omit,
438  string& output);
439 };
440 
441 #define DECAY 2.0
442 
443 inline bool
444 SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
445 {
446  if (h > 1) {
447  if (pipe.size() >= h - 1) {
448  // The final term of a phrase is entering the window. Peg the
449  // phrase's relevance onto the first term of the phrase, so it'll
450  // be removed from `sum` when the phrase starts to leave the
451  // window.
452  auto & phrase_start = pipe[pipe.size() - (h - 1)];
453  if (phrase_start.relevance) {
454  *phrase_start.relevance *= DECAY;
455  sum -= *phrase_start.relevance;
456  }
457  sum += *r;
458  phrase_start.relevance = r;
459  phrase_start.highlight = h;
460  *r /= DECAY;
461  }
462  r = NULL;
463  h = 0;
464  }
465  pipe.emplace_back(r, t, h);
466  if (r) {
467  sum += *r;
468  *r /= DECAY;
469  }
470 
471  // If necessary, discard words from the start of the pipe until it has the
472  // desired length.
473  // FIXME: Also shrink the window past words with relevance < 0?
474  while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
475  const Sniplet& word = pipe.front();
476  if (word.relevance) {
477  *word.relevance *= DECAY;
478  sum -= *word.relevance;
479  }
480  begin = word.term_end;
481  if (best_end >= begin)
482  best_pipe.push_back(word);
483  pipe.pop_front();
484  // E.g. can happen if the current term is longer than the requested
485  // length!
486  if (rare(pipe.empty())) break;
487  }
488 
489  // Using > here doesn't work well, as we don't extend a snippet over terms
490  // with 0 weight.
491  if (sum >= best_sum) {
492  // Discard any part of `best_pipe` which is before `begin`.
493  if (begin >= best_end) {
494  best_pipe.clear();
495  } else {
496  while (!best_pipe.empty() &&
497  best_pipe.front().term_end <= begin) {
498  best_pipe.pop_front();
499  }
500  }
501  best_sum = sum;
502  best_begin = begin;
503  best_end = t;
504  } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
505  if (best_sum > 0 && best_end < begin) {
506  // We found something, and we aren't still looking near it.
507  // FIXME: Benchmark this and adjust if necessary.
508  return false;
509  }
510  }
511  return true;
512 }
513 
514 inline void
515 SnipPipe::done()
516 {
517  // Discard any part of `pipe` which is after `best_end`.
518  if (begin >= best_end) {
519  pipe.clear();
520  } else {
521  // We should never empty the pipe (as that case should be handled
522  // above).
523  while (rare(!pipe.empty()) &&
524  pipe.back().term_end > best_end) {
525  pipe.pop_back();
526  }
527  }
528 }
529 
530 // Check if a non-word character is should be included at the start of the
531 // snippet. We want to include certain leading non-word characters, but not
532 // others.
533 static inline bool
535  if (Unicode::is_currency(ch) ||
538  return true;
539  }
540  switch (ch) {
541  case '"':
542  case '#':
543  case '%':
544  case '&':
545  case '\'':
546  case '+':
547  case '-':
548  case '/':
549  case '<':
550  case '@':
551  case '\\':
552  case '`':
553  case '~':
554  case 0x00A1: // INVERTED EXCLAMATION MARK
555  case 0x00A7: // SECTION SIGN
556  case 0x00BF: // INVERTED QUESTION MARK
557  return true;
558  }
559  return false;
560 }
561 
562 // Check if a non-word character is should be included at the end of the
563 // snippet. We want to include certain trailing non-word characters, but not
564 // others.
565 static inline bool
567  if (Unicode::is_currency(ch) ||
570  return true;
571  }
572  switch (ch) {
573  case '"':
574  case '%':
575  case '\'':
576  case '+':
577  case '-':
578  case '/':
579  case '>':
580  case '@':
581  case '\\':
582  case '`':
583  case '~':
584  return true;
585  }
586  return false;
587 }
588 
589 static inline void
590 append_escaping_xml(const char* p, const char* end, string& output)
591 {
592  while (p != end) {
593  char ch = *p++;
594  switch (ch) {
595  case '&':
596  output += "&amp;";
597  break;
598  case '<':
599  output += "&lt;";
600  break;
601  case '>':
602  output += "&gt;";
603  break;
604  default:
605  output += ch;
606  }
607  }
608 }
609 
610 inline bool
611 SnipPipe::drain(string_view input,
612  string_view hi_start,
613  string_view hi_end,
614  string_view omit,
615  string& output)
616 {
617  if (best_pipe.empty() && !pipe.empty()) {
618  swap(best_pipe, pipe);
619  }
620 
621  if (best_pipe.empty()) {
622  size_t tail_len = input.size() - best_end;
623  if (tail_len == 0) return false;
624 
625  // See if this is the end of a sentence.
626  // FIXME: This is quite simplistic - look at the Unicode rules:
627  // https://unicode.org/reports/tr29/#Sentence_Boundaries
628  bool sentence_end = false;
629  Utf8Iterator i(input.data() + best_end, tail_len);
630  while (i != Utf8Iterator()) {
631  unsigned ch = *i;
632  if (sentence_end && Unicode::is_whitespace(ch)) break;
633 
634  // Allow "...", "!!", "!?!", etc...
635  sentence_end = (ch == '.' || ch == '?' || ch == '!');
636 
637  if (Unicode::is_wordchar(ch)) break;
638  ++i;
639  }
640 
641  if (sentence_end) {
642  // Include end of sentence punctuation.
643  append_escaping_xml(input.data() + best_end, i.raw(), output);
644  return false;
645  }
646 
647  // Include trailing punctuation which includes meaning or context.
648  i.assign(input.data() + best_end, tail_len);
649  int trailing_punc = 0;
650  while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
651  // But limit how much trailing punctuation we include.
652  if (++trailing_punc > 4) {
653  trailing_punc = 0;
654  break;
655  }
656  ++i;
657  }
658  if (trailing_punc) {
659  append_escaping_xml(input.data() + best_end, i.raw(), output);
660  if (i == Utf8Iterator()) return false;
661  }
662 
663  // Append "..." or equivalent as this doesn't seem to be the start
664  // of a sentence.
665  output += omit;
666 
667  return false;
668  }
669 
670  const Sniplet & word = best_pipe.front();
671 
672  if (output.empty()) {
673  // Start of the snippet.
674  enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
675 
676  Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
677  while (i != Utf8Iterator()) {
678  unsigned ch = *i;
679  switch (sentence_boundary) {
680  case NO:
681  if (ch == '.' || ch == '?' || ch == '!') {
682  sentence_boundary = PUNC;
683  }
684  break;
685  case PUNC:
686  if (Unicode::is_whitespace(ch)) {
687  sentence_boundary = YES;
688  } else if (ch == '.' || ch == '?' || ch == '!') {
689  // Allow "...", "!!", "!?!", etc...
690  } else {
691  sentence_boundary = NO;
692  }
693  break;
694  case YES:
695  break;
696  }
697 
698  // Start the snippet at the start of the first word, but include
699  // certain punctuation too.
700  if (Unicode::is_wordchar(ch)) {
701  // But limit how much leading punctuation we include.
702  size_t word_begin = i.raw() - input.data();
703  if (word_begin - best_begin > 4) {
704  best_begin = word_begin;
705  }
706  break;
707  }
708  ++i;
710  best_begin = i.raw() - input.data();
711  }
712  }
713 
714  // Add "..." or equivalent if this doesn't seem to be the start of a
715  // sentence.
716  if (sentence_boundary != YES) {
717  output += omit;
718  }
719  }
720 
721  if (word.highlight) {
722  // Don't include inter-word characters in the highlight.
723  Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
724  while (i != Utf8Iterator()) {
725  unsigned ch = *i;
726  if (Unicode::is_wordchar(ch)) {
727  append_escaping_xml(input.data() + best_begin, i.raw(), output);
728  best_begin = i.raw() - input.data();
729  break;
730  }
731  ++i;
732  }
733  }
734 
735  if (!phrase_len) {
736  phrase_len = word.highlight;
737  if (phrase_len) output += hi_start;
738  }
739 
740  const char* p = input.data();
741  append_escaping_xml(p + best_begin, p + word.term_end, output);
742  best_begin = word.term_end;
743 
744  if (phrase_len && --phrase_len == 0) output += hi_end;
745 
746  best_pipe.pop_front();
747  return true;
748 }
749 
750 static void
752  list<vector<string>> & exact_phrases,
753  unordered_map<string, double> & loose_terms,
754  list<const Xapian::Internal::QueryWildcard*> & wildcards,
755  list<const Xapian::Internal::QueryEditDistance*> & fuzzies,
756  size_t & longest_phrase)
757 {
758  // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
759  size_t n_subqs = query.get_num_subqueries();
761  if (op == query.LEAF_TERM) {
762  const Xapian::Internal::QueryTerm & qt =
763  *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
764  loose_terms.insert(make_pair(qt.get_term(), 0));
765  } else if (op == query.OP_WILDCARD) {
767  const QueryWildcard* qw =
768  static_cast<const QueryWildcard*>(query.internal.get());
769  wildcards.push_back(qw);
770  } else if (op == query.OP_EDIT_DISTANCE) {
772  const QueryEditDistance* qed =
773  static_cast<const QueryEditDistance*>(query.internal.get());
774  fuzzies.push_back(qed);
775  } else if (op == query.OP_PHRASE) {
776  const Xapian::Internal::QueryPhrase & phrase =
777  *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
778  if (phrase.get_window() == n_subqs) {
779  // Tight phrase.
780  for (size_t i = 0; i != n_subqs; ++i) {
782  goto non_term_subquery;
783  }
784 
785  // Tight phrase of terms.
786  exact_phrases.push_back(vector<string>());
787  vector<string> & terms = exact_phrases.back();
788  terms.reserve(n_subqs);
789  for (size_t i = 0; i != n_subqs; ++i) {
791  const Xapian::Internal::QueryTerm & qt =
792  *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
793  terms.push_back(qt.get_term());
794  }
795  if (n_subqs > longest_phrase) longest_phrase = n_subqs;
796  return;
797  }
798  }
799 non_term_subquery:
800  for (size_t i = 0; i != n_subqs; ++i)
801  check_query(query.get_subquery(i), exact_phrases, loose_terms,
802  wildcards, fuzzies, longest_phrase);
803 }
804 
805 static double*
806 check_term(unordered_map<string, double> & loose_terms,
807  const Xapian::Weight::Internal * stats,
808  const string & term,
809  double max_tw)
810 {
811  auto it = loose_terms.find(term);
812  if (it == loose_terms.end()) return NULL;
813 
814  if (it->second == 0.0) {
815  double relevance;
816  if (!stats->get_termweight(term, relevance)) {
817  // FIXME: Assert?
818  loose_terms.erase(it);
819  return NULL;
820  }
821 
822  it->second = relevance + max_tw;
823  }
824  return &it->second;
825 }
826 
827 string
828 MSet::Internal::snippet(string_view text,
829  size_t length,
830  const Xapian::Stem & stemmer,
831  unsigned flags,
832  string_view hi_start,
833  string_view hi_end,
834  string_view omit) const
835 {
836  if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
837  // Too easy!
838  return string{text};
839  }
840 
841 #ifndef USE_ICU
842  if (flags & MSet::SNIPPET_WORD_BREAKS) {
843  throw Xapian::FeatureUnavailableError("SNIPPET_WORD_BREAKS requires "
844  "building Xapian to use ICU");
845  }
846 #endif
847  auto SNIPPET_BREAK_MASK = MSet::SNIPPET_NGRAMS | MSet::SNIPPET_WORD_BREAKS;
848  unsigned break_flags = flags & SNIPPET_BREAK_MASK;
849  if (break_flags == 0 && is_ngram_enabled()) {
850  break_flags = MSet::SNIPPET_NGRAMS;
851  }
852 
853  size_t term_start = 0;
854  double min_tw = 0, max_tw = 0;
855  if (stats) stats->get_max_termweight(min_tw, max_tw);
856  if (max_tw == 0.0) {
857  max_tw = 1.0;
858  } else {
859  // Scale up by (1 + 1/64) so that highlighting works better for terms
860  // with termweight 0 (which happens for terms not in the database, and
861  // also with some weighting schemes for terms which occur in almost all
862  // documents.
863  max_tw *= 1.015625;
864  }
865 
867  if (enquire) {
868  query = enquire->query;
869  }
870  SnipPipe snip(length);
871 
872  list<vector<string>> exact_phrases;
873  unordered_map<string, double> loose_terms;
874  list<const Xapian::Internal::QueryWildcard*> wildcards;
875  list<const Xapian::Internal::QueryEditDistance*> fuzzies;
876  size_t longest_phrase = 0;
877  check_query(query, exact_phrases, loose_terms,
878  wildcards, fuzzies, longest_phrase);
879 
880  vector<double> exact_phrases_relevance;
881  exact_phrases_relevance.reserve(exact_phrases.size());
882  for (auto&& terms : exact_phrases) {
883  // FIXME: What relevance to use?
884  exact_phrases_relevance.push_back(max_tw * terms.size());
885  }
886 
887  vector<double> wildcards_relevance;
888  wildcards_relevance.reserve(wildcards.size());
889  for (auto&& pattern : wildcards) {
890  // FIXME: What relevance to use?
891  (void)pattern;
892  wildcards_relevance.push_back(max_tw + min_tw);
893  }
894 
895  vector<double> fuzzies_relevance;
896  fuzzies_relevance.reserve(fuzzies.size());
897  for (auto&& pattern : fuzzies) {
898  // FIXME: What relevance to use?
899  (void)pattern;
900  fuzzies_relevance.push_back(max_tw + min_tw);
901  }
902 
903  // Background relevance is the same for a given MSet, so cache it
904  // between calls to MSet::snippet() on the same object.
905  unordered_map<string, double>& background = snippet_bg_relevance;
906 
907  vector<string> phrase;
908  if (longest_phrase) phrase.resize(longest_phrase - 1);
909  size_t phrase_next = 0;
910  bool matchfound = false;
911  parse_terms(Utf8Iterator(text), break_flags, true,
912  [&](const string & term, bool positional, size_t left) {
913  // FIXME: Don't hardcode this here.
914  const size_t max_word_length = 64;
915 
916  if (!positional) return true;
917  if (term.size() > max_word_length) return true;
918 
919  // We get segments with any "inter-word" characters in front of
920  // each word, e.g.:
921  // [The][ cat][ sat][ on][ the][ mat]
922  size_t term_end = text.size() - left;
923 
924  double* relevance = 0;
925  size_t highlight = 0;
926  if (stats) {
927  size_t i = 0;
928  for (auto&& terms : exact_phrases) {
929  if (term == terms.back()) {
930  size_t n = terms.size() - 1;
931  bool match = true;
932  while (UNSIGNED_OVERFLOW_OK(n--)) {
933  if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
934  match = false;
935  break;
936  }
937  }
938  if (match) {
939  // FIXME: Sort phrases, highest score first!
940  relevance = &exact_phrases_relevance[i];
941  highlight = terms.size();
942  goto relevance_done;
943  }
944  }
945  ++i;
946  }
947 
948  relevance = check_term(loose_terms, stats.get(), term, max_tw);
949  if (relevance) {
950  // Matched unstemmed term.
951  highlight = 1;
952  goto relevance_done;
953  }
954 
955  string stem = "Z";
956  stem += stemmer(term);
957  relevance = check_term(loose_terms, stats.get(), stem, max_tw);
958  if (relevance) {
959  // Matched stemmed term.
960  highlight = 1;
961  goto relevance_done;
962  }
963 
964  // Check wildcards.
965  // FIXME: Sort wildcards, cheapest to check first or something?
966  i = 0;
967  for (auto&& qw : wildcards) {
968  if (qw->test(term)) {
969  relevance = &wildcards_relevance[i];
970  highlight = 1;
971  goto relevance_done;
972  }
973  ++i;
974  }
975 
976  // Check fuzzies.
977  // FIXME: Sort fuzzies, cheapest to check first or something?
978  i = 0;
979  for (auto&& qed : fuzzies) {
980  // test() returns 0 for no match, or edit_distance + 1.
981  int ed_result = qed->test(term);
982  if (ed_result) {
983  // FIXME: Reduce relevance the more edits there are?
984  // We can't just divide by ed_result here as this
985  // relevance is used by any term matching this
986  // subquery.
987  relevance = &fuzzies_relevance[i];
988  highlight = 1;
989  goto relevance_done;
990  }
991  ++i;
992  }
993 
995  // Background document model.
996  auto bgit = background.find(term);
997  if (bgit == background.end()) bgit = background.find(stem);
998  if (bgit == background.end()) {
999  Xapian::doccount tf = enquire->db.get_termfreq(term);
1000  if (!tf) {
1001  tf = enquire->db.get_termfreq(stem);
1002  } else {
1003  stem = term;
1004  }
1005  double r = 0.0;
1006  if (tf) {
1007  // Add one to avoid log(0) when a term indexes all
1008  // documents.
1009  Xapian::doccount num_docs = stats->collection_size + 1;
1010  r = max_tw * log((num_docs - tf) / double(tf));
1011  r /= (length + 1) * log(double(num_docs));
1012 #if 0
1013  if (r <= 0) {
1014  Utf8Iterator i(text.data() + term_start, text.data() + term_end);
1015  while (i != Utf8Iterator()) {
1017  r = max_tw * 0.05;
1018  }
1019  }
1020  }
1021 #endif
1022  }
1023  bgit = background.emplace(make_pair(stem, r)).first;
1024  }
1025  relevance = &bgit->second;
1026  }
1027  } else {
1028 #if 0
1029  // In the absence of weight information, assume longer terms
1030  // are more relevant, and that unstemmed matches are a bit more
1031  // relevant than stemmed matches.
1032  if (queryterms.find(term) != queryterms.end()) {
1033  relevance = term.size() * 3;
1034  } else {
1035  string stem = "Z";
1036  stem += stemmer(term);
1037  if (queryterms.find(stem) != queryterms.end()) {
1038  relevance = term.size() * 2;
1039  }
1040  }
1041 #endif
1042  }
1043 
1044  // FIXME: Allow Enquire without a DB set or an empty MSet() to be
1045  // used if you don't want the collection model?
1046 
1047 #if 0
1048  // FIXME: Punctuation should somehow be included in the model, but this
1049  // approach is problematic - we don't want the first word of a sentence
1050  // to be favoured when it's at the end of the window.
1051 
1052  // Give first word in each sentence a relevance boost.
1053  if (term_start == 0) {
1054  relevance += 10;
1055  } else {
1056  for (size_t i = term_start; i + term.size() < term_end; ++i) {
1057  if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
1058  relevance += 10;
1059  break;
1060  }
1061  }
1062  }
1063 #endif
1064 
1065 relevance_done:
1066  if (longest_phrase) {
1067  phrase[phrase_next] = term;
1068  phrase_next = (phrase_next + 1) % (longest_phrase - 1);
1069  }
1070 
1071  if (highlight) matchfound = true;
1072 
1073  if (!snip.pump(relevance, term_end, highlight, flags)) return false;
1074 
1075  term_start = term_end;
1076  return true;
1077  });
1078 
1079  snip.done();
1080 
1081  // Put together the snippet.
1082  string result;
1083  if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
1084  while (snip.drain(text, hi_start, hi_end, omit, result)) { }
1085  }
1086 
1087  return result;
1088 }
1089 
1090 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
if(!(properties &BACKEND))
Definition: api_collated.h:3
Iterator returning unigrams and bigrams.
Definition: word-breaker.h:71
bool unigram() const
Is this a unigram?
Definition: word-breaker.h:103
const Xapian::Utf8Iterator & get_utf8iterator() const
Definition: word-breaker.h:105
Indicates an attempt to use a feature which is unavailable.
Definition: error.h:707
const std::string & get_term() const
Definition: queryinternal.h:58
@ SNIPPET_BACKGROUND_MODEL
Model the relevancy of non-query terms in MSet::snippet().
Definition: mset.h:324
@ SNIPPET_EXHAUSTIVE
Exhaustively evaluate candidate snippets in MSet::snippet().
Definition: mset.h:331
Class representing a query.
Definition: query.h:45
const Query get_subquery(size_t n) const
Read a top level subquery.
Definition: query.cc:289
op get_type() const noexcept
Get the type of the top level of the query.
Definition: query.cc:275
size_t get_num_subqueries() const noexcept
Get the number of subqueries of the top level query.
Definition: query.cc:283
op
Query operators.
Definition: query.h:78
@ OP_WILDCARD
Wildcard expansion.
Definition: query.h:255
@ OP_PHRASE
Match only documents where all subqueries match near and in order.
Definition: query.h:152
@ OP_EDIT_DISTANCE
Edit distance expansion.
Definition: query.h:269
@ LEAF_TERM
Value returned by get_type() for a term.
Definition: query.h:280
Xapian::Internal::intrusive_ptr< Internal > internal
Definition: query.h:48
RangeError indicates an attempt to access outside the bounds of a container.
Definition: error.h:959
bool pump(double *r, size_t t, size_t h, unsigned flags)
deque< Sniplet > best_pipe
Class representing a stemming algorithm.
Definition: stem.h:74
bool is_none() const
Return true if this is a no-op stemmer.
Definition: stem.h:193
int flags
For backward compatibility with Xapian 1.2.
Definition: termgenerator.h:97
stop_strategy
Stopper strategies, for use with set_stopper_strategy().
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
const char * raw() const
Return the raw const char* pointer for the current position.
Definition: unicode.h:55
size_t left() const
Return the number of bytes left in the iterator's buffer.
Definition: unicode.h:60
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Definition: unicode.h:73
Class to hold statistics for a given collection.
bool get_termweight(std::string_view term, double &termweight) const
Get the termweight.
#define rare(COND)
Definition: config.h:607
string term
PositionList * p
Class representing a document.
Xapian::MSet internals.
category get_category(int info)
Definition: unicode.h:283
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Definition: unicode.h:344
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
Definition: unicode.h:388
@ FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
Definition: unicode.h:253
@ INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
Definition: unicode.h:252
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
Definition: unicode.h:231
@ MODIFIER_LETTER
Letter, modifier (Lm)
Definition: unicode.h:233
@ OTHER_LETTER
Letter, other (Lo)
Definition: unicode.h:234
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
Definition: unicode.h:238
@ CLOSE_PUNCTUATION
Punctuation, close (Pe)
Definition: unicode.h:251
@ OPEN_PUNCTUATION
Punctuation, open (Ps)
Definition: unicode.h:250
@ TITLECASE_LETTER
Letter, titlecase (Lt)
Definition: unicode.h:232
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
Definition: unicode.h:230
bool is_wordchar(unsigned ch)
Test if a given Unicode character is "word character".
Definition: unicode.h:355
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Definition: unicode.h:383
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Definition: unicode.h:373
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
static bool snippet_check_leading_nonwordchar(unsigned ch)
static double * check_term(unordered_map< string, double > &loose_terms, const Xapian::Weight::Internal *stats, const string &term, double max_tw)
static void append_escaping_xml(const char *p, const char *end, string &output)
static bool snippet_check_trailing_nonwordchar(unsigned ch)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A counts of terms.
Definition: types.h:64
static void check_query(const Xapian::Query &query, list< vector< string >> &exact_phrases, unordered_map< string, double > &loose_terms, list< const Xapian::Internal::QueryWildcard * > &wildcards, list< const Xapian::Internal::QueryEditDistance * > &fuzzies, size_t &longest_phrase)
static unsigned check_suffix(unsigned ch)
static unsigned check_wordchar(unsigned ch)
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
static void parse_terms(Utf8Iterator itor, unsigned break_flags, bool with_positions, ACTION action)
Templated framework for processing terms.
static bool break_words(Utf8Iterator &itor, unsigned break_flags, bool with_positions, ACTION action)
Xapian::Query internals.
parsing a user query string to build a Xapian::Query object
const unsigned UNICODE_IGNORE
Value representing "ignore this" when returned by check_infix() or check_infix_digit().
bool should_stem(const string &term, const State &state)
bool U_isupper(unsigned ch)
bool is_digit(unsigned ch)
unsigned check_infix_digit(unsigned ch)
unsigned check_infix(unsigned ch)
stemming algorithms
static Xapian::Stem stemmer
Definition: stemtest.cc:42
Various handy string-related helpers.
Sniplet(double *r, size_t t, size_t h)
Definition: header.h:87
#define DECAY
TermGenerator class internals.
Unicode and UTF-8 related classes and functions.
bool is_unbroken_script(unsigned p)
Definition: word-breaker.cc:51
bool is_unbroken_wordchar(unsigned p)
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?
Definition: word-breaker.cc:43
Handle text without explicit word breaks.