xapian-core  1.4.22
termgenerator_internal.cc
1 /** @file
2  * @brief TermGenerator class internals
3  */
4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017,2018,2019,2020 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "termgenerator_internal.h"
24 
25 #include "api/omenquireinternal.h"
26 #include "api/queryinternal.h"
27 
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
32 
33 #include "stringutils.h"
34 
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
43 
44 #include "cjk-tokenizer.h"
45 
46 using namespace std;
47 
48 namespace Xapian {
49 
50 static inline bool
51 U_isupper(unsigned ch)
52 {
53  return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
54 }
55 
56 static inline unsigned
57 check_wordchar(unsigned ch)
58 {
59  if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
60  return 0;
61 }
62 
63 static inline bool
64 should_stem(const std::string & term)
65 {
66  const unsigned int SHOULD_STEM_MASK =
67  (1 << Unicode::LOWERCASE_LETTER) |
68  (1 << Unicode::TITLECASE_LETTER) |
69  (1 << Unicode::UPPERCASE_LETTER) |
70  (1 << Unicode::OTHER_LETTER);
71  Utf8Iterator u(term);
72  return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
73 }
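// For example, should_stem("vec3") is true (the 'v' is Letter, lowercase,
// whose bit is set in SHOULD_STEM_MASK) while should_stem("3rd") is false
// (the '3' is Number, decimal digit); only the first character's Unicode
// category is examined.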
74 
75 /** Value representing "ignore this" when returned by check_infix() or
76  *  check_infix_digit().
77  */
78 static const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
79 
80 static inline unsigned
81 check_infix(unsigned ch)
82 {
83  if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
84  // Unicode includes all these except '&' in its word boundary rules,
85  // as well as 0x2019 (which we handle below) and ':' (for Swedish
86  // apparently, but we ignore this for now as it's problematic in
87  // real world cases).
88  return ch;
89  }
90  // 0x2019 is Unicode apostrophe and single closing quote.
91  // 0x201b is Unicode single opening quote with the tail rising.
92  if (ch == 0x2019 || ch == 0x201b) return '\'';
93  if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
94  return UNICODE_IGNORE;
95  return 0;
96 }
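// For example, check_infix() lets "AT&T" and "don't" survive as single terms:
// '&' and '\'' are returned unchanged, the curly apostrophe U+2019 is mapped
// to '\'', and zero-width space/joiner characters are reported as
// UNICODE_IGNORE so they are dropped without splitting the term.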
97 
98 static inline unsigned
99 check_infix_digit(unsigned ch)
100 {
101  // This list of characters comes from Unicode's word identifying algorithm.
102  switch (ch) {
103  case ',':
104  case '.':
105  case ';':
106  case 0x037e: // GREEK QUESTION MARK
107  case 0x0589: // ARMENIAN FULL STOP
108  case 0x060D: // ARABIC DATE SEPARATOR
109  case 0x07F8: // NKO COMMA
110  case 0x2044: // FRACTION SLASH
111  case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
112  case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
113  case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
114  return ch;
115  }
116  if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
117  return UNICODE_IGNORE;
118  return 0;
119 }
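// For example, check_infix_digit() keeps "3.14" and "1,000" as single terms
// because '.' and ',' are accepted when both neighbouring characters are
// digits; between letters the same characters go through check_infix()
// instead, which rejects them, so "foo.bar" becomes two terms.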
120 
121 static inline bool
122 is_digit(unsigned ch) {
123  return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
124 }
125 
126 static inline unsigned
127 check_suffix(unsigned ch)
128 {
129  if (ch == '+' || ch == '#') return ch;
130  // FIXME: what about '-'?
131  return 0;
132 }
133 
133 
134 /** Templated framework for processing terms.
135  *
136  *  Calls action(term, positional, itor) for each term found, where term is a
137  *  UTF-8 encoded string, positional says whether the term carries positional
138  *  information, and itor is the Utf8Iterator for the current position.
139  */
140 template<typename ACTION>
141 static void
142 parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
143 {
144  while (true) {
145  // Advance to the start of the next term.
146  unsigned ch;
147  while (true) {
148  if (itor == Utf8Iterator()) return;
149  ch = check_wordchar(*itor);
150  if (ch) break;
151  ++itor;
152  }
153 
154  string term;
155  // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
156  // Don't worry if there's a trailing '.' or not.
157  if (U_isupper(*itor)) {
158  const Utf8Iterator end;
159  Utf8Iterator p = itor;
160  do {
161  Unicode::append_utf8(term, Unicode::tolower(*p++));
162  } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
163  // One letter does not make an acronym! If we handled a single
164  // uppercase letter here, we wouldn't catch M&S below.
165  if (term.size() > 1) {
166  // Check there's not a (lower case) letter or digit
167  // immediately after it.
168  if (p == end || !Unicode::is_wordchar(*p)) {
169  itor = p;
170  goto endofterm;
171  }
172  }
173  term.resize(0);
174  }
175 
176  while (true) {
177  if (cjk_ngram &&
178  CJK::codepoint_is_cjk(*itor) &&
179  Unicode::is_wordchar(*itor)) {
180  CJKTokenIterator tk(itor);
181  while (tk != CJKTokenIterator()) {
182  const string& cjk_token = *tk;
183  if (!action(cjk_token, with_positions && tk.unigram(),
184  tk.get_utf8iterator()))
185  return;
186  ++tk;
187  }
188  // Update itor to end of CJK text span.
189  itor = tk.get_utf8iterator();
190  while (true) {
191  if (itor == Utf8Iterator()) return;
192  ch = check_wordchar(*itor);
193  if (ch) break;
194  ++itor;
195  }
196  continue;
197  }
198  unsigned prevch;
199  do {
200  Unicode::append_utf8(term, ch);
201  prevch = ch;
202  if (++itor == Utf8Iterator() ||
203  (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
204  goto endofterm;
205  ch = check_wordchar(*itor);
206  } while (ch);
207 
208  Utf8Iterator next(itor);
209  ++next;
210  if (next == Utf8Iterator()) break;
211  unsigned nextch = check_wordchar(*next);
212  if (!nextch) break;
213  unsigned infix_ch = *itor;
214  if (is_digit(prevch) && is_digit(*next)) {
215  infix_ch = check_infix_digit(infix_ch);
216  } else {
217  // Handle things like '&' in AT&T, apostrophes, etc.
218  infix_ch = check_infix(infix_ch);
219  }
220  if (!infix_ch) break;
221  if (infix_ch != UNICODE_IGNORE)
222  Unicode::append_utf8(term, infix_ch);
223  ch = nextch;
224  itor = next;
225  }
226 
227  {
228  size_t len = term.size();
229  unsigned count = 0;
230  while ((ch = check_suffix(*itor))) {
231  if (++count > 3) {
232  term.resize(len);
233  break;
234  }
235  Unicode::append_utf8(term, ch);
236  if (++itor == Utf8Iterator()) goto endofterm;
237  }
238  // Don't index fish+chips as fish+ chips.
239  if (Unicode::is_wordchar(*itor))
240  term.resize(len);
241  }
242 
243 endofterm:
244  if (!action(term, with_positions, itor))
245  return;
246  }
247 }
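// Illustrative trace of parse_terms() (examples are assumptions derived from
// the code above, not from the upstream file):
//
//   "P.T.O. approved" -> action("pto", ...), action("approved", ...)
//   "AT&T"            -> action("at&t", ...)
//   "C++ and C#"      -> action("c++", ...), action("and", ...), action("c#", ...)
//   "fish+chips"      -> action("fish", ...), action("chips", ...)
//
// The last case is handled by the check after the suffix loop: a trailing '+'
// or '#' is only kept when it isn't immediately followed by a word character.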
248 
249 void
250 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
251  const string & prefix, bool with_positions)
252 {
253  bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
254 
255  stop_strategy current_stop_mode;
256  if (!stopper.get()) {
257  current_stop_mode = TermGenerator::STOP_NONE;
258  } else {
259  current_stop_mode = stop_mode;
260  }
261 
262  parse_terms(itor, cjk_ngram, with_positions,
263  [=
264 #if __cplusplus >= 201907L
265 // C++20 no longer supports implicit `this` in lambdas but older C++ versions
266 // don't allow `this` here.
267  , this
268 #endif
269  ](const string & term, bool positional, const Utf8Iterator &) {
270  if (term.size() > max_word_length) return true;
271 
272  if (current_stop_mode == TermGenerator::STOP_ALL &&
273  (*stopper)(term)) {
274  return true;
275  }
276 
277  if (strategy == TermGenerator::STEM_SOME ||
278  strategy == TermGenerator::STEM_NONE ||
279  strategy == TermGenerator::STEM_SOME_FULL_POS) {
280  if (positional) {
281  doc.add_posting(prefix + term, ++cur_pos, wdf_inc);
282  } else {
283  doc.add_term(prefix + term, wdf_inc);
284  }
285  }
286 
287  // MSVC seems to need "this->" on member variables in this
288  // situation.
289  if ((this->flags & FLAG_SPELLING) && prefix.empty())
290  db.add_spelling(term);
291 
292  if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
293  return true;
294 
295  if (strategy == TermGenerator::STEM_SOME ||
296  strategy == TermGenerator::STEM_SOME_FULL_POS) {
297  if (current_stop_mode == TermGenerator::STOP_STEMMED &&
298  (*stopper)(term))
299  return true;
300 
301  // Note, this uses the lowercased term, but that's OK as we
302  // only want to avoid stemming terms starting with a digit.
303  if (!should_stem(term)) return true;
304  }
305 
306  // Add stemmed form without positional information.
307  const string& stem = stemmer(term);
308  if (rare(stem.empty())) return true;
309  string stemmed_term;
310  if (strategy != TermGenerator::STEM_ALL) {
311  stemmed_term += "Z";
312  }
313  stemmed_term += prefix;
314  stemmed_term += stem;
315  if (strategy != TermGenerator::STEM_SOME && with_positions) {
316  if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
317  doc.add_posting(stemmed_term, cur_pos, wdf_inc);
318  } else {
319  doc.add_term(stemmed_term, wdf_inc);
320  }
321  return true;
322  });
323 }
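// Minimal usage sketch of the public API which ends up in
// TermGenerator::Internal::index_text() above (illustrative only; `doc`, `db`
// and the "S" prefix are assumptions, not taken from this file):
//
//     Xapian::TermGenerator indexer;
//     indexer.set_stemmer(Xapian::Stem("en"));
//     indexer.set_document(doc);
//     indexer.index_text("Modern Information Retrieval", 1, "S");
//     db.add_document(doc);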
324 
325 struct Sniplet {
326  double* relevance;
327 
328  size_t term_end;
329 
330  size_t highlight;
331 
332  Sniplet(double* r, size_t t, size_t h)
333  : relevance(r), term_end(t), highlight(h) { }
334 };
335 
336 class SnipPipe {
337  deque<Sniplet> pipe;
338  deque<Sniplet> best_pipe;
339 
340  // Requested length for snippet.
341  size_t length;
342 
343  // Position in text of start of current pipe contents.
344  size_t begin = 0;
345 
346  // Rolling sum of the current pipe contents.
347  double sum = 0;
348 
349  size_t phrase_len = 0;
350 
351  public:
352  size_t best_begin = 0;
353 
354  size_t best_end = 0;
355 
356  double best_sum = 0;
357 
358  // Add one to length to allow for inter-word space.
359  // FIXME: We ought to correctly allow for multiple spaces.
360  explicit SnipPipe(size_t length_) : length(length_ + 1) { }
361 
362  bool pump(double* r, size_t t, size_t h, unsigned flags);
363 
364  void done();
365 
366  bool drain(const string & input,
367  const string & hi_start,
368  const string & hi_end,
369  const string & omit,
370  string & output);
371 };
372 
373 #define DECAY 2.0
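// A worked example of the decay (illustrative, derived from SnipPipe::pump()
// below): each time a term's relevance is added to the rolling `sum` it is
// then divided by DECAY, so with DECAY 2.0 a query term whose relevance
// starts at 4.0 contributes 4.0 + 2.0 + 1.0 = 7.0 over three occurrences in
// quick succession rather than 12.0, damping pile-ups of one repeated term.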
374 
375 inline bool
376 SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
377 {
378  if (h > 1) {
379  if (pipe.size() >= h - 1) {
380  // The final term of a phrase is entering the window. Peg the
381  // phrase's relevance onto the first term of the phrase, so it'll
382  // be removed from `sum` when the phrase starts to leave the
383  // window.
384  auto & phrase_start = pipe[pipe.size() - (h - 1)];
385  if (phrase_start.relevance) {
386  *phrase_start.relevance *= DECAY;
387  sum -= *phrase_start.relevance;
388  }
389  sum += *r;
390  phrase_start.relevance = r;
391  phrase_start.highlight = h;
392  *r /= DECAY;
393  }
394  r = NULL;
395  h = 0;
396  }
397  pipe.emplace_back(r, t, h);
398  if (r) {
399  sum += *r;
400  *r /= DECAY;
401  }
402 
403  // If necessary, discard words from the start of the pipe until it has the
404  // desired length.
405  // FIXME: Also shrink the window past words with relevance < 0?
406  while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
407  const Sniplet& word = pipe.front();
408  if (word.relevance) {
409  *word.relevance *= DECAY;
410  sum -= *word.relevance;
411  }
412  begin = word.term_end;
413  if (best_end >= begin)
414  best_pipe.push_back(word);
415  pipe.pop_front();
416  // E.g. this can happen if the current term is longer than the requested
417  // length!
418  if (rare(pipe.empty())) break;
419  }
420 
421  // Using > here doesn't work well, as we don't extend a snippet over terms
422  // with 0 weight.
423  if (sum >= best_sum) {
424  // Discard any part of `best_pipe` which is before `begin`.
425  if (begin >= best_end) {
426  best_pipe.clear();
427  } else {
428  while (!best_pipe.empty() &&
429  best_pipe.front().term_end <= begin) {
430  best_pipe.pop_front();
431  }
432  }
433  best_sum = sum;
434  best_begin = begin;
435  best_end = t;
436  } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
437  if (best_sum > 0 && best_end < begin) {
438  // We found something, and we aren't still looking near it.
439  // FIXME: Benchmark this and adjust if necessary.
440  return false;
441  }
442  }
443  return true;
444 }
445 
446 inline void
447 SnipPipe::done()
448 {
449  // Discard any part of `pipe` which is after `best_end`.
450  if (begin >= best_end) {
451  pipe.clear();
452  } else {
453  // We should never empty the pipe (as that case should be handled
454  // above).
455  while (rare(!pipe.empty()) &&
456  pipe.back().term_end > best_end) {
457  pipe.pop_back();
458  }
459  }
460 }
461 
462 // Check if a non-word character should be included at the start of the
463 // snippet. We want to include certain leading non-word characters, but not
464 // others.
465 static inline bool
466 snippet_check_leading_nonwordchar(unsigned ch) {
467  if (Unicode::is_currency(ch) ||
468  Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
469  Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
470  return true;
471  }
472  switch (ch) {
473  case '"':
474  case '#':
475  case '%':
476  case '&':
477  case '\'':
478  case '+':
479  case '-':
480  case '/':
481  case '<':
482  case '@':
483  case '\\':
484  case '`':
485  case '~':
486  case 0x00A1: // INVERTED EXCLAMATION MARK
487  case 0x00A7: // SECTION SIGN
488  case 0x00BF: // INVERTED QUESTION MARK
489  return true;
490  }
491  return false;
492 }
493 
494 // Check if a non-word character should be included at the end of the
495 // snippet. We want to include certain trailing non-word characters, but not
496 // others.
497 static inline bool
498 snippet_check_trailing_nonwordchar(unsigned ch) {
499  if (Unicode::is_currency(ch) ||
500  Unicode::get_category(ch) == Unicode::CLOSE_PUNCTUATION ||
501  Unicode::get_category(ch) == Unicode::FINAL_QUOTE_PUNCTUATION) {
502  return true;
503  }
504  switch (ch) {
505  case '"':
506  case '%':
507  case '\'':
508  case '+':
509  case '-':
510  case '/':
511  case '>':
512  case '@':
513  case '\\':
514  case '`':
515  case '~':
516  return true;
517  }
518  return false;
519 }
520 
521 static inline void
522 append_escaping_xml(const char* p, const char* end, string& output)
523 {
524  while (p != end) {
525  char ch = *p++;
526  switch (ch) {
527  case '&':
528  output += "&amp;";
529  break;
530  case '<':
531  output += "&lt;";
532  break;
533  case '>':
534  output += "&gt;";
535  break;
536  default:
537  output += ch;
538  }
539  }
540 }
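// For example, append_escaping_xml() copies the byte range "fish & chips"
// into `output` as "fish &amp; chips", so the snippet can be embedded in HTML
// or XML without further escaping of '&', '<' and '>'.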
541 
542 inline bool
543 SnipPipe::drain(const string & input,
544  const string & hi_start,
545  const string & hi_end,
546  const string & omit,
547  string & output)
548 {
549  if (best_pipe.empty() && !pipe.empty()) {
550  swap(best_pipe, pipe);
551  }
552 
553  if (best_pipe.empty()) {
554  size_t tail_len = input.size() - best_end;
555  if (tail_len == 0) return false;
556 
557  // See if this is the end of a sentence.
558  // FIXME: This is quite simplistic - look at the Unicode rules:
559  // https://unicode.org/reports/tr29/#Sentence_Boundaries
560  bool sentence_end = false;
561  Utf8Iterator i(input.data() + best_end, tail_len);
562  while (i != Utf8Iterator()) {
563  unsigned ch = *i;
564  if (sentence_end && Unicode::is_whitespace(ch)) break;
565 
566  // Allow "...", "!!", "!?!", etc...
567  sentence_end = (ch == '.' || ch == '?' || ch == '!');
568 
569  if (Unicode::is_wordchar(ch)) break;
570  ++i;
571  }
572 
573  if (sentence_end) {
574  // Include end of sentence punctuation.
575  append_escaping_xml(input.data() + best_end, i.raw(), output);
576  return false;
577  }
578 
579  // Include trailing punctuation which includes meaning or context.
580  i.assign(input.data() + best_end, tail_len);
581  int trailing_punc = 0;
582  while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
583  // But limit how much trailing punctuation we include.
584  if (++trailing_punc > 4) {
585  trailing_punc = 0;
586  break;
587  }
588  ++i;
589  }
590  if (trailing_punc) {
591  append_escaping_xml(input.data() + best_end, i.raw(), output);
592  if (i == Utf8Iterator()) return false;
593  }
594 
595  // Append "..." or equivalent as this doesn't seem to be the start
596  // of a sentence.
597  output += omit;
598 
599  return false;
600  }
601 
602  const Sniplet & word = best_pipe.front();
603 
604  if (output.empty()) {
605  // Start of the snippet.
606  enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
607 
608  Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
609  while (i != Utf8Iterator()) {
610  unsigned ch = *i;
611  switch (sentence_boundary) {
612  case NO:
613  if (ch == '.' || ch == '?' || ch == '!') {
614  sentence_boundary = PUNC;
615  }
616  break;
617  case PUNC:
618  if (Unicode::is_whitespace(ch)) {
619  sentence_boundary = YES;
620  } else if (ch == '.' || ch == '?' || ch == '!') {
621  // Allow "...", "!!", "!?!", etc...
622  } else {
623  sentence_boundary = NO;
624  }
625  break;
626  case YES:
627  break;
628  }
629 
630  // Start the snippet at the start of the first word, but include
631  // certain punctuation too.
632  if (Unicode::is_wordchar(ch)) {
633  // But limit how much leading punctuation we include.
634  size_t word_begin = i.raw() - input.data();
635  if (word_begin - best_begin > 4) {
636  best_begin = word_begin;
637  }
638  break;
639  }
640  ++i;
641  if (!snippet_check_leading_nonwordchar(ch)) {
642  best_begin = i.raw() - input.data();
643  }
644  }
645 
646  // Add "..." or equivalent if this doesn't seem to be the start of a
647  // sentence.
648  if (sentence_boundary != YES) {
649  output += omit;
650  }
651  }
652 
653  if (word.highlight) {
654  // Don't include inter-word characters in the highlight.
655  Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
656  while (i != Utf8Iterator()) {
657  unsigned ch = *i;
658  if (Unicode::is_wordchar(ch)) {
659  append_escaping_xml(input.data() + best_begin, i.raw(), output);
660  best_begin = i.raw() - input.data();
661  break;
662  }
663  ++i;
664  }
665  }
666 
667  if (!phrase_len) {
668  phrase_len = word.highlight;
669  if (phrase_len) output += hi_start;
670  }
671 
672  const char* p = input.data();
673  append_escaping_xml(p + best_begin, p + word.term_end, output);
674  best_begin = word.term_end;
675 
676  if (phrase_len && --phrase_len == 0) output += hi_end;
677 
678  best_pipe.pop_front();
679  return true;
680 }
681 
682 static void
683 check_query(const Xapian::Query & query,
684  list<vector<string>> & exact_phrases,
685  unordered_map<string, double> & loose_terms,
686  list<string> & wildcards,
687  size_t & longest_phrase)
688 {
689  // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
690  size_t n_subqs = query.get_num_subqueries();
691  Xapian::Query::op op = query.get_type();
692  if (op == query.LEAF_TERM) {
693  const Xapian::Internal::QueryTerm & qt =
694  *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
695  loose_terms.insert(make_pair(qt.get_term(), 0));
696  } else if (op == query.OP_WILDCARD) {
697  const Xapian::Internal::QueryWildcard & qw =
698  *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
699  wildcards.push_back(qw.get_pattern());
700  } else if (op == query.OP_PHRASE) {
701  const Xapian::Internal::QueryPhrase & phrase =
702  *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
703  if (phrase.get_window() == n_subqs) {
704  // Tight phrase.
705  for (size_t i = 0; i != n_subqs; ++i) {
706  if (query.get_subquery(i).get_type() != query.LEAF_TERM)
707  goto non_term_subquery;
708  }
709 
710  // Tight phrase of terms.
711  exact_phrases.push_back(vector<string>());
712  vector<string> & terms = exact_phrases.back();
713  terms.reserve(n_subqs);
714  for (size_t i = 0; i != n_subqs; ++i) {
715  Xapian::Query q = query.get_subquery(i);
716  const Xapian::Internal::QueryTerm & qt =
717  *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
718  terms.push_back(qt.get_term());
719  }
720  if (n_subqs > longest_phrase) longest_phrase = n_subqs;
721  return;
722  }
723  }
724 non_term_subquery:
725  for (size_t i = 0; i != n_subqs; ++i)
726  check_query(query.get_subquery(i), exact_phrases, loose_terms,
727  wildcards, longest_phrase);
728 }
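// Illustrative decomposition (the example query is an assumption, not from
// this file): for
//   Query(OP_AND, Query("beef"), Query(OP_PHRASE, Query("steak"), Query("pie")))
// where the phrase's window equals its 2 subqueries (a "tight" phrase as
// tested above), check_query() adds "beef" to loose_terms, {"steak", "pie"}
// to exact_phrases and sets longest_phrase to 2; an OP_WILDCARD subquery
// would add its pattern to wildcards instead.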
729 
730 static double*
731 check_term(unordered_map<string, double> & loose_terms,
732  const Xapian::Weight::Internal * stats,
733  const string & term,
734  double max_tw)
735 {
736  auto it = loose_terms.find(term);
737  if (it == loose_terms.end()) return NULL;
738 
739  if (it->second == 0.0) {
740  double relevance;
741  if (!stats->get_termweight(term, relevance)) {
742  // FIXME: Assert?
743  loose_terms.erase(it);
744  return NULL;
745  }
746 
747  it->second = relevance + max_tw;
748  }
749  return &it->second;
750 }
751 
752 string
753 MSet::Internal::snippet(const string & text,
754  size_t length,
755  const Xapian::Stem & stemmer,
756  unsigned flags,
757  const string & hi_start,
758  const string & hi_end,
759  const string & omit) const
760 {
761  if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
762  // Too easy!
763  return text;
764  }
765 
766  bool cjk_ngram = (flags & MSet::SNIPPET_CJK_NGRAM);
767  if (!cjk_ngram) {
768  cjk_ngram = CJK::is_cjk_enabled();
769  }
770 
771  size_t term_start = 0;
772  double min_tw = 0, max_tw = 0;
773  if (stats) stats->get_max_termweight(min_tw, max_tw);
774  if (max_tw == 0.0) {
775  max_tw = 1.0;
776  } else {
777  // Scale up by (1 + 1/64) so that highlighting works better for terms
778  // with termweight 0 (which happens for terms not in the database, and
779  // also with some weighting schemes for terms which occur in almost all
780  // documents).
781  max_tw *= 1.015625;
782  }
783 
784  Xapian::Query query;
785  if (enquire.get()) {
786  query = enquire->query;
787  }
788  SnipPipe snip(length);
789 
790  list<vector<string>> exact_phrases;
791  unordered_map<string, double> loose_terms;
792  list<string> wildcards;
793  size_t longest_phrase = 0;
794  check_query(query, exact_phrases, loose_terms,
795  wildcards, longest_phrase);
796 
797  vector<double> exact_phrases_relevance;
798  exact_phrases_relevance.reserve(exact_phrases.size());
799  for (auto&& terms : exact_phrases) {
800  // FIXME: What relevance to use?
801  exact_phrases_relevance.push_back(max_tw * terms.size());
802  }
803 
804  vector<double> wildcards_relevance;
805  wildcards_relevance.reserve(wildcards.size());
806  for (auto&& pattern : wildcards) {
807  // FIXME: What relevance to use?
808  (void)pattern;
809  wildcards_relevance.push_back(max_tw + min_tw);
810  }
811 
812  // Background relevance is the same for a given MSet, so cache it
813  // between calls to MSet::snippet() on the same object.
814  unordered_map<string, double>& background = snippet_bg_relevance;
815 
816  vector<string> phrase;
817  if (longest_phrase) phrase.resize(longest_phrase - 1);
818  size_t phrase_next = 0;
819  bool matchfound = false;
820  parse_terms(Utf8Iterator(text), cjk_ngram, true,
821  [&](const string & term, bool positional, const Utf8Iterator & it) {
822  // FIXME: Don't hardcode this here.
823  const size_t max_word_length = 64;
824 
825  if (!positional) return true;
826  if (term.size() > max_word_length) return true;
827 
828  // We get segments with any "inter-word" characters in front of
829  // each word, e.g.:
830  // [The][ cat][ sat][ on][ the][ mat]
831  size_t term_end = text.size() - it.left();
832 
833  double* relevance = 0;
834  size_t highlight = 0;
835  if (stats) {
836  size_t i = 0;
837  for (auto&& terms : exact_phrases) {
838  if (term == terms.back()) {
839  size_t n = terms.size() - 1;
840  bool match = true;
841  while (n--) {
842  if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
843  match = false;
844  break;
845  }
846  }
847  if (match) {
848  // FIXME: Sort phrases, highest score first!
849  relevance = &exact_phrases_relevance[i];
850  highlight = terms.size();
851  goto relevance_done;
852  }
853  }
854  ++i;
855  }
856 
857  relevance = check_term(loose_terms, stats, term, max_tw);
858  if (relevance) {
859  // Matched unstemmed term.
860  highlight = 1;
861  goto relevance_done;
862  }
863 
864  string stem = "Z";
865  stem += stemmer(term);
866  relevance = check_term(loose_terms, stats, stem, max_tw);
867  if (relevance) {
868  // Matched stemmed term.
869  highlight = 1;
870  goto relevance_done;
871  }
872 
873  // Check wildcards.
874  // FIXME: Sort wildcards, shortest pattern first or something?
875  i = 0;
876  for (auto&& pattern : wildcards) {
877  if (startswith(term, pattern)) {
878  relevance = &wildcards_relevance[i];
879  highlight = 1;
880  goto relevance_done;
881  }
882  ++i;
883  }
884 
885  if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
886  // Background document model.
887  auto bgit = background.find(term);
888  if (bgit == background.end()) bgit = background.find(stem);
889  if (bgit == background.end()) {
890  Xapian::doccount tf = enquire->db.get_termfreq(term);
891  if (!tf) {
892  tf = enquire->db.get_termfreq(stem);
893  } else {
894  stem = term;
895  }
896  double r = 0.0;
897  if (tf) {
898  // Add one to avoid log(0) when a term indexes all
899  // documents.
900  Xapian::doccount num_docs = stats->collection_size + 1;
901  r = max_tw * log((num_docs - tf) / double(tf));
902  r /= (length + 1) * log(double(num_docs));
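// Worked example with illustrative numbers (not from this file): for
// max_tw = 1.0, num_docs = 1001, tf = 10 and a requested snippet length of
// 100 this gives r = log(991 / 10) / (101 * log(1001)) ~= 4.60 / 697.8
// ~= 0.0066, so rarer terms receive a slightly larger background relevance
// than terms which occur in many documents.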
903 #if 0
904  if (r <= 0) {
905  Utf8Iterator i(text.data() + term_start, text.data() + term_end);
906  while (i != Utf8Iterator()) {
907  if (Unicode::get_category(*i) == Unicode::UPPERCASE_LETTER) {
908  r = max_tw * 0.05;
909  }
910  }
911  }
912 #endif
913  }
914  bgit = background.emplace(make_pair(stem, r)).first;
915  }
916  relevance = &bgit->second;
917  }
918  } else {
919 #if 0
920  // In the absence of weight information, assume longer terms
921  // are more relevant, and that unstemmed matches are a bit more
922  // relevant than stemmed matches.
923  if (queryterms.find(term) != queryterms.end()) {
924  relevance = term.size() * 3;
925  } else {
926  string stem = "Z";
927  stem += stemmer(term);
928  if (queryterms.find(stem) != queryterms.end()) {
929  relevance = term.size() * 2;
930  }
931  }
932 #endif
933  }
934 
935  // FIXME: Allow Enquire without a DB set or an empty MSet() to be
936  // used if you don't want the collection model?
937 
938 #if 0
939  // FIXME: Punctuation should somehow be included in the model, but this
940  // approach is problematic - we don't want the first word of a sentence
941  // to be favoured when it's at the end of the window.
942 
943  // Give first word in each sentence a relevance boost.
944  if (term_start == 0) {
945  relevance += 10;
946  } else {
947  for (size_t i = term_start; i + term.size() < term_end; ++i) {
948  if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
949  relevance += 10;
950  break;
951  }
952  }
953  }
954 #endif
955 
956 relevance_done:
957  if (longest_phrase) {
958  phrase[phrase_next] = term;
959  phrase_next = (phrase_next + 1) % (longest_phrase - 1);
960  }
961 
962  if (highlight) matchfound = true;
963 
964  if (!snip.pump(relevance, term_end, highlight, flags)) return false;
965 
966  term_start = term_end;
967  return true;
968  });
969 
970  snip.done();
971 
972  // Put together the snippet.
973  string result;
974  if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
975  while (snip.drain(text, hi_start, hi_end, omit, result)) { }
976  }
977 
978  return result;
979 }
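// Minimal usage sketch of the public wrapper around the method above
// (illustrative only; `mset` and `text` are assumptions, not taken from this
// file):
//
//     Xapian::Stem stem("en");
//     std::string snip = mset.snippet(text, 200, stem,
//                                     Xapian::MSet::SNIPPET_BACKGROUND_MODEL,
//                                     "<b>", "</b>", "...");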
980 
981 }