41 #include <unordered_map> 53 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
56 static inline unsigned 66 const unsigned int SHOULD_STEM_MASK =
80 static inline unsigned 83 if (ch ==
'\'' || ch ==
'&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
92 if (ch == 0x2019 || ch == 0x201b)
return '\'';
93 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
98 static inline unsigned 116 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
126 static inline unsigned 129 if (ch ==
'+' || ch ==
'#')
return ch;
140 template<
typename ACTION>
163 }
while (p != end && *p ==
'.' && ++p != end &&
U_isupper(*p));
166 if (term.size() > 1) {
178 if (try_word_break &&
183 const string& ngram = *tk;
184 if (!action(ngram, with_positions && tk.
unigram(),
214 unsigned infix_ch = *itor;
221 if (!infix_ch)
break;
222 if (infix_ch != UNICODE_IGNORE)
229 size_t len = term.size();
245 if (!action(term, with_positions, itor))
252 const string & prefix,
bool with_positions)
257 if (!stopper.get()) {
258 current_stop_mode = TermGenerator::STOP_NONE;
260 current_stop_mode = stop_mode;
265 #
if __cplusplus >= 201907L
270 ](
const string & term,
bool positional,
const Utf8Iterator &) {
271 if (term.size() > max_word_length)
return true;
273 if (current_stop_mode == TermGenerator::STOP_ALL &&
278 if (strategy == TermGenerator::STEM_SOME ||
279 strategy == TermGenerator::STEM_NONE ||
280 strategy == TermGenerator::STEM_SOME_FULL_POS) {
282 doc.add_posting(prefix + term, ++cur_pos, wdf_inc);
284 doc.add_term(prefix + term, wdf_inc);
290 if ((this->
flags & FLAG_SPELLING) && prefix.empty())
291 db.add_spelling(term);
296 if (strategy == TermGenerator::STEM_SOME ||
297 strategy == TermGenerator::STEM_SOME_FULL_POS) {
298 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
308 const string& stem =
stemmer(term);
309 if (
rare(stem.empty()))
return true;
311 if (strategy != TermGenerator::STEM_ALL) {
314 stemmed_term += prefix;
315 stemmed_term += stem;
316 if (strategy != TermGenerator::STEM_SOME && with_positions) {
317 if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
318 doc.add_posting(stemmed_term, cur_pos, wdf_inc);
320 doc.add_term(stemmed_term, wdf_inc);
334 : relevance(r), term_end(t), highlight(h) { }
350 size_t phrase_len = 0;
353 size_t best_begin = 0;
361 explicit SnipPipe(
size_t length_) : length(length_ + 1) { }
363 bool pump(
double* r,
size_t t,
size_t h,
unsigned flags);
367 bool drain(
const string &
input,
368 const string & hi_start,
369 const string & hi_end,
377 SnipPipe::pump(
double* r,
size_t t,
size_t h,
unsigned flags)
380 if (pipe.size() >= h - 1) {
385 auto & phrase_start = pipe[pipe.size() - (h - 1)];
386 if (phrase_start.relevance) {
387 *phrase_start.relevance *=
DECAY;
388 sum -= *phrase_start.relevance;
391 phrase_start.relevance = r;
392 phrase_start.highlight = h;
398 pipe.emplace_back(r, t, h);
407 while (t - begin > length ) {
408 const Sniplet& word = pipe.front();
414 if (best_end >= begin)
415 best_pipe.push_back(word);
419 if (
rare(pipe.empty()))
break;
424 if (sum >= best_sum) {
426 if (begin >= best_end) {
429 while (!best_pipe.empty() &&
430 best_pipe.front().term_end <= begin) {
431 best_pipe.pop_front();
438 if (best_sum > 0 && best_end < begin) {
451 if (begin >= best_end) {
456 while (
rare(!pipe.empty()) &&
457 pipe.back().term_end > best_end) {
544 SnipPipe::drain(
const string &
input,
545 const string & hi_start,
546 const string & hi_end,
550 if (best_pipe.empty() && !pipe.empty()) {
551 swap(best_pipe, pipe);
554 if (best_pipe.empty()) {
555 size_t tail_len = input.size() - best_end;
556 if (tail_len == 0)
return false;
561 bool sentence_end =
false;
568 sentence_end = (ch ==
'.' || ch ==
'?' || ch ==
'!');
581 i.assign(input.data() + best_end, tail_len);
582 int trailing_punc = 0;
585 if (++trailing_punc > 4) {
603 const Sniplet & word = best_pipe.front();
605 if (output.empty()) {
607 enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
612 switch (sentence_boundary) {
614 if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
615 sentence_boundary = PUNC;
620 sentence_boundary = YES;
621 }
else if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
624 sentence_boundary = NO;
635 size_t word_begin = i.
raw() - input.data();
636 if (word_begin - best_begin > 4) {
637 best_begin = word_begin;
643 best_begin = i.raw() - input.data();
649 if (sentence_boundary != YES) {
656 Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
661 best_begin = i.raw() - input.data();
670 if (phrase_len) output += hi_start;
673 const char* p = input.data();
677 if (phrase_len && --phrase_len == 0) output += hi_end;
679 best_pipe.pop_front();
685 list<vector<string>> & exact_phrases,
686 unordered_map<string, double> & loose_terms,
687 list<string> & wildcards,
688 size_t & longest_phrase)
696 loose_terms.insert(make_pair(qt.
get_term(), 0));
706 for (
size_t i = 0; i != n_subqs; ++i) {
708 goto non_term_subquery;
712 exact_phrases.push_back(vector<string>());
713 vector<string> & terms = exact_phrases.back();
714 terms.reserve(n_subqs);
715 for (
size_t i = 0; i != n_subqs; ++i) {
721 if (n_subqs > longest_phrase) longest_phrase = n_subqs;
726 for (
size_t i = 0; i != n_subqs; ++i)
728 wildcards, longest_phrase);
737 auto it = loose_terms.find(term);
738 if (it == loose_terms.end())
return NULL;
740 if (it->second == 0.0) {
744 loose_terms.erase(it);
748 it->second = relevance + max_tw;
754 MSet::Internal::snippet(
const string & text,
758 const string & hi_start,
759 const string & hi_end,
760 const string & omit)
const 762 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
767 bool try_word_break = (flags & MSet::SNIPPET_CJK_NGRAM);
768 if (!try_word_break) {
769 try_word_break =
true;
772 size_t term_start = 0;
773 double min_tw = 0, max_tw = 0;
774 if (stats) stats->get_max_termweight(min_tw, max_tw);
787 query = enquire->query;
791 list<vector<string>> exact_phrases;
792 unordered_map<string, double> loose_terms;
793 list<string> wildcards;
794 size_t longest_phrase = 0;
796 wildcards, longest_phrase);
798 vector<double> exact_phrases_relevance;
799 exact_phrases_relevance.reserve(exact_phrases.size());
800 for (
auto&& terms : exact_phrases) {
802 exact_phrases_relevance.push_back(max_tw * terms.size());
805 vector<double> wildcards_relevance;
806 wildcards_relevance.reserve(exact_phrases.size());
807 for (
auto&& pattern : wildcards) {
810 wildcards_relevance.push_back(max_tw + min_tw);
815 unordered_map<string, double>& background = snippet_bg_relevance;
817 vector<string> phrase;
818 if (longest_phrase) phrase.resize(longest_phrase - 1);
819 size_t phrase_next = 0;
820 bool matchfound =
false;
822 [&](
const string & term,
bool positional,
const Utf8Iterator & it) {
824 const size_t max_word_length = 64;
826 if (!positional)
return true;
827 if (term.size() > max_word_length)
return true;
832 size_t term_end = text.size() - it.
left();
834 double* relevance = 0;
835 size_t highlight = 0;
838 for (
auto&& terms : exact_phrases) {
839 if (term == terms.back()) {
840 size_t n = terms.size() - 1;
843 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
850 relevance = &exact_phrases_relevance[i];
851 highlight = terms.size();
858 relevance =
check_term(loose_terms, stats, term, max_tw);
867 relevance =
check_term(loose_terms, stats, stem, max_tw);
877 for (
auto&& pattern : wildcards) {
879 relevance = &wildcards_relevance[i];
888 auto bgit = background.find(term);
889 if (bgit == background.end()) bgit = background.find(stem);
890 if (bgit == background.end()) {
893 tf = enquire->db.get_termfreq(stem);
902 r = max_tw * log((num_docs - tf) /
double(tf));
903 r /= (length + 1) * log(
double(num_docs));
906 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
915 bgit = background.emplace(make_pair(stem, r)).first;
917 relevance = &bgit->second;
924 if (queryterms.find(term) != queryterms.end()) {
925 relevance = term.size() * 3;
929 if (queryterms.find(stem) != queryterms.end()) {
930 relevance = term.size() * 2;
945 if (term_start == 0) {
948 for (
size_t i = term_start; i + term.size() < term_end; ++i) {
958 if (longest_phrase) {
959 phrase[phrase_next] = term;
960 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
963 if (highlight) matchfound =
true;
965 if (!snip.
pump(relevance, term_end, highlight, flags))
return false;
967 term_start = term_end;
975 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
976 while (snip.
drain(text, hi_start, hi_end, omit, result)) { }
Unicode and UTF-8 related classes and functions.
bool is_none() const
Return true if this is a no-op stemmer.
The Xapian namespace contains public interfaces for the Xapian library.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Exhaustively evaluate candidate snippets in MSet::snippet().
const Query get_subquery(size_t n) const
Read a top level subquery.
static unsigned check_suffix(unsigned ch)
const char * raw() const
Return the raw const char* pointer for the current position.
const Xapian::Utf8Iterator & get_utf8iterator() const
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
bool drain(const string &input, const string &hi_start, const string &hi_end, const string &omit, string &output)
static double * check_term(unordered_map< string, double > &loose_terms, const Xapian::Weight::Internal *stats, const string &term, double max_tw)
size_t left() const
Return the number of bytes left in the iterator's buffer.
bool is_digit(unsigned ch)
Class representing a stemming algorithm.
bool U_isupper(unsigned ch)
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Number, decimal digit (Nd)
bool is_unbroken_script(unsigned p)
deque< Sniplet > best_pipe
Xapian::Internal::intrusive_ptr< Internal > internal
static Xapian::Stem stemmer
static void check_query(const Xapian::Query &query, list< vector< string >> &exact_phrases, unordered_map< string, double > &loose_terms, list< string > &wildcards, size_t &longest_phrase)
Iterator returning unigrams and bigrams.
static unsigned check_wordchar(unsigned ch)
TermGenerator class internals.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
const std::string & get_pattern() const
const std::string & get_term() const
stop_strategy
Stopper strategies, for use with set_stopper_strategy().
static void append_escaping_xml(const char *p, const char *end, string &output)
Match only documents where all subqueries match near and in order.
const unsigned UNICODE_IGNORE
Value representing "ignore this" when returned by check_infix() or check_infix_digit().
Class to hold statistics for a given collection.
size_t get_window() const
Model the relevancy of non-query terms in MSet::snippet().
static bool snippet_check_leading_nonwordchar(unsigned ch)
bool startswith(const std::string &s, char pfx)
int flags
For backward compatibility with Xapian 1.2.
bool pump(double *r, size_t t, size_t h, unsigned flags)
bool get_termweight(const std::string &term, double &termweight) const
Get the termweight.
unsigned check_infix_digit(unsigned ch)
Handle text without explicit word breaks.
bool should_stem(const string &term)
An iterator which returns Unicode character values from a UTF-8 encoded string.
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
static bool snippet_check_trailing_nonwordchar(unsigned ch)
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
size_t get_num_subqueries() const
Get the number of subqueries of the top level query.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
static void parse_terms(Utf8Iterator itor, bool try_word_break, bool with_positions, ACTION action)
Templated framework for processing terms.
Value returned by get_type() for a term.
Various handy helpers which std::string really should provide.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Punctuation, initial quote (Pi)
Sniplet(double *r, size_t t, size_t h)
op get_type() const
Get the type of the top level of the query.
category get_category(int info)
Punctuation, final quote (Pf)
bool is_ngram_enabled()
Should we use the n-gram code?
Class representing a query.
API for working with documents.
unsigned check_infix(unsigned ch)
bool unigram() const
Is this a unigram?
parsing a user query string to build a Xapian::Query object