41 #include <unordered_map> 53 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
56 static inline unsigned 66 const unsigned int SHOULD_STEM_MASK =
80 static inline unsigned 83 if (ch ==
'\'' || ch ==
'&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
92 if (ch == 0x2019 || ch == 0x201b)
return '\'';
93 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
98 static inline unsigned 116 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
126 static inline unsigned 129 if (ch ==
'+' || ch ==
'#')
return ch;
140 template<
typename ACTION>
162 }
while (p != end && *p ==
'.' && ++p != end &&
U_isupper(*p));
165 if (term.size() > 1) {
182 const string& cjk_token = *tk;
183 if (!action(cjk_token, with_positions && tk.
unigram(),
213 unsigned infix_ch = *itor;
220 if (!infix_ch)
break;
221 if (infix_ch != UNICODE_IGNORE)
228 size_t len = term.size();
244 if (!action(term, with_positions, itor))
251 const string & prefix,
bool with_positions)
256 if (!stopper.get()) {
257 current_stop_mode = TermGenerator::STOP_NONE;
259 current_stop_mode = stop_mode;
264 #
if __cplusplus >= 201907L
269 ](
const string & term,
bool positional,
const Utf8Iterator &) {
270 if (term.size() > max_word_length)
return true;
272 if (current_stop_mode == TermGenerator::STOP_ALL &&
277 if (strategy == TermGenerator::STEM_SOME ||
278 strategy == TermGenerator::STEM_NONE ||
279 strategy == TermGenerator::STEM_SOME_FULL_POS) {
281 doc.add_posting(prefix + term, ++cur_pos, wdf_inc);
283 doc.add_term(prefix + term, wdf_inc);
289 if ((this->
flags & FLAG_SPELLING) && prefix.empty())
290 db.add_spelling(term);
295 if (strategy == TermGenerator::STEM_SOME ||
296 strategy == TermGenerator::STEM_SOME_FULL_POS) {
297 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
307 const string& stem =
stemmer(term);
308 if (
rare(stem.empty()))
return true;
310 if (strategy != TermGenerator::STEM_ALL) {
313 stemmed_term += prefix;
314 stemmed_term += stem;
315 if (strategy != TermGenerator::STEM_SOME && with_positions) {
316 if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
317 doc.add_posting(stemmed_term, cur_pos, wdf_inc);
319 doc.add_term(stemmed_term, wdf_inc);
333 : relevance(r), term_end(t), highlight(h) { }
349 size_t phrase_len = 0;
352 size_t best_begin = 0;
360 explicit SnipPipe(
size_t length_) : length(length_ + 1) { }
362 bool pump(
double* r,
size_t t,
size_t h,
unsigned flags);
366 bool drain(
const string &
input,
367 const string & hi_start,
368 const string & hi_end,
376 SnipPipe::pump(
double* r,
size_t t,
size_t h,
unsigned flags)
379 if (pipe.size() >= h - 1) {
384 auto & phrase_start = pipe[pipe.size() - (h - 1)];
385 if (phrase_start.relevance) {
386 *phrase_start.relevance *=
DECAY;
387 sum -= *phrase_start.relevance;
390 phrase_start.relevance = r;
391 phrase_start.highlight = h;
397 pipe.emplace_back(r, t, h);
406 while (t - begin > length ) {
407 const Sniplet& word = pipe.front();
413 if (best_end >= begin)
414 best_pipe.push_back(word);
418 if (
rare(pipe.empty()))
break;
423 if (sum >= best_sum) {
425 if (begin >= best_end) {
428 while (!best_pipe.empty() &&
429 best_pipe.front().term_end <= begin) {
430 best_pipe.pop_front();
437 if (best_sum > 0 && best_end < begin) {
450 if (begin >= best_end) {
455 while (
rare(!pipe.empty()) &&
456 pipe.back().term_end > best_end) {
543 SnipPipe::drain(
const string &
input,
544 const string & hi_start,
545 const string & hi_end,
549 if (best_pipe.empty() && !pipe.empty()) {
550 swap(best_pipe, pipe);
553 if (best_pipe.empty()) {
554 size_t tail_len = input.size() - best_end;
555 if (tail_len == 0)
return false;
560 bool sentence_end =
false;
567 sentence_end = (ch ==
'.' || ch ==
'?' || ch ==
'!');
580 i.assign(input.data() + best_end, tail_len);
581 int trailing_punc = 0;
584 if (++trailing_punc > 4) {
602 const Sniplet & word = best_pipe.front();
604 if (output.empty()) {
606 enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
611 switch (sentence_boundary) {
613 if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
614 sentence_boundary = PUNC;
619 sentence_boundary = YES;
620 }
else if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
623 sentence_boundary = NO;
634 size_t word_begin = i.
raw() - input.data();
635 if (word_begin - best_begin > 4) {
636 best_begin = word_begin;
642 best_begin = i.raw() - input.data();
648 if (sentence_boundary != YES) {
655 Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
660 best_begin = i.raw() - input.data();
669 if (phrase_len) output += hi_start;
672 const char* p = input.data();
676 if (phrase_len && --phrase_len == 0) output += hi_end;
678 best_pipe.pop_front();
684 list<vector<string>> & exact_phrases,
685 unordered_map<string, double> & loose_terms,
686 list<string> & wildcards,
687 size_t & longest_phrase)
695 loose_terms.insert(make_pair(qt.
get_term(), 0));
705 for (
size_t i = 0; i != n_subqs; ++i) {
707 goto non_term_subquery;
711 exact_phrases.push_back(vector<string>());
712 vector<string> & terms = exact_phrases.back();
713 terms.reserve(n_subqs);
714 for (
size_t i = 0; i != n_subqs; ++i) {
720 if (n_subqs > longest_phrase) longest_phrase = n_subqs;
725 for (
size_t i = 0; i != n_subqs; ++i)
727 wildcards, longest_phrase);
736 auto it = loose_terms.find(term);
737 if (it == loose_terms.end())
return NULL;
739 if (it->second == 0.0) {
743 loose_terms.erase(it);
747 it->second = relevance + max_tw;
753 MSet::Internal::snippet(
const string & text,
757 const string & hi_start,
758 const string & hi_end,
759 const string & omit)
const 761 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
766 bool cjk_ngram = (flags & MSet::SNIPPET_CJK_NGRAM);
771 size_t term_start = 0;
772 double min_tw = 0, max_tw = 0;
773 if (stats) stats->get_max_termweight(min_tw, max_tw);
786 query = enquire->query;
790 list<vector<string>> exact_phrases;
791 unordered_map<string, double> loose_terms;
792 list<string> wildcards;
793 size_t longest_phrase = 0;
795 wildcards, longest_phrase);
797 vector<double> exact_phrases_relevance;
798 exact_phrases_relevance.reserve(exact_phrases.size());
799 for (
auto&& terms : exact_phrases) {
801 exact_phrases_relevance.push_back(max_tw * terms.size());
804 vector<double> wildcards_relevance;
805 wildcards_relevance.reserve(exact_phrases.size());
806 for (
auto&& pattern : wildcards) {
809 wildcards_relevance.push_back(max_tw + min_tw);
814 unordered_map<string, double>& background = snippet_bg_relevance;
816 vector<string> phrase;
817 if (longest_phrase) phrase.resize(longest_phrase - 1);
818 size_t phrase_next = 0;
819 bool matchfound =
false;
821 [&](
const string & term,
bool positional,
const Utf8Iterator & it) {
823 const size_t max_word_length = 64;
825 if (!positional)
return true;
826 if (term.size() > max_word_length)
return true;
831 size_t term_end = text.size() - it.
left();
833 double* relevance = 0;
834 size_t highlight = 0;
837 for (
auto&& terms : exact_phrases) {
838 if (term == terms.back()) {
839 size_t n = terms.size() - 1;
842 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
849 relevance = &exact_phrases_relevance[i];
850 highlight = terms.size();
857 relevance =
check_term(loose_terms, stats, term, max_tw);
866 relevance =
check_term(loose_terms, stats, stem, max_tw);
876 for (
auto&& pattern : wildcards) {
878 relevance = &wildcards_relevance[i];
887 auto bgit = background.find(term);
888 if (bgit == background.end()) bgit = background.find(stem);
889 if (bgit == background.end()) {
892 tf = enquire->db.get_termfreq(stem);
901 r = max_tw * log((num_docs - tf) /
double(tf));
902 r /= (length + 1) * log(
double(num_docs));
905 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
914 bgit = background.emplace(make_pair(stem, r)).first;
916 relevance = &bgit->second;
923 if (queryterms.find(term) != queryterms.end()) {
924 relevance = term.size() * 3;
928 if (queryterms.find(stem) != queryterms.end()) {
929 relevance = term.size() * 2;
944 if (term_start == 0) {
947 for (
size_t i = term_start; i + term.size() < term_end; ++i) {
957 if (longest_phrase) {
958 phrase[phrase_next] = term;
959 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
962 if (highlight) matchfound =
true;
964 if (!snip.
pump(relevance, term_end, highlight, flags))
return false;
966 term_start = term_end;
974 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
975 while (snip.
drain(text, hi_start, hi_end, omit, result)) { }
Unicode and UTF-8 related classes and functions.
bool is_none() const
Return true if this is a no-op stemmer.
The Xapian namespace contains public interfaces for the Xapian library.
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
Exhaustively evaluate candidate snippets in MSet::snippet().
const Query get_subquery(size_t n) const
Read a top level subquery.
static unsigned check_suffix(unsigned ch)
const char * raw() const
Return the raw const char* pointer for the current position.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
bool drain(const string &input, const string &hi_start, const string &hi_end, const string &omit, string &output)
static double * check_term(unordered_map< string, double > &loose_terms, const Xapian::Weight::Internal *stats, const string &term, double max_tw)
size_t left() const
Return the number of bytes left in the iterator's buffer.
bool is_digit(unsigned ch)
Class representing a stemming algorithm.
bool U_isupper(unsigned ch)
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
Number, decimal digit (Nd)
bool is_cjk_enabled()
Should we use the CJK n-gram code?
bool unigram() const
Is this a unigram?
deque< Sniplet > best_pipe
Xapian::Internal::intrusive_ptr< Internal > internal
static Xapian::Stem stemmer
bool codepoint_is_cjk(unsigned codepoint)
static void check_query(const Xapian::Query &query, list< vector< string >> &exact_phrases, unordered_map< string, double > &loose_terms, list< string > &wildcards, size_t &longest_phrase)
static unsigned check_wordchar(unsigned ch)
TermGenerator class internals.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
const std::string & get_pattern() const
const std::string & get_term() const
stop_strategy
Stopper strategies, for use with set_stopper_strategy().
static void append_escaping_xml(const char *p, const char *end, string &output)
Match only documents where all subqueries match near and in order.
const unsigned UNICODE_IGNORE
Value representing "ignore this" when returned by check_infix() or check_infix_digit().
Class to hold statistics for a given collection.
size_t get_window() const
Model the relevancy of non-query terms in MSet::snippet().
Tokenise CJK text as n-grams.
static bool snippet_check_leading_nonwordchar(unsigned ch)
bool startswith(const std::string &s, char pfx)
int flags
For backward compatibility with Xapian 1.2.
bool pump(double *r, size_t t, size_t h, unsigned flags)
bool get_termweight(const std::string &term, double &termweight) const
Get the termweight.
unsigned check_infix_digit(unsigned ch)
Iterator returning unigrams and bigrams.
static void parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
Templated framework for processing terms.
bool should_stem(const string &term)
An iterator which returns Unicode character values from a UTF-8 encoded string.
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
static bool snippet_check_trailing_nonwordchar(unsigned ch)
const Xapian::Utf8Iterator & get_utf8iterator() const
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
size_t get_num_subqueries() const
Get the number of subqueries of the top level query.
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Value returned by get_type() for a term.
Various handy helpers which std::string really should provide.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
Punctuation, initial quote (Pi)
Sniplet(double *r, size_t t, size_t h)
op get_type() const
Get the type of the top level of the query.
category get_category(int info)
Punctuation, final quote (Pf)
Class representing a query.
API for working with documents.
unsigned check_infix(unsigned ch)
parsing a user query string to build a Xapian::Query object