41 #include <string_view>
42 #include <unordered_map>
54 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
57 static inline unsigned
67 const unsigned int SHOULD_STEM_MASK =
81 static inline unsigned
84 if (ch ==
'\'' || ch ==
'&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
93 if (ch == 0x2019 || ch == 0x201b)
return '\'';
96 if (ch >= 0x200c && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
104 static inline unsigned
122 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
132 static inline unsigned
135 if (ch ==
'+' || ch ==
'#')
return ch;
// Compile-time guard: the snippet "word breaks" flag and the term generator
// "word breaks" flag must have the same numeric value.  The shared
// word-splitting template code tests its break_flags argument against
// MSet::SNIPPET_WORD_BREAKS, while the TermGenerator side builds break_flags
// by masking its own flags with FLAG_WORD_BREAKS, so both callers can only
// share that code if the two constants agree.
140 static_assert(
int(MSet::SNIPPET_WORD_BREAKS) == TermGenerator::FLAG_WORD_BREAKS,
141 "WORD_BREAKS flags have same value");
143 template<
typename ACTION>
149 if (break_flags & MSet::SNIPPET_WORD_BREAKS) {
150 const char* start = itor.
raw();
154 size_t left = itor.
raw() - start;
155 for (WordIterator tk(start, left); tk != WordIterator(); ++tk) {
156 const string& token = *tk;
157 left -= token.length();
158 if (!action(token, with_positions, itor.
left() + left))
169 const string& token = *tk;
171 bool with_pos = with_positions && tk.
unigram();
187 template<
typename ACTION>
210 }
while (
p != end && *
p ==
'.' && ++
p != end &&
U_isupper(*
p));
213 if (
term.size() > 1) {
226 if (!
break_words(itor, break_flags, with_positions, action))
251 unsigned infix_ch = *itor;
258 if (!infix_ch)
break;
266 size_t len =
term.size();
282 if (!action(
term, with_positions, itor.
left()))
289 string_view prefix,
bool with_positions)
292 if (
flags & FLAG_WORD_BREAKS) {
294 "building Xapian to use ICU");
297 unsigned break_flags =
flags & (FLAG_NGRAMS | FLAG_WORD_BREAKS);
299 break_flags = FLAG_NGRAMS;
304 current_stop_mode = TermGenerator::STOP_NONE;
306 current_stop_mode = stop_mode;
313 string prefixed_term;
314 auto prefix_size = prefix.size();
315 prefixed_term.reserve(prefix_size + max_word_length);
316 prefixed_term.assign(prefix);
318 string prefixed_stemmed_term;
319 int add_z = (strategy != TermGenerator::STEM_ALL);
320 prefixed_stemmed_term.reserve(add_z + prefix_size + max_word_length);
321 if (add_z) prefixed_stemmed_term.assign(1,
'Z');
322 prefixed_stemmed_term.append(prefix);
323 auto prefixed_stemmed_size = prefixed_stemmed_term.size();
326 [=, &prefixed_term, &prefixed_stemmed_term
327 #
if __cplusplus >= 201907L
332 ](
const string &
term,
bool positional,
size_t) {
333 if (
term.size() > max_word_length)
return true;
335 if (current_stop_mode == TermGenerator::STOP_ALL &&
340 if (strategy == TermGenerator::STEM_SOME ||
341 strategy == TermGenerator::STEM_NONE ||
342 strategy == TermGenerator::STEM_SOME_FULL_POS) {
343 prefixed_term.append(
term);
345 if (
rare(cur_pos >= pos_limit))
347 doc.add_posting(prefixed_term, ++cur_pos, wdf_inc);
349 doc.add_term(prefixed_term, wdf_inc);
351 prefixed_term.resize(prefix_size);
356 if ((this->
flags & FLAG_SPELLING) && prefix_size == 0)
357 db.add_spelling(
term);
362 if (strategy == TermGenerator::STEM_SOME ||
363 strategy == TermGenerator::STEM_SOME_FULL_POS) {
364 if (current_stop_mode == TermGenerator::STOP_STEMMED &&
375 if (
rare(stem.empty()))
return true;
376 prefixed_stemmed_term.append(stem);
377 if (strategy != TermGenerator::STEM_SOME && positional) {
378 if (strategy != TermGenerator::STEM_SOME_FULL_POS) {
379 if (
rare(cur_pos >= pos_limit))
383 doc.add_posting(prefixed_stemmed_term, cur_pos, wdf_inc);
385 doc.add_term(prefixed_stemmed_term, wdf_inc);
387 prefixed_stemmed_term.resize(prefixed_stemmed_size);
401 : relevance(r), term_end(t), highlight(h) { }
417 size_t phrase_len = 0;
420 size_t best_begin = 0;
// Construct a pipe from the requested snippet length.  The `length` member
// is initialised to length_ + 1 rather than length_; the windowing loop
// later compares (t - begin) against this stored value.
// NOTE(review): the reason for the extra 1 is not visible in this excerpt -
// confirm against the class comments before changing it.
428 explicit SnipPipe(
size_t length_) : length(length_ + 1) { }
// Feed one term into the pipe.
//   r      pointer to the relevance value for the term; the caller starts
//          from a null pointer, and the definition checks a stored relevance
//          pointer before dereferencing it.
//   t      offset in the text at which the term ends (the caller computes
//          it as text.size() minus the bytes left to parse); stored as the
//          sniplet's term_end.
//   h      highlight value stored with the sniplet; the caller starts from 0
//          and sets it to the number of terms when an exact phrase matches.
//   flags  flags value handed through by the caller.
// The caller stops processing further terms when this returns false.
430 bool pump(
double* r,
size_t t,
size_t h,
unsigned flags);
434 bool drain(string_view
input,
435 string_view hi_start,
444 SnipPipe::pump(
double* r,
size_t t,
size_t h,
unsigned flags)
447 if (pipe.size() >= h - 1) {
452 auto & phrase_start = pipe[pipe.size() - (h - 1)];
453 if (phrase_start.relevance) {
454 *phrase_start.relevance *=
DECAY;
455 sum -= *phrase_start.relevance;
458 phrase_start.relevance = r;
459 phrase_start.highlight = h;
465 pipe.emplace_back(r, t, h);
474 while (t - begin > length ) {
475 const Sniplet& word = pipe.front();
481 if (best_end >= begin)
482 best_pipe.push_back(word);
486 if (
rare(pipe.empty()))
break;
491 if (sum >= best_sum) {
493 if (begin >= best_end) {
496 while (!best_pipe.empty() &&
497 best_pipe.front().term_end <= begin) {
498 best_pipe.pop_front();
505 if (best_sum > 0 && best_end < begin) {
518 if (begin >= best_end) {
523 while (
rare(!pipe.empty()) &&
524 pipe.back().term_end > best_end) {
612 string_view hi_start,
617 if (best_pipe.empty() && !pipe.empty()) {
618 swap(best_pipe, pipe);
621 if (best_pipe.empty()) {
622 size_t tail_len =
input.size() - best_end;
623 if (tail_len == 0)
return false;
628 bool sentence_end =
false;
635 sentence_end = (ch ==
'.' || ch ==
'?' || ch ==
'!');
649 int trailing_punc = 0;
652 if (++trailing_punc > 4) {
670 const Sniplet & word = best_pipe.front();
672 if (output.empty()) {
674 enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;
679 switch (sentence_boundary) {
681 if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
682 sentence_boundary = PUNC;
687 sentence_boundary = YES;
688 }
else if (ch ==
'.' || ch ==
'?' || ch ==
'!') {
691 sentence_boundary = NO;
702 size_t word_begin = i.
raw() -
input.data();
703 if (word_begin - best_begin > 4) {
704 best_begin = word_begin;
710 best_begin = i.
raw() -
input.data();
716 if (sentence_boundary != YES) {
728 best_begin = i.
raw() -
input.data();
737 if (phrase_len) output += hi_start;
740 const char*
p =
input.data();
744 if (phrase_len && --phrase_len == 0) output += hi_end;
746 best_pipe.pop_front();
752 list<vector<string>> & exact_phrases,
753 unordered_map<string, double> & loose_terms,
754 list<const Xapian::Internal::QueryWildcard*> & wildcards,
755 list<const Xapian::Internal::QueryEditDistance*> & fuzzies,
756 size_t & longest_phrase)
764 loose_terms.insert(make_pair(qt.
get_term(), 0));
767 const QueryWildcard* qw =
769 wildcards.push_back(qw);
772 const QueryEditDistance* qed =
774 fuzzies.push_back(qed);
780 for (
size_t i = 0; i != n_subqs; ++i) {
782 goto non_term_subquery;
786 exact_phrases.push_back(vector<string>());
787 vector<string> & terms = exact_phrases.back();
788 terms.reserve(n_subqs);
789 for (
size_t i = 0; i != n_subqs; ++i) {
795 if (n_subqs > longest_phrase) longest_phrase = n_subqs;
800 for (
size_t i = 0; i != n_subqs; ++i)
802 wildcards, fuzzies, longest_phrase);
811 auto it = loose_terms.find(
term);
812 if (it == loose_terms.end())
return NULL;
814 if (it->second == 0.0) {
818 loose_terms.erase(it);
822 it->second = relevance + max_tw;
828 MSet::Internal::snippet(string_view text,
832 string_view hi_start,
834 string_view omit)
const
836 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
842 if (flags & MSet::SNIPPET_WORD_BREAKS) {
844 "building Xapian to use ICU");
847 auto SNIPPET_BREAK_MASK = MSet::SNIPPET_NGRAMS | MSet::SNIPPET_WORD_BREAKS;
848 unsigned break_flags = flags & SNIPPET_BREAK_MASK;
850 break_flags = MSet::SNIPPET_NGRAMS;
853 size_t term_start = 0;
854 double min_tw = 0, max_tw = 0;
855 if (stats) stats->get_max_termweight(min_tw, max_tw);
868 query = enquire->query;
872 list<vector<string>> exact_phrases;
873 unordered_map<string, double> loose_terms;
874 list<const Xapian::Internal::QueryWildcard*> wildcards;
875 list<const Xapian::Internal::QueryEditDistance*> fuzzies;
876 size_t longest_phrase = 0;
878 wildcards, fuzzies, longest_phrase);
880 vector<double> exact_phrases_relevance;
881 exact_phrases_relevance.reserve(exact_phrases.size());
882 for (
auto&& terms : exact_phrases) {
884 exact_phrases_relevance.push_back(max_tw * terms.size());
887 vector<double> wildcards_relevance;
888 wildcards_relevance.reserve(wildcards.size());
889 for (
auto&& pattern : wildcards) {
892 wildcards_relevance.push_back(max_tw + min_tw);
895 vector<double> fuzzies_relevance;
896 fuzzies_relevance.reserve(fuzzies.size());
897 for (
auto&& pattern : fuzzies) {
900 fuzzies_relevance.push_back(max_tw + min_tw);
905 unordered_map<string, double>& background = snippet_bg_relevance;
907 vector<string> phrase;
908 if (longest_phrase) phrase.resize(longest_phrase - 1);
909 size_t phrase_next = 0;
910 bool matchfound =
false;
912 [&](
const string &
term,
bool positional,
size_t left) {
914 const size_t max_word_length = 64;
916 if (!positional)
return true;
917 if (
term.size() > max_word_length)
return true;
922 size_t term_end = text.size() - left;
924 double* relevance = 0;
925 size_t highlight = 0;
928 for (
auto&& terms : exact_phrases) {
929 if (
term == terms.back()) {
930 size_t n = terms.size() - 1;
932 while (UNSIGNED_OVERFLOW_OK(n--)) {
933 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
940 relevance = &exact_phrases_relevance[i];
941 highlight = terms.size();
957 relevance =
check_term(loose_terms, stats.get(), stem, max_tw);
967 for (
auto&& qw : wildcards) {
968 if (qw->test(
term)) {
969 relevance = &wildcards_relevance[i];
979 for (
auto&& qed : fuzzies) {
981 int ed_result = qed->test(
term);
987 relevance = &fuzzies_relevance[i];
996 auto bgit = background.find(
term);
997 if (bgit == background.end()) bgit = background.find(stem);
998 if (bgit == background.end()) {
1001 tf = enquire->db.get_termfreq(stem);
1010 r = max_tw * log((num_docs - tf) /
double(tf));
1011 r /= (length + 1) * log(
double(num_docs));
1014 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
1023 bgit = background.emplace(make_pair(stem, r)).first;
1025 relevance = &bgit->second;
1032 if (queryterms.find(
term) != queryterms.end()) {
1033 relevance =
term.size() * 3;
1037 if (queryterms.find(stem) != queryterms.end()) {
1038 relevance =
term.size() * 2;
1053 if (term_start == 0) {
1056 for (
size_t i = term_start; i +
term.size() < term_end; ++i) {
1066 if (longest_phrase) {
1067 phrase[phrase_next] =
term;
1068 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
1071 if (highlight) matchfound =
true;
1073 if (!snip.
pump(relevance, term_end, highlight, flags))
return false;
1075 term_start = term_end;
1083 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
1084 while (snip.drain(text, hi_start, hi_end, omit, result)) { }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
if(!(properties &BACKEND))
Iterator returning unigrams and bigrams.
bool unigram() const
Is this a unigram?
const Xapian::Utf8Iterator & get_utf8iterator() const
Indicates an attempt to use a feature which is unavailable.
const std::string & get_term() const
size_t get_window() const
@ SNIPPET_BACKGROUND_MODEL
Model the relevancy of non-query terms in MSet::snippet().
@ SNIPPET_EXHAUSTIVE
Exhaustively evaluate candidate snippets in MSet::snippet().
Class representing a query.
const Query get_subquery(size_t n) const
Read a top level subquery.
op get_type() const noexcept
Get the type of the top level of the query.
size_t get_num_subqueries() const noexcept
Get the number of subqueries of the top level query.
@ OP_WILDCARD
Wildcard expansion.
@ OP_PHRASE
Match only documents where all subqueries match near and in order.
@ OP_EDIT_DISTANCE
Edit distance expansion.
@ LEAF_TERM
Value returned by get_type() for a term.
Xapian::Internal::intrusive_ptr< Internal > internal
RangeError indicates an attempt to access outside the bounds of a container.
bool pump(double *r, size_t t, size_t h, unsigned flags)
deque< Sniplet > best_pipe
Class representing a stemming algorithm.
bool is_none() const
Return true if this is a no-op stemmer.
int flags
For backward compatibility with Xapian 1.2.
stop_strategy
Stopper strategies, for use with set_stopper_strategy().
An iterator which returns Unicode character values from a UTF-8 encoded string.
const char * raw() const
Return the raw const char* pointer for the current position.
size_t left() const
Return the number of bytes left in the iterator's buffer.
void assign(const char *p_, size_t len)
Assign a new string to the iterator.
Class to hold statistics for a given collection.
bool get_termweight(std::string_view term, double &termweight) const
Get the termweight.
Class representing a document.
category get_category(int info)
void append_utf8(std::string &s, unsigned ch)
Append the UTF-8 representation of a single Unicode character to a std::string.
unsigned tolower(unsigned ch)
Convert a Unicode character to lowercase.
@ FINAL_QUOTE_PUNCTUATION
Punctuation, final quote (Pf)
@ INITIAL_QUOTE_PUNCTUATION
Punctuation, initial quote (Pi)
@ LOWERCASE_LETTER
Letter, lowercase (Ll)
@ MODIFIER_LETTER
Letter, modifier (Lm)
@ OTHER_LETTER
Letter, other (Lo)
@ DECIMAL_DIGIT_NUMBER
Number, decimal digit (Nd)
@ CLOSE_PUNCTUATION
Punctuation, close (Pe)
@ OPEN_PUNCTUATION
Punctuation, open (Ps)
@ TITLECASE_LETTER
Letter, titlecase (Lt)
@ UPPERCASE_LETTER
Letter, uppercase (Lu)
bool is_wordchar(unsigned ch)
Test if a given Unicode character is a "word character".
bool is_currency(unsigned ch)
Test if a given Unicode character is a currency symbol.
bool is_whitespace(unsigned ch)
Test if a given Unicode character is a whitespace character.
The Xapian namespace contains public interfaces for the Xapian library.
static bool snippet_check_leading_nonwordchar(unsigned ch)
static double * check_term(unordered_map< string, double > &loose_terms, const Xapian::Weight::Internal *stats, const string &term, double max_tw)
static void append_escaping_xml(const char *p, const char *end, string &output)
static bool snippet_check_trailing_nonwordchar(unsigned ch)
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
static void check_query(const Xapian::Query &query, list< vector< string >> &exact_phrases, unordered_map< string, double > &loose_terms, list< const Xapian::Internal::QueryWildcard * > &wildcards, list< const Xapian::Internal::QueryEditDistance * > &fuzzies, size_t &longest_phrase)
static unsigned check_suffix(unsigned ch)
static unsigned check_wordchar(unsigned ch)
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
static void parse_terms(Utf8Iterator itor, unsigned break_flags, bool with_positions, ACTION action)
Templated framework for processing terms.
static bool break_words(Utf8Iterator &itor, unsigned break_flags, bool with_positions, ACTION action)
Parsing of a user query string to build a Xapian::Query object.
const unsigned UNICODE_IGNORE
Value representing "ignore this" when returned by check_infix() or check_infix_digit().
bool should_stem(const string &term, const State &state)
bool U_isupper(unsigned ch)
bool is_digit(unsigned ch)
unsigned check_infix_digit(unsigned ch)
unsigned check_infix(unsigned ch)
static Xapian::Stem stemmer
Various handy string-related helpers.
Sniplet(double *r, size_t t, size_t h)
TermGenerator class internals.
Unicode and UTF-8 related classes and functions.
bool is_unbroken_script(unsigned p)
bool is_unbroken_wordchar(unsigned p)
size_t get_unbroken(Xapian::Utf8Iterator &it)
bool is_ngram_enabled()
Should we use the n-gram code?
Handle text without explicit word breaks.