xapian-core  2.0.0
honey_synonym.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2011,2017,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 #include "honey_synonym.h"
23 
24 #include "xapian/error.h"
25 
26 #include "honey_cursor.h"
27 #include "honey_database.h"
28 #include "debuglog.h"
29 #include "stringutils.h"
30 #include "api/vectortermlist.h"
31 
32 #include <set>
33 #include <string>
34 #include <string_view>
35 #include <vector>
36 
37 using namespace std;
38 
39 // We XOR the length values with this so that they are more likely to coincide
40 // with lower case ASCII letters, which are likely to be common. This means
41 // that zlib should do a better job of compressing tag values.
42 #define MAGIC_XOR_VALUE 96
43 
44 void
46 {
47  if (last_term.empty()) return;
48 
49  if (last_synonyms.empty()) {
50  del(last_term);
51  } else {
52  string tag;
53 
54  for (auto&& synonym : last_synonyms) {
55  tag += uint8_t(synonym.size() ^ MAGIC_XOR_VALUE);
56  tag += synonym;
57  }
58 
59  add(last_term, tag);
60  last_synonyms.clear();
61  }
62  last_term.resize(0);
63 }
64 
65 void
66 HoneySynonymTable::add_synonym(string_view term, string_view synonym)
67 {
68  if (last_term != term) {
69  merge_changes();
70  last_term = term;
71 
72  string tag;
73  if (get_exact_entry(term, tag)) {
74  const char* p = tag.data();
75  const char* end = p + tag.size();
76  while (p != end) {
77  size_t len;
78  if (p == end ||
79  (len = uint8_t(*p) ^ MAGIC_XOR_VALUE) >= size_t(end - p))
80  throw Xapian::DatabaseCorruptError("Bad synonym data");
81  ++p;
82  last_synonyms.insert(string(p, len));
83  p += len;
84  }
85  }
86  }
87 
88  last_synonyms.emplace(synonym);
89 }
90 
91 void
92 HoneySynonymTable::remove_synonym(string_view term, string_view synonym)
93 {
94  if (last_term != term) {
95  merge_changes();
96  last_term = term;
97 
98  string tag;
99  if (get_exact_entry(term, tag)) {
100  const char* p = tag.data();
101  const char* end = p + tag.size();
102  while (p != end) {
103  size_t len;
104  if (p == end ||
105  (len = uint8_t(*p) ^ MAGIC_XOR_VALUE) >= size_t(end - p))
106  throw Xapian::DatabaseCorruptError("Bad synonym data");
107  ++p;
108  last_synonyms.emplace(p, len);
109  p += len;
110  }
111  }
112  }
113 
114 #ifdef __cpp_lib_associative_heterogeneous_erasure // C++23
115  last_synonyms.erase(synonym);
116 #else
117  last_synonyms.erase(string(synonym));
118 #endif
119 }
120 
121 void
123 {
124  // We don't actually ever need to merge_changes() here, but it's quite
125  // likely that someone might clear_synonyms() and then add_synonym() for
126  // the same term. The alternative we could otherwise optimise for (modify
127  // synonyms for a term, then clear those for another, then modify those for
128  // the first term again) seems much less likely.
129  if (last_term == term) {
130  last_synonyms.clear();
131  } else {
132  merge_changes();
133  last_term = term;
134  }
135 }
136 
137 TermList*
139 {
140  vector<string> synonyms;
141 
142  if (last_term == term) {
143  if (last_synonyms.empty()) return NULL;
144 
145  synonyms.reserve(last_synonyms.size());
146  for (auto&& i : last_synonyms) {
147  synonyms.push_back(i);
148  }
149  } else {
150  string tag;
151  if (!get_exact_entry(term, tag)) return NULL;
152 
153  const char* p = tag.data();
154  const char* end = p + tag.size();
155  while (p != end) {
156  size_t len;
157  if (p == end ||
158  (len = uint8_t(*p) ^ MAGIC_XOR_VALUE) >= size_t(end - p))
159  throw Xapian::DatabaseCorruptError("Bad synonym data");
160  ++p;
161  synonyms.push_back(string(p, len));
162  p += len;
163  }
164  }
165 
166  return new VectorTermList(synonyms.begin(), synonyms.end());
167 }
168 
170 
172 {
173  LOGCALL_DTOR(DB, "HoneySynonymTermList");
174  delete cursor;
175 }
176 
179 {
180  // This is an over-estimate, but we only use this value to build a balanced
181  // or-tree, and it'll do a decent enough job for that.
182  return database->synonym_table.get_approx_entry_count();
183 }
184 
187 {
188  throw Xapian::InvalidOperationError("HoneySynonymTermList::get_termfreq() "
189  "not meaningful");
190 }
191 
192 TermList*
194 {
195  LOGCALL(DB, TermList*, "HoneySynonymTermList::next", NO_ARGS);
196  if (cursor->after_end()) {
197  // This is the first action on a new HoneySynonymTermList.
198  if (cursor->find_entry_ge(prefix))
199  RETURN(NULL);
200  } else {
201  cursor->next();
202  }
203  if (cursor->after_end() || !startswith(cursor->current_key, prefix)) {
204  // We've reached the end of the prefixed terms.
205  RETURN(this);
206  }
207  current_term = cursor->current_key;
208 
209  RETURN(NULL);
210 }
211 
212 TermList*
214 {
215  LOGCALL(DB, TermList*, "HoneySynonymTermList::skip_to", term);
216  if (cursor->after_end() && prefix > term) {
217  // This is the first action on a new HoneySynonymTermList and we were
218  // asked to skip to a term before the prefix - this ought to leave us
219  // on the first term with the specified prefix.
220  RETURN(skip_to(prefix));
221  }
222 
223  if (cursor->find_entry_ge(term)) {
224  // Exact match.
225  current_term = term;
226  } else {
227  // The exact term we asked for isn't there, so check if the next
228  // term after it also has the right prefix.
229  if (cursor->after_end() || !startswith(cursor->current_key, prefix)) {
230  // We've reached the end of the prefixed terms.
231  RETURN(this);
232  }
233  current_term = cursor->current_key;
234  }
235  RETURN(NULL);
236 }
void clear_synonyms(std::string_view term)
Remove all synonyms for term.
void add_synonym(std::string_view term, std::string_view synonym)
Add a synonym for term.
void remove_synonym(std::string_view term, std::string_view synonym)
Remove a synonym for term.
TermList * open_termlist(std::string_view term) const
Open synonym termlist for a term.
TermList * skip_to(std::string_view term)
Advance to the first term which is >= term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
~HoneySynonymTermList()
Destructor.
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
TermList * next()
Advance to the next term in the list.
This class stores a list of terms.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
InvalidOperationError indicates the API was used in an invalid way.
Definition: error.h:271
Abstract base class for termlists.
Definition: termlist.h:42
string term
PositionList * p
Debug logging macros.
#define RETURN(...)
Definition: debuglog.h:484
#define LOGCALL(CATEGORY, TYPE, FUNC, PARAMS)
Definition: debuglog.h:478
#define LOGCALL_DTOR(CATEGORY, CLASS)
Definition: debuglog.h:481
Hierarchy of classes which Xapian can throw as exceptions.
HoneyCursor class.
Database using honey backend.
#define MAGIC_XOR_VALUE
Synonym data for a honey database.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
Various handy string-related helpers.
bool startswith(std::string_view s, char pfx)
Definition: stringutils.h:56
A vector-like container of terms which can be iterated.