xapian-core  2.0.0
honey_database.cc
Go to the documentation of this file.
1 
4 /* Copyright 2015,2017,2018,2022,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include "honey_database.h"
24 
25 #include "honey_alltermslist.h"
26 #include "honey_document.h"
27 #include "honey_metadata.h"
28 #include "honey_termlist.h"
30 #include "honey_valuelist.h"
31 
32 #include "backends/backends.h"
34 #include "backends/leafpostlist.h"
35 #include "xapian/error.h"
36 
37 #include <string_view>
38 
39 using namespace std;
40 
41 void
43 {
44  // Either the database has been closed, or else there's no termlist table.
45  // Check if the postlist table is open to determine which is the case.
46  if (!postlist_table.is_open())
48  throw Xapian::FeatureUnavailableError("Database has no termlist");
49 }
50 
51 // Relied on below - opening to read should allow the termlist to be missing.
52 static_assert(Xapian::DB_READONLY_ & Xapian::DB_NO_TERMLIST,
53  "Xapian::DB_READONLY_ should imply Xapian::DB_NO_TERMLIST");
54 
55 HoneyDatabase::HoneyDatabase(std::string_view path_, int flags)
56  : Xapian::Database::Internal(TRANSACTION_READONLY),
57  path(path_),
58  version_file(path_),
59  docdata_table(path, true),
60  postlist_table(path, true),
61  position_table(path, true),
62  spelling_table(path, true),
63  synonym_table(path, true),
64  // Note: (Xapian::DB_READONLY_ & Xapian::DB_NO_TERMLIST) is true, so
65  // opening to read we always allow the termlist to be missing.
66  termlist_table(path, true, (flags & Xapian::DB_NO_TERMLIST)),
67  value_manager(postlist_table, termlist_table)
68 {
70  auto rev = version_file.get_revision();
77 }
78 
79 HoneyDatabase::HoneyDatabase(int fd, int flags)
80  : Xapian::Database::Internal(TRANSACTION_READONLY),
81  version_file(fd),
82  docdata_table(fd, version_file.get_offset(), true),
83  postlist_table(fd, version_file.get_offset(), true),
84  position_table(fd, version_file.get_offset(), true),
85  spelling_table(fd, version_file.get_offset(), true),
86  synonym_table(fd, version_file.get_offset(), true),
87  // Note: (Xapian::DB_READONLY_ & Xapian::DB_NO_TERMLIST) is true, so
88  // opening to read we always allow the termlist to be missing.
89  termlist_table(fd, version_file.get_offset(), true,
90  (flags & Xapian::DB_NO_TERMLIST)),
91  value_manager(postlist_table, termlist_table)
92 {
94  auto rev = version_file.get_revision();
101 }
102 
104 {
105  delete doclen_cursor;
106 }
107 
108 void
110 {
111  (void)query;
112  // FIXME: Implement - pre-read the start of the postlist table?
113 }
114 
117 {
118  return version_file.get_doccount();
119 }
120 
123 {
124  return version_file.get_last_docid();
125 }
126 
129 {
131 }
132 
135 {
136  Assert(did != 0);
137  if (usual(did <= version_file.get_last_docid())) {
138  if (doclen_cursor == NULL) {
140  } else {
143  }
144  }
145 
146  // If exact is true, the desired docid is the last in this chunk.
147  bool exact =
150  if (exact)
151  return doclen_chunk_reader.back();
154  }
155  }
156  }
157 
158  string message = "Document ID not in use: ";
159  message += str(did);
160  throw Xapian::DocNotFoundError(message);
161 }
162 
165 {
166  Assert(did != 0);
167  return HoneyTermList(this, did).get_unique_terms();
168 }
169 
172 {
173  Assert(did != 0);
174  HoneyTermList termlist(this, did);
175  Xapian::termcount max_wdf = 0;
176  while (termlist.next() == NULL) {
177  Xapian::termcount current_wdf = termlist.get_wdf();
178  if (current_wdf > max_wdf) max_wdf = current_wdf;
179  }
180  return max_wdf;
181 }
182 
183 void
185  Xapian::doccount* termfreq_ptr,
186  Xapian::termcount* collfreq_ptr) const
187 {
188  postlist_table.get_freqs(term, termfreq_ptr, collfreq_ptr);
189 }
190 
193 {
194  return value_manager.get_value_freq(slot);
195 }
196 
197 string
199 {
201 }
202 
203 string
205 {
207 }
208 
211 {
213 }
214 
217 {
219 }
220 
223 {
225  // It's unlikely wdf is always 0, but when it is there's no need to do any
226  // further work.
227  if (usual(wdf_bound != 0)) {
228  // We don't store per-term wdf upper bounds currently, but
229  // HoneyPostListTable can provide an upper bound based on termfreq,
230  // coll_freq, and the first wdf value, which more often than not is
231  // actually the exact bound (in 77% of cases in an example database of
232  // wikipedia data).
233  wdf_bound = min(wdf_bound, postlist_table.get_wdf_upper_bound(term));
234  }
235  return wdf_bound;
236 }
237 
240 {
242 }
243 
246 {
248 }
249 
250 bool
252 {
253  if (term.empty())
254  return HoneyDatabase::get_doccount() != 0;
256 }
257 
258 bool
260 {
261  return !position_table.empty();
262 }
263 
264 PostList*
266 {
268 }
269 
271 HoneyDatabase::open_leaf_post_list(string_view term, bool need_read_pos) const
272 {
273  if (term.empty()) {
274  Assert(!need_read_pos);
276  if (rare(doccount == 0)) {
277  return nullptr;
278  }
279  if (doccount == get_lastdocid()) {
280  // The used docid range is exactly 1 to doccount inclusive.
282  }
283  return new HoneyAllDocsPostList(this, doccount);
284  }
285 
286  return postlist_table.open_post_list(this, term, need_read_pos);
287 }
288 
289 ValueList*
291 {
292  return new HoneyValueList(slot, this);
293 }
294 
295 TermList*
297 {
298  Assert(did != 0);
299  if (!termlist_table.is_open())
301  HoneyTermList* tl = new HoneyTermList(this, did);
302  if (tl->size() == 0) {
303  // It could be the document has no terms, but maybe it doesn't exist -
304  // in the latter case we ought to throw DocNotFoundError. FIXME: If
305  // the document has no terms, but does have values, we should be able
306  // to avoid this check.
307 
308  // Put the pointer in a unique_ptr so it gets released if an exception
309  // is thrown.
310  unique_ptr<TermList> tl_ptr(tl);
311 
312  // This will throw DocNotFoundError if did isn't in use.
313  (void)HoneyDatabase::get_doclength(did);
314  tl_ptr.release();
315  }
316  return tl;
317 }
318 
319 TermList*
321 {
322  // Same as open_term_list() except for MultiDatabase.
323  return HoneyDatabase::open_term_list(did);
324 }
325 
326 TermList*
327 HoneyDatabase::open_allterms(string_view prefix) const
328 {
329  return new HoneyAllTermsList(this, prefix);
330 }
331 
334 {
336 }
337 
340 {
341  Assert(did != 0);
342  if (!lazy) {
343  // This will throw DocNotFoundError if did isn't in use.
344  (void)HoneyDatabase::get_doclength(did);
345  }
346  return new HoneyDocument(this, did, &value_manager, &docdata_table);
347 }
348 
349 TermList*
351 {
352  return spelling_table.open_termlist(word);
353 }
354 
355 TermList*
357 {
358  auto cursor = spelling_table.cursor_get();
359  if (rare(cursor == NULL)) {
360  // No spelling table.
361  return NULL;
362  }
363  return new HoneySpellingWordsList(this, cursor);
364 }
365 
368 {
369  return spelling_table.get_word_frequency(word);
370 }
371 
372 void
373 HoneyDatabase::add_spelling(string_view word, Xapian::termcount freqinc) const
374 {
375  (void)word;
376  (void)freqinc;
377  throw Xapian::UnimplementedError("Honey backend doesn't support update");
378 }
379 
382  Xapian::termcount freqdec) const
383 {
384  (void)word;
385  (void)freqdec;
386  throw Xapian::UnimplementedError("Honey backend doesn't support update");
387 }
388 
389 TermList*
391 {
393 }
394 
395 TermList*
396 HoneyDatabase::open_synonym_keylist(string_view prefix) const
397 {
398  auto cursor = synonym_table.cursor_get();
399  if (rare(cursor == NULL)) {
400  // No synonym table.
401  return NULL;
402  }
403  return new HoneySynonymTermList(this, cursor, prefix);
404 }
405 
406 void
407 HoneyDatabase::add_synonym(string_view term, string_view synonym) const
408 {
409  (void)term;
410  (void)synonym;
411  throw Xapian::UnimplementedError("Honey backend doesn't support update");
412 }
413 
414 void
415 HoneyDatabase::remove_synonym(string_view term, string_view synonym) const
416 {
417  (void)term;
418  (void)synonym;
419  throw Xapian::UnimplementedError("Honey backend doesn't support update");
420 }
421 
422 void
424 {
425  (void)term;
426  throw Xapian::UnimplementedError("Honey backend doesn't support update");
427 }
428 
429 string
430 HoneyDatabase::get_metadata(string_view key) const
431 {
432  return postlist_table.get_metadata(key);
433 }
434 
435 TermList*
436 HoneyDatabase::open_metadata_keylist(string_view prefix) const
437 {
438  auto cursor = postlist_table.cursor_get();
439  Assert(cursor != NULL);
440  return new HoneyMetadataTermList(this, cursor, prefix);
441 }
442 
443 void
444 HoneyDatabase::set_metadata(string_view key, string_view value)
445 {
446  (void)key;
447  (void)value;
448  throw Xapian::UnimplementedError("Honey backend doesn't support update");
449 }
450 
451 bool
453 {
454  if (!postlist_table.is_open())
456  return false;
457 }
458 
459 void
461 {
462  docdata_table.close(true);
463  postlist_table.close(true);
464  position_table.close(true);
465  spelling_table.close(true);
466  synonym_table.close(true);
467  termlist_table.close(true);
468 }
469 
470 void
472 {
473  Assert(did != 0);
474  (void)did; // FIXME
475 }
476 
479 {
480  return version_file.get_revision();
481 }
482 
483 string
485 {
486  return version_file.get_uuid_string();
487 }
488 
489 int
490 HoneyDatabase::get_backend_info(string* path_ptr) const
491 {
492  if (path_ptr)
493  *path_ptr = path;
494  return BACKEND_HONEY;
495 }
496 
497 void
499  Xapian::docid& last) const
500 {
502  if (doccount == 0) {
503  // Empty database.
504  first = last = 0;
505  return;
506  }
507  auto last_docid = version_file.get_last_docid();
508  if (last_docid == doccount) {
509  // Contiguous range starting at 1.
510  first = 1;
511  last = last_docid;
512  return;
513  }
515 }
516 
517 string
519 {
520  string desc = "Honey(";
521  desc += path;
522  desc += ')';
523  return desc;
524 }
static Xapian::Query query(Xapian::Query::op op, const string &t1=string(), const string &t2=string(), const string &t3=string(), const string &t4=string(), const string &t5=string(), const string &t6=string(), const string &t7=string(), const string &t8=string(), const string &t9=string(), const string &t10=string())
Definition: api_anydb.cc:62
BACKEND_* constants.
@ BACKEND_HONEY
Definition: backends.h:30
A PostList iterating all docids when they form a contiguous range.
bool find_entry_ge(std::string_view key)
Definition: honey_cursor.h:110
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
HoneyCursor * doclen_cursor
Xapian::doccount get_spelling_frequency(std::string_view word) const
Return the number of times word was added as a spelling.
void add_spelling(std::string_view word, Xapian::termcount freqinc) const
Add a word to the spelling dictionary.
HoneySpellingTable spelling_table
void get_freqs(std::string_view term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Returns frequencies for a term.
Honey::DocLenChunkReader doclen_chunk_reader
HoneyCursor * get_postlist_cursor() const
TermList * open_spelling_wordlist() const
Return a termlist which returns the words which are spelling correction targets.
std::string path
Path of the directory.
void add_synonym(std::string_view term, std::string_view synonym) const
Add a synonym for a term.
HoneyVersion version_file
Version file ("iamhoney").
HoneyPostListTable postlist_table
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
void remove_synonym(std::string_view term, std::string_view synonym) const
Remove a synonym for a term.
void request_document(Xapian::docid did) const
Request a document.
Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy) const
Open a handle on a document.
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
PositionList * open_position_list(Xapian::docid did, std::string_view term) const
HoneyTermListTable termlist_table
int get_backend_info(std::string *path) const
Get backend information about this database.
bool reopen()
Reopen the database to the latest available revision.
TermList * open_synonym_termlist(std::string_view term) const
Open a termlist returning synonyms for a term.
std::string get_description() const
Return a string describing this object.
Xapian::termcount remove_spelling(std::string_view word, Xapian::termcount freqdec) const
Remove a word from the spelling dictionary.
TermList * open_allterms(std::string_view prefix) const
Xapian::termcount get_doclength(Xapian::docid did) const
void close()
Close the database.
Xapian::termcount get_unique_terms_upper_bound() const
Get an upper bound on the unique terms size of a document in this DB.
TermList * open_metadata_keylist(std::string_view prefix) const
Open a termlist returning each metadata key.
Xapian::termcount get_wdfdocmax(Xapian::docid did) const
Get the max wdf in document.
friend class HoneySpellingWordsList
std::string get_metadata(std::string_view key) const
Get the metadata associated with a given key.
void get_used_docid_range(Xapian::docid &first, Xapian::docid &last) const
Find lowest and highest docids actually in use.
HoneySynonymTable synonym_table
friend class HoneyTermList
Xapian::docid get_lastdocid() const
Return the last used document id of this (sub) database.
HoneyValueManager value_manager
friend class HoneyAllTermsList
HoneyPositionTable position_table
std::string get_uuid() const
Get a UUID for the database.
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
Get an upper bound on the wdf of term term.
PostList * open_post_list(std::string_view term) const
Return a PostList suitable for use in a PostingIterator.
Xapian::termcount get_unique_terms_lower_bound() const
Get a lower bound on the unique terms size of a document in this DB.
Xapian::rev get_revision() const
Get the current revision of the database.
TermList * open_term_list_direct(Xapian::docid did) const
Like open_term_list() but without MultiTermList wrapper.
TermList * open_synonym_keylist(std::string_view prefix) const
Open a termlist returning each term which has synonyms.
void throw_termlist_table_close_exception() const
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
bool has_positions() const
Check whether this database contains any positional information.
HoneyDocDataTable docdata_table
TermList * open_term_list(Xapian::docid did) const
void set_metadata(std::string_view key, std::string_view value)
Set the metadata associated with a given key.
void readahead_for_query(const Xapian::Query &query) const
ValueList * open_value_list(Xapian::valueno slot) const
Open a value stream.
LeafPostList * open_leaf_post_list(std::string_view term, bool need_read_pos) const
Create a LeafPostList for use during a match.
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Return the frequency of a given value slot.
HoneyDatabase(const HoneyDatabase &)=delete
Don't allow copying.
bool term_exists(std::string_view term) const
Xapian::doccount get_doccount() const
Xapian::totallength get_total_length() const
Return the total length of all documents in this database.
friend class HoneySynonymTermList
void clear_synonyms(std::string_view term) const
Clear all synonyms for a term.
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in document.
TermList * open_spelling_termlist(std::string_view word) const
Create a termlist tree from trigrams of word.
A document read from a HoneyDatabase.
HoneyPositionList * open_position_list(Xapian::docid did, std::string_view term) const
void get_used_docid_range(Xapian::doccount doccount, Xapian::docid &first, Xapian::docid &last) const
bool term_exists(std::string_view term) const
void get_freqs(std::string_view term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
HoneyPostList * open_post_list(const HoneyDatabase *db, std::string_view term, bool need_read_pos) const
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
std::string get_metadata(std::string_view key) const
TermList * open_termlist(std::string_view word)
Xapian::doccount get_word_frequency(std::string_view word) const
TermList * open_termlist(std::string_view term) const
Open synonym termlist for a term.
HoneyCursor * cursor_get() const
Definition: honey_table.cc:454
void close(bool permanent)
Definition: honey_table.h:622
bool is_open() const
Definition: honey_table.h:687
static void throw_database_closed()
Definition: honey_table.h:689
bool empty() const
Definition: honey_table.h:659
void open(int flags_, const Honey::RootInfo &root_info, honey_revision_number_t)
Definition: honey_table.cc:58
A TermList in a honey database.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
TermList * next()
Advance the current position to the next term in the termlist.
Xapian::termcount size() const
Return the number of entries in this termlist.
Honey class for value streams.
std::string get_value_upper_bound(Xapian::valueno slot) const
Definition: honey_values.h:194
std::string get_value_lower_bound(Xapian::valueno slot) const
Definition: honey_values.h:189
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Definition: honey_values.h:184
std::string get_uuid_string() const
Return UUID in the standard 36 character string format.
Xapian::termcount get_doclength_lower_bound() const
Xapian::docid get_last_docid() const
Xapian::totallength get_total_doclen() const
Xapian::termcount get_unique_terms_upper_bound() const
Xapian::termcount get_unique_terms_lower_bound() const
honey_revision_number_t get_revision() const
void read()
Read the version file and check it's a version we understand.
Xapian::termcount get_wdf_upper_bound() const
Xapian::doccount get_doccount() const
Xapian::termcount get_doclength_upper_bound() const
const Honey::RootInfo & get_root(Honey::table_type tbl) const
Xapian::termcount back()
Return the last document length in this chunk.
bool find_doclength(Xapian::docid target)
Searches the whole chunk (skip_to() only advances).
bool update(HoneyCursor *cursor)
Update to use the chunk currently pointed to by cursor.
Xapian::termcount get_doclength() const
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
Indicates an attempt to access a document not present in the database.
Definition: error.h:662
Abstract base class for a document.
Indicates an attempt to use a feature which is unavailable.
Definition: error.h:707
Abstract base class for postlists.
Definition: postlist.h:40
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Class representing a query.
Definition: query.h:45
Abstract base class for termlists.
Definition: termlist.h:42
UnimplementedError indicates an attempt to use an unimplemented feature.
Definition: error.h:313
Abstract base class for value streams.
Definition: valuelist.h:31
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
Iterate all document ids when they form a contiguous range.
string term
Hierarchy of classes which Xapian can throw as exceptions.
#define true
Definition: header.h:8
A termlist containing all terms in a honey database.
Database using honey backend.
A document read from a HoneyDatabase.
Access to metadata for a honey database.
A termlist containing all words which are spelling targets.
A TermList in a honey database.
Honey class for value streams.
Abstract base class for leaf postlists.
std::string make_doclenchunk_key(Xapian::docid last_did)
Generate a key for a doclen chunk.
@ TERMLIST
Definition: honey_defs.h:71
@ DOCDATA
Definition: honey_defs.h:70
@ SYNONYM
Definition: honey_defs.h:74
@ POSITION
Definition: honey_defs.h:72
@ SPELLING
Definition: honey_defs.h:73
@ POSTLIST
Definition: honey_defs.h:69
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
XAPIAN_REVISION_TYPE rev
Revision number of a database.
Definition: types.h:108
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
const int DB_NO_TERMLIST
When creating a database, don't create a termlist table.
Definition: constants.h:135
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
#define Assert(COND)
Definition: omassert.h:122