xapian-core  2.0.0
database.cc
Go to the documentation of this file.
1 
4 /* Copyright 2006-2024 Olly Betts
5  * Copyright 2007,2008,2009 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include <xapian/database.h>
25 
29 #include "debuglog.h"
30 #include "editdistance.h"
31 #include "omassert.h"
33 #include <xapian/constants.h>
34 #include <xapian/error.h>
36 #include <xapian/postingiterator.h>
37 #include <xapian/termiterator.h>
38 #include <xapian/unicode.h>
39 
40 #include <algorithm>
41 #include <cstdlib> // For abs().
42 #include <memory>
43 #include <string>
44 #include <vector>
45 
46 using namespace std;
47 
48 [[noreturn]]
49 static void docid_zero_invalid()
50 {
51  throw Xapian::InvalidArgumentError("Document ID 0 is invalid");
52 }
53 
54 [[noreturn]]
55 static void empty_metadata_key()
56 {
57  throw Xapian::InvalidArgumentError("Empty metadata keys are invalid");
58 }
59 
60 [[noreturn]]
61 static void empty_term_invalid()
62 {
63  throw Xapian::InvalidArgumentError("Empty terms are invalid");
64 }
65 
66 namespace Xapian {
67 
68 Database::Database(Database::Internal* internal_)
69  : internal(internal_)
70 {
71 }
72 
73 Database::Database(const Database&) = default;
74 
75 Database&
76 Database::operator=(const Database&) = default;
77 
78 Database::Database(Database&&) = default;
79 
80 Database&
81 Database::operator=(Database&&) = default;
82 
84  : internal(new EmptyDatabase)
85 {
86 }
87 
89 {
90 }
91 
92 bool
94 {
95  return internal->reopen();
96 }
97 
98 void
100 {
101  internal->close();
102 }
103 
104 size_t
106 {
107  return internal->size();
108 }
109 
110 void
111 Database::add_database_(const Database& o, bool read_only)
112 {
113  if (this == &o) {
114  const char* msg = read_only ?
115  "Database::add_database(): Can't add a Database to itself" :
116  "WritableDatabase::add_database(): Can't add a WritableDatabase "
117  "to itself";
118  throw InvalidArgumentError(msg);
119  }
120 
121  auto o_size = o.internal->size();
122  if (o_size == 0) {
123  // Adding an empty database is a no-op.
124  return;
125  }
126 
127  auto my_size = internal->size();
128  if (my_size == 0 && o_size == 1) {
129  // Just copy.
130  internal = o.internal;
131  return;
132  }
133 
134 #if 0
135  // The check below doesn't work - for example:
136  //
137  // Database db;
138  // db.add_database(WritableDatabase("one.db"));
139  // db.add_database(WritableDatabase("two.db"));
140  //
141  // The first add_database() assigns the internal across, so at the second
142  // call internal->is_read_only() returns false but read_only is true.
143  //
144  // I'm not entirely convinced the extra complexity required to make this
145  // work is worthwhile. We catch static violations such as this at compile
146  // time:
147  //
148  // WritableDatabase db;
149  // db.add_database(Database("one.db"));
150  //
151  // The case we don't catch at compile time is:
152  //
153  // WritableDatabase db;
154  // Database ro_db = db;
155  // ro_db.add_database(Database("one.db"));
156  //
157  // But performing WritableDatabase actions using such a WritableDatabase
158  // should now throw InvalidOperationError.
159  if (!internal->is_read_only() && read_only) {
160  throw InvalidArgumentError("Database::add_database(): Can't add a "
161  "Database to a WritableDatabase");
162  }
163 #endif
164 
165  // Make sure internal is a MultiDatabase with enough space reserved.
166  auto new_size = my_size + o_size;
167  MultiDatabase* multi_db;
168  if (my_size <= 1) {
169  multi_db = new MultiDatabase(new_size, read_only);
170  if (my_size) multi_db->push_back(internal.get());
171  internal = multi_db;
172  } else {
173  // Must already be a MultiDatabase as everything else reports 1 for
174  // size().
175  multi_db = static_cast<MultiDatabase*>(internal.get());
176  multi_db->reserve(new_size);
177  }
178 
179  if (o_size == 1) {
180  multi_db->push_back(o.internal.get());
181  } else {
182  // Must be a MultiDatabase.
183  auto o_multi = static_cast<MultiDatabase*>(o.internal.get());
184  // Add the shards from o to ourself.
185  for (auto&& shard : o_multi->shards) {
186  multi_db->push_back(shard);
187  }
188  }
189 }
190 
192 Database::postlist_begin(string_view term) const
193 {
194  PostList* pl = internal->open_post_list(term);
195  if (!pl) return PostingIterator();
196  return PostingIterator(new PostingIterator::Internal(pl, *this));
197 }
198 
201 {
202  if (did == 0)
204 
205  return TermIterator(internal->open_term_list(did));
206 }
207 
209 Database::allterms_begin(string_view prefix) const
210 {
211  return TermIterator(internal->open_allterms(prefix));
212 }
213 
214 bool
216 {
217  return internal->has_positions();
218 }
219 
222 {
223  if (did == 0)
225 
226  if (term.empty())
228 
230 }
231 
234 {
235  return internal->get_doccount();
236 }
237 
240 {
241  return internal->get_lastdocid();
242 }
243 
244 double
246 {
247  Xapian::doccount doc_count = internal->get_doccount();
248  if (rare(doc_count == 0))
249  return 0.0;
250 
251  Xapian::totallength total_length = internal->get_total_length();
252  return total_length / double(doc_count);
253 }
254 
257 {
258  return internal->get_total_length();
259 }
260 
262 Database::get_termfreq(string_view term) const
263 {
264  if (term.empty())
265  return get_doccount();
266 
267  Xapian::doccount result;
268  internal->get_freqs(term, &result, NULL);
269  return result;
270 }
271 
274 {
275  if (term.empty())
276  return get_doccount();
277 
278  Xapian::termcount result;
279  internal->get_freqs(term, NULL, &result);
280  return result;
281 }
282 
285 {
286  return internal->get_value_freq(slot);
287 }
288 
289 string
291 {
292  return internal->get_value_lower_bound(slot);
293 }
294 
295 string
297 {
298  return internal->get_value_upper_bound(slot);
299 }
300 
303 {
304  return internal->get_doclength_lower_bound();
305 }
306 
309 {
310  return internal->get_doclength_upper_bound();
311 }
312 
315 {
316  if (term.empty())
317  return 0;
318 
319  return internal->get_wdf_upper_bound(term);
320 }
321 
324 {
325  return internal->get_unique_terms_lower_bound();
326 }
327 
330 {
331  return internal->get_unique_terms_upper_bound();
332 }
333 
336 {
337  return ValueIterator(internal->open_value_list(slot));
338 }
339 
342 {
343  if (did == 0)
345 
346  return internal->get_doclength(did);
347 }
348 
351 {
352  if (did == 0)
354 
355  return internal->get_unique_terms(did);
356 }
357 
360 {
361  if (did == 0)
363 
364  return internal->get_wdfdocmax(did);
365 }
366 
367 Document
368 Database::get_document(Xapian::docid did, unsigned flags) const
369 {
370  if (rare(did == 0))
372 
373  bool assume_valid = flags & Xapian::DOC_ASSUME_VALID;
374  return Document(internal->open_document(did, assume_valid));
375 }
376 
377 bool
378 Database::term_exists(string_view term) const
379 {
380  // NB Internal::term_exists() handles term.empty().
381  return internal->term_exists(term);
382 }
383 
384 void
386 {
387  internal->keep_alive();
388 }
389 
390 string
392 {
393  string desc = "Database(";
394  desc += internal->get_description();
395  desc += ')';
396  return desc;
397 }
398 
399 string
401  unsigned max_edit_distance) const
402 {
403  if (word.size() <= 1 || max_edit_distance == 0)
404  return string();
405 
406  max_edit_distance = min(max_edit_distance, unsigned(word.size() - 1));
407 
408  unique_ptr<TermList> merger(internal->open_spelling_termlist(word));
409  if (!merger)
410  return string();
411 
412  EditDistanceCalculator edcalc(word);
413  string result;
414  int edist_best = max_edit_distance;
415  Xapian::doccount freq_best = 0;
416  Xapian::doccount freq_exact = 0;
417  while (true) {
418  TermList* ret = merger->next();
419  if (rare(ret == merger.get())) {
420  // Out of entries.
421  break;
422  }
423  if (rare(ret)) merger.reset(ret);
424 
425  const string& term = merger->get_termname();
427 
428  // We can get the number of matching n-grams from merger->get_wdf() but
429  // a long candidate can match all the n-grams yet be too many edits
430  // away, while a candidate within edit distance range can match fewer
431  // n-grams. E.g. if looking for corrections for `kuarq` we consider
432  // entries with n-grams `^ku`, `kua`, `uar`, `arq` or `rq$`.
433  //
434  // * `kuazzzuarq` contains all 5 n-grams but is 5 edits away
435  // * `quark` matches a single n-gram but is 2 edits away
436  //
437  // A single edit can potentially eliminate 3 n-grams which possibly
438  // gives us a potential criteria for rejecting based on the n-gram
439  // count, but in practice it seems it rejects so few candidates that
440  // it's actually cheaper to not try it.
441 
442  int edist = edcalc(term, edist_best);
443  LOGVALUE(SPELLING, edist);
444 
445  if (edist <= edist_best) {
446  Xapian::doccount freq = internal->get_spelling_frequency(term);
447 
448  LOGVALUE(SPELLING, freq);
449  LOGVALUE(SPELLING, freq_best);
450  // Even if we have an exact match, there may be a much more
451  // frequent potential correction which will still be interesting.
452  if (rare(edist == 0)) {
453  freq_exact = freq;
454  continue;
455  }
456 
457  if (edist < edist_best || freq > freq_best) {
458  LOGLINE(SPELLING, "Best so far: \"" << term <<
459  "\" edist " << edist << " freq " << freq);
460  result = term;
461  edist_best = edist;
462  freq_best = freq;
463  }
464  }
465  }
466  if (freq_best < freq_exact)
467  return string();
468  return result;
469 }
470 
473 {
475 }
476 
478 Database::synonyms_begin(string_view term) const
479 {
481 }
482 
484 Database::synonym_keys_begin(string_view prefix) const
485 {
486  return TermIterator(internal->open_synonym_keylist(prefix));
487 }
488 
489 string
490 Database::get_metadata(string_view key) const
491 {
492  if (rare(key.empty()))
494 
495  return internal->get_metadata(key);
496 }
497 
499 Database::metadata_keys_begin(string_view prefix) const
500 {
502 }
503 
504 string
506 {
507  return internal->get_uuid();
508 }
509 
510 bool
512 {
513  return internal->locked();
514 }
515 
517 Database::lock(int flags) {
519 }
520 
523  return Xapian::Database(internal->update_lock(Xapian::DB_READONLY_));
524 }
525 
528 {
529  return internal->get_revision();
530 }
531 
532 string
534  size_t length,
535  std::string_view prefix,
536  Xapian::termpos start_pos,
537  Xapian::termpos end_pos) const
538 {
539  return internal->reconstruct_text(did, length, prefix, start_pos, end_pos);
540 }
541 
542 void
544 {
545  internal->commit();
546 }
547 
548 void
550 {
551  internal->begin_transaction(flushed);
552 }
553 
554 void
556 {
557  internal->end_transaction(do_commit);
558 }
559 
562 {
563  return internal->add_document(doc);
564 }
565 
566 void
568 {
569  internal->delete_document(did);
570 }
571 
572 void
574 {
575  if (term.empty())
577 
578  internal->delete_document(term);
579 }
580 
581 void
583 {
584  if (rare(did == 0))
586 
587  internal->replace_document(did, doc);
588 }
589 
592 {
593  if (term.empty())
595 
596  return internal->replace_document(term, doc);
597 }
598 
599 void
601  Xapian::termcount freqinc) const
602 {
603  internal->add_spelling(word, freqinc);
604 }
605 
608  Xapian::termcount freqdec) const
609 {
610  return internal->remove_spelling(word, freqdec);
611 }
612 
613 void
615  string_view synonym) const
616 {
617  internal->add_synonym(term, synonym);
618 }
619 
620 void
622  string_view synonym) const
623 {
624  internal->remove_synonym(term, synonym);
625 }
626 
627 void
629 {
630  internal->clear_synonyms(term);
631 }
632 
633 void
634 WritableDatabase::set_metadata(string_view key, string_view value)
635 {
636  if (rare(key.empty()))
638 
639  internal->set_metadata(key, value);
640 }
641 
642 string
644 {
645  string desc = "WritableDatabase(";
646  desc += internal->get_description();
647  desc += ')';
648  return desc;
649 }
650 
651 }
Calculate edit distances to a target string.
Definition: editdistance.h:43
Empty database internals.
Sharded database backend.
void push_back(Xapian::Database::Internal *shard)
void reserve(size_type new_size)
Virtual base class for Database internals.
virtual size_type size() const
virtual TermList * open_term_list(docid did) const =0
bool is_read_only() const
Test if this shard is read-only.
virtual TermList * open_spelling_wordlist() const
Return a termlist which returns the words which are spelling correction targets.
virtual PositionList * open_position_list(docid did, std::string_view term) const =0
virtual TermList * open_spelling_termlist(std::string_view word) const
Create a termlist tree from trigrams of word.
virtual Document::Internal * open_document(docid did, bool lazy) const =0
Open a handle on a document.
virtual TermList * open_allterms(std::string_view prefix) const =0
virtual TermList * open_synonym_termlist(std::string_view term) const
Open a termlist returning synonyms for a term.
virtual TermList * open_metadata_keylist(std::string_view prefix) const
Open a termlist returning each metadata key.
virtual Internal * update_lock(int flags)
Lock a read-only database for writing or unlock a writable database.
virtual TermList * open_synonym_keylist(std::string_view prefix) const
Open a termlist returning each term which has synonyms.
virtual ValueList * open_value_list(valueno slot) const
Open a value stream.
An indexed database of documents.
Definition: database.h:75
Xapian::TermIterator metadata_keys_begin(std::string_view prefix={}) const
An iterator which returns all user-specified metadata keys.
Definition: database.cc:499
Xapian::rev get_revision() const
Get the revision of the database.
Definition: database.cc:527
ValueIterator valuestream_begin(Xapian::valueno slot) const
Return an iterator over the value in slot slot for each document.
Definition: database.cc:335
Xapian::doccount get_termfreq(std::string_view term) const
Get the number of documents indexed by a specified term.
Definition: database.cc:262
Database()
Construct a Database containing no shards.
Definition: database.cc:83
Xapian::TermIterator synonym_keys_begin(std::string_view prefix={}) const
An iterator which returns all terms which have synonyms.
Definition: database.cc:484
Xapian::termcount get_unique_terms_lower_bound() const
Get a lower bound on the unique terms size of a document in this DB.
Definition: database.cc:323
void close()
Close the database.
Definition: database.cc:99
double get_average_length() const
Get the mean document length in the database.
Definition: database.cc:245
Xapian::totallength get_total_length() const
Get the total length of all the documents in the database.
Definition: database.cc:256
Xapian::TermIterator spellings_begin() const
An iterator which returns all the spelling correction targets.
Definition: database.cc:472
Xapian::WritableDatabase lock(int flags=0)
Lock a read-only database for writing.
Definition: database.cc:517
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
Definition: database.cc:302
PostingIterator postlist_begin(std::string_view term) const
Start iterating the postings of a term.
Definition: database.cc:192
std::string reconstruct_text(Xapian::docid did, size_t length=0, std::string_view prefix={}, Xapian::termpos start_pos=0, Xapian::termpos end_pos=0) const
Reconstruct document text.
Definition: database.cc:533
bool locked() const
Test if this database is currently locked for writing.
Definition: database.cc:511
TermIterator termlist_begin(Xapian::docid did) const
Start iterating the terms in a document.
Definition: database.cc:200
Xapian::termcount get_wdfdocmax(Xapian::docid did) const
Get the maximum wdf value in a specified document.
Definition: database.cc:359
void keep_alive()
Send a keep-alive message.
Definition: database.cc:385
virtual ~Database()
Destructor.
Definition: database.cc:88
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
Get an upper bound on the wdf of term term.
Definition: database.cc:314
PositionIterator positionlist_begin(Xapian::docid did, std::string_view term) const
Start iterating positions for a term in a document.
Definition: database.cc:221
size_t size() const
Return number of shards in this Database object.
Definition: database.cc:105
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
Definition: database.cc:296
virtual std::string get_description() const
Return a string describing this object.
Definition: database.cc:391
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a specified document.
Definition: database.cc:341
void add_database_(const Database &other, bool read_only)
Definition: database.cc:111
bool term_exists(std::string_view term) const
Test if a particular term is present in any document.
Definition: database.cc:378
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
Definition: database.cc:290
Xapian::termcount get_unique_terms_upper_bound() const
Get an upper bound on the unique terms size of a document in this DB.
Definition: database.cc:329
bool has_positions() const
Does this database have any positional information?
Definition: database.cc:215
Xapian::termcount get_collection_freq(std::string_view term) const
Get the total number of occurrences of a specified term.
Definition: database.cc:273
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: database.cc:233
Xapian::docid get_lastdocid() const
Get the highest document id which has been used in the database.
Definition: database.cc:239
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Return the frequency of a given value slot.
Definition: database.cc:284
TermIterator allterms_begin(std::string_view prefix={}) const
Start iterating all terms in the database with a given prefix.
Definition: database.cc:209
Database & operator=(const Database &o)
Assignment operator.
Xapian::TermIterator synonyms_begin(std::string_view term) const
An iterator which returns all the synonyms for a given term.
Definition: database.cc:478
bool reopen()
Reopen the database at the latest available revision.
Definition: database.cc:93
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
Definition: database.cc:308
std::string get_spelling_suggestion(std::string_view word, unsigned max_edit_distance=2) const
Suggest a spelling correction.
Definition: database.cc:400
Xapian::Document get_document(Xapian::docid did, unsigned flags=0) const
Get a document from the database.
Definition: database.cc:368
Xapian::Database unlock()
Release a database write lock.
Definition: database.cc:522
std::string get_uuid() const
Get the UUID for the database.
Definition: database.cc:505
Xapian::Internal::intrusive_ptr_nonnull< Internal > internal
Definition: database.h:95
std::string get_metadata(std::string_view key) const
Get the user-specified metadata associated with a given key.
Definition: database.cc:490
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in a specified document.
Definition: database.cc:350
Class representing a document.
Definition: document.h:64
Abstract base class for postlists.
Definition: postlist.h:40
InvalidArgumentError indicates an invalid parameter value was passed to the API.
Definition: error.h:229
Class for iterating over term positions.
Class for iterating over a list of terms.
Abstract base class for termlists.
Definition: termlist.h:42
virtual Internal * next()=0
Advance the current position to the next term in the termlist.
Class for iterating over a list of terms.
Definition: termiterator.h:41
Class for iterating over document values.
Definition: valueiterator.h:39
This class provides read/write access to a database.
Definition: database.h:964
void delete_document(Xapian::docid did)
Delete a document from the database.
Definition: database.cc:567
void clear_synonyms(std::string_view term) const
Remove all synonyms for a term.
Definition: database.cc:628
void begin_transaction(bool flushed=true)
Begin a transaction.
Definition: database.cc:549
void add_synonym(std::string_view term, std::string_view synonym) const
Add a synonym for a term.
Definition: database.cc:614
void replace_document(Xapian::docid did, const Xapian::Document &document)
Replace a document in the database.
Definition: database.cc:582
void set_metadata(std::string_view key, std::string_view metadata)
Set the user-specified metadata associated with a given key.
Definition: database.cc:634
void end_transaction_(bool do_commit)
Definition: database.cc:555
void add_spelling(std::string_view word, Xapian::termcount freqinc=1) const
Add a word to the spelling dictionary.
Definition: database.cc:600
termcount remove_spelling(std::string_view word, termcount freqdec=1) const
Remove a word from the spelling dictionary.
Definition: database.cc:607
std::string get_description() const
Return a string describing this object.
Definition: database.cc:643
void commit()
Commit pending modifications.
Definition: database.cc:543
Xapian::docid add_document(const Xapian::Document &doc)
Add a document to the database.
Definition: database.cc:561
void remove_synonym(std::string_view term, std::string_view synonym) const
Remove a synonym for a term.
Definition: database.cc:621
#define rare(COND)
Definition: config.h:607
Constants in the Xapian namespace.
static void docid_zero_invalid()
Definition: database.cc:49
static void empty_term_invalid()
Definition: database.cc:61
static void empty_metadata_key()
Definition: database.cc:55
An indexed database of documents.
string term
Virtual base class for Database internals.
Debug logging macros.
#define LOGLINE(a, b)
Definition: debuglog.h:485
#define LOGVALUE(a, b)
Definition: debuglog.h:486
Edit distance calculation algorithm.
Empty database internals.
Hierarchy of classes which Xapian can throw as exceptions.
Sharded database backend.
@ SPELLING
Definition: glass_defs.h:58
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
const int DOC_ASSUME_VALID
Assume document id is valid.
Definition: constants.h:275
XAPIAN_REVISION_TYPE rev
Revision number of a database.
Definition: types.h:108
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Various assertion macros.
Class for iterating over term positions.
Class for iterating over a list of document ids.
Xapian::PostingIterator internals.
Class for iterating over a list of terms.
Unicode and UTF-8 related classes and functions.