xapian-core  2.0.0
inmemory_database.h
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002-2024 Olly Betts
7  * Copyright 2006,2009 Lemur Consulting Ltd
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, see
21  * <https://www.gnu.org/licenses/>.
22  */
23 
24 #ifndef XAPIAN_INCLUDED_INMEMORY_DATABASE_H
25 #define XAPIAN_INCLUDED_INMEMORY_DATABASE_H
26 
27 #include "api/smallvector.h"
28 #include "api/termlist.h"
29 #include "backends/backends.h"
31 #include "backends/leafpostlist.h"
32 #include "backends/valuestats.h"
33 #include <algorithm>
34 #include <map>
35 #include <string_view>
36 #include <vector>
37 #include <xapian/document.h>
38 #include "inmemory_positionlist.h"
39 #include "internaltypes.h"
40 #include "omassert.h"
41 
42 // Class representing a posting (a term/doc pair, together with
43 // all the relevant positional information, forms a single posting)
45  public:
47  bool valid;
48  Xapian::VecCOW<Xapian::termpos> positions; // Sorted vector of positions
50 
51  // Add new position entry preserving sorted order.
53  auto p = std::lower_bound(positions.begin(), positions.end(), pos);
54  Assert(p == positions.end() || *p != pos);
55  positions.insert(p, pos);
56  }
57 };
58 
60  public:
61  std::string tname;
62  Xapian::VecCOW<Xapian::termpos> positions; // Sorted vector of positions
64 
65  // Add new position entry preserving sorted order.
67  auto p = std::lower_bound(positions.begin(), positions.end(), pos);
68  Assert(p == positions.end() || *p != pos);
69  positions.insert(p, pos);
70  }
71 };
72 
73 // Compare by document ID
75  public:
77  const InMemoryPosting &p2) const
78  {
79  return p1.did < p2.did;
80  }
81 };
82 
83 // Compare by termname
85  public:
87  const InMemoryTermEntry&p2) const
88  {
89  return p1.tname < p2.tname;
90  }
91 };
92 
93 // Class representing a term and the documents indexing it
94 class InMemoryTerm {
95  public:
96  // Sorted list of documents indexing this term.
97  std::vector<InMemoryPosting> docs;
98 
101 
103 
104  void add_posting(Xapian::docid did,
105  Xapian::termcount wdf,
106  Xapian::termpos position,
107  bool use_position);
108 };
109 
111 class InMemoryDoc {
112  public:
113  bool is_valid;
114  // Sorted list of terms indexing this document.
115  std::vector<InMemoryTermEntry> terms;
116 
117  /* Initialise invalid by default, so that resizing the termlist array
118  * doesn't create valid documents. */
120 
121  // Initialise specifying validity.
122  explicit InMemoryDoc(bool is_valid_) : is_valid(is_valid_) {}
123 
124  void add_posting(const std::string& tname,
125  Xapian::termcount wdf,
126  Xapian::termpos position,
127  bool use_position);
128 };
129 
130 class InMemoryDatabase;
131 
135  friend class InMemoryDatabase;
136 
137  private:
138  std::vector<InMemoryPosting>::const_iterator pos;
139  std::vector<InMemoryPosting>::const_iterator end;
140  bool started;
141 
146 
148 
150 
152  const InMemoryTerm& imterm, std::string_view term_);
153  public:
154  Xapian::docid get_docid() const; // Gets current docid
155  Xapian::termcount get_wdf() const; // Within Document Frequency
156  // Max wdf of terms in current document
160 
161  PostList *next(double w_min); // Moves to next docid
162 
163  // Moves to next docid >= specified docid
164  PostList *skip_to(Xapian::docid did, double w_min);
165 
166  // True if we're off the end of the list.
167  bool at_end() const;
168 
170 
171  void get_docid_range(Xapian::docid& first, Xapian::docid& last) const;
172 
173  std::string get_description() const;
174 };
175 
179  friend class InMemoryDatabase;
180 
181  private:
183 
185 
187 
188  public:
189  Xapian::docid get_docid() const; // Gets current docid
190  Xapian::termcount get_doclength() const; // Length of current document
191  // number of terms in current document
193  Xapian::termcount get_wdf() const; // Within Document Frequency
196 
197  PostList *next(double w_min); // Moves to next docid
198 
199  // Moves to next docid >= specified docid
200  PostList *skip_to(Xapian::docid did, double w_min);
201 
202  // True if we're off the end of the list
203  bool at_end() const;
204 
206 
207  std::string get_description() const;
208 };
209 
210 // Term List
211 class InMemoryTermList : public TermList {
212  friend class InMemoryDatabase;
213 
214  private:
215  std::vector<InMemoryTermEntry>::const_iterator pos;
216  std::vector<InMemoryTermEntry>::const_iterator end;
218  bool started;
219 
223 
226  const InMemoryDoc & doc,
227  Xapian::termcount len);
228 
229  public:
231 
234 
235  // Number of occurrences of term in current doc
236  Xapian::termcount get_wdf() const;
237  Xapian::doccount get_termfreq() const; // Number of docs indexed by term
238  TermList * next();
239  TermList* skip_to(std::string_view term);
242 };
243 
244 class InMemoryDocument;
245 
252  friend class InMemoryDocument;
253 
254  std::map<std::string, InMemoryTerm, std::less<>> postlists;
255  std::vector<InMemoryDoc> termlists;
256  std::vector<std::string> doclists;
257  std::vector<std::map<Xapian::valueno, std::string>> valuelists;
258  std::map<Xapian::valueno, ValueStats> valuestats;
259 
260  std::vector<Xapian::termcount> doclengths;
261 
262  std::map<std::string, std::string, std::less<>> metadata;
263 
265 
267 
269 
270  // Flag, true if the db has been closed.
271  bool closed;
272 
273  // Stop copy / assignment being allowed
276 
277  void make_term(const std::string& tname);
278 
279  bool doc_exists(Xapian::docid did) const;
280  Xapian::docid make_doc(const std::string& docdata);
281 
282  /* The common parts of add_doc and replace_doc */
283  void finish_add_doc(Xapian::docid did, const Xapian::Document& document);
285  const std::map<Xapian::valueno, std::string>& values_);
286 
287  void make_posting(InMemoryDoc* doc,
288  const std::string& tname,
290  Xapian::termpos position,
291  Xapian::termcount wdf,
292  bool use_position = true);
293 
295 
297  void commit();
298  void cancel();
299 
300  Xapian::docid add_document(const Xapian::Document & document);
301  // Stop the default implementation of delete_document(term) and
302  // replace_document(term) from being hidden. This isn't really
303  // a problem as we only try to call them through the base class
304  // (where they aren't hidden) but some compilers generate a warning
305  // about the hiding.
309  void replace_document(Xapian::docid did, const Xapian::Document & document);
311 
312  public:
318 
320 
321  bool reopen();
322  void close();
323  bool is_closed() const { return closed; }
324 
326 
328 
333 
334  void get_freqs(std::string_view term,
335  Xapian::doccount* termfreq_ptr,
336  Xapian::termcount* collfreq_ptr) const;
338  std::string get_value_lower_bound(Xapian::valueno slot) const;
339  std::string get_value_upper_bound(Xapian::valueno slot) const;
342  Xapian::termcount get_wdf_upper_bound(std::string_view term) const;
343  bool term_exists(std::string_view term) const;
344  bool has_positions() const;
345 
346  PostList* open_post_list(std::string_view tname) const;
347  LeafPostList* open_leaf_post_list(std::string_view term,
348  bool need_read_pos) const;
352  bool lazy) const;
353 
354  std::string get_metadata(std::string_view key) const;
355  TermList* open_metadata_keylist(std::string_view prefix) const;
356  void set_metadata(std::string_view key, std::string_view value);
357 
359  std::string_view tname) const;
361  std::string_view tname) const;
362  TermList* open_allterms(std::string_view prefix) const;
363 
364  [[noreturn]]
365  static void throw_database_closed();
366 
367  int get_backend_info(std::string* path) const {
368  if (path) *path = std::string();
369  return BACKEND_INMEMORY;
370  }
371 
372  void get_used_docid_range(Xapian::docid& first, Xapian::docid& last) const;
373 
374  bool locked() const { return !closed; }
375 
377 
378  std::string get_description() const;
379 };
380 
381 #ifdef DISABLE_GPL_LIBXAPIAN
382 # error GPL source we cannot relicense included in libxapian
383 #endif
384 
385 #endif /* XAPIAN_INCLUDED_INMEMORY_DATABASE_H */
BACKEND_* constants.
@ BACKEND_INMEMORY
Definition: backends.h:28
A PostList over all docs in an inmemory database.
PositionList * open_position_list() const
Read the position list for the term in the current document and return a pointer to it (not owned by ...
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Xapian::docid get_docid() const
Return the current docid.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
Xapian::termcount get_unique_terms() const
bool at_end() const
Return true if the current position is past the last entry in this list.
InMemoryAllDocsPostList(const InMemoryDatabase *db)
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
std::string get_description() const
Return a string description of this object.
Xapian::termcount get_wdf_upper_bound() const
Xapian::termcount get_doclength() const
PositionList * read_position_list()
Read the position list for the term in the current document and return a pointer to it (owned by the ...
A database held entirely in memory.
Xapian::termcount get_wdfdocmax(Xapian::docid did) const
Get the max wdf in document.
std::string get_value_upper_bound(Xapian::valueno slot) const
Get an upper bound on the values stored in the given value slot.
static void throw_database_closed()
bool locked() const
Return true if the database is open for writing.
TermList * open_term_list_direct(Xapian::docid did) const
Like open_term_list() but without MultiTermList wrapper.
Xapian::termcount get_doclength(Xapian::docid did) const
TermList * open_term_list(Xapian::docid did) const
Xapian::termcount get_wdf_upper_bound(std::string_view term) const
Get an upper bound on the wdf of term term.
std::vector< InMemoryDoc > termlists
Xapian::termcount get_doclength_lower_bound() const
Get a lower bound on the length of a document in this DB.
TermList * open_metadata_keylist(std::string_view prefix) const
Open a termlist returning each metadata key.
void commit()
Implementation of virtual methods: see Database for details.
std::vector< Xapian::termcount > doclengths
void cancel()
Cancel pending modifications to the database.
std::string get_value_lower_bound(Xapian::valueno slot) const
Get a lower bound on the values stored in the given value slot.
InMemoryDatabase(const InMemoryDatabase &)
void replace_document(Xapian::docid did, const Xapian::Document &document)
Xapian::doccount get_value_freq(Xapian::valueno slot) const
Return the frequency of a given value slot.
void get_used_docid_range(Xapian::docid &first, Xapian::docid &last) const
Find lowest and highest docids actually in use.
std::string get_description() const
Return a string describing this object.
void add_values(Xapian::docid did, const std::map< Xapian::valueno, std::string > &values_)
std::map< Xapian::valueno, ValueStats > valuestats
PostList * open_post_list(std::string_view tname) const
Return a PostList suitable for use in a PostingIterator.
Xapian::docid add_document(const Xapian::Document &document)
void make_term(const std::string &tname)
Xapian::docid get_lastdocid() const
Return the last used document id of this (sub) database.
void delete_document(Xapian::docid did)
LeafPostList * open_leaf_post_list(std::string_view term, bool need_read_pos) const
Create a LeafPostList for use during a match.
PositionList * open_position_list(Xapian::docid did, std::string_view tname) const
Xapian::docid make_doc(const std::string &docdata)
Xapian::termcount positionlist_count(Xapian::docid did, std::string_view tname) const
bool reopen()
Reopen the database to the latest available revision.
Xapian::Database::Internal * update_lock(int flags)
Lock a read-only database for writing or unlock a writable database.
void make_posting(InMemoryDoc *doc, const std::string &tname, Xapian::docid did, Xapian::termpos position, Xapian::termcount wdf, bool use_position=true)
InMemoryDatabase & operator=(const InMemoryDatabase &)
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in document.
bool doc_exists(Xapian::docid did) const
std::vector< std::string > doclists
Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy) const
Open a handle on a document.
Xapian::doccount get_doccount() const
bool term_exists(std::string_view term) const
void close()
Close the database.
bool is_closed() const
TermList * open_allterms(std::string_view prefix) const
std::map< std::string, std::string, std::less<> > metadata
Xapian::totallength totlen
std::map< std::string, InMemoryTerm, std::less<> > postlists
Xapian::doccount totdocs
std::vector< std::map< Xapian::valueno, std::string > > valuelists
void finish_add_doc(Xapian::docid did, const Xapian::Document &document)
void set_metadata(std::string_view key, std::string_view value)
Set the metadata associated with a given key.
Xapian::termcount get_doclength_upper_bound() const
Get an upper bound on the length of a document in this DB.
void get_freqs(std::string_view term, Xapian::doccount *termfreq_ptr, Xapian::termcount *collfreq_ptr) const
Returns frequencies for a term.
int get_backend_info(std::string *path) const
Get backend information about this database.
std::string get_metadata(std::string_view key) const
Get the metadata associated with a given key.
Xapian::totallength get_total_length() const
Return the total length of all documents in this database.
bool has_positions() const
Check whether this database contains any positional information.
InMemoryDatabase()
Create and open an in-memory database.
Class representing a document and the terms indexing it.
InMemoryDoc(bool is_valid_)
void add_posting(const std::string &tname, Xapian::termcount wdf, Xapian::termpos position, bool use_position)
std::vector< InMemoryTermEntry > terms
A document read from an InMemoryDatabase.
PositionList from an InMemory DB or a Document object.
A PostList in an inmemory database.
std::string get_description() const
Return a string description of this object.
Xapian::termcount wdf_upper_bound
Xapian::docid get_docid() const
Return the current docid.
InMemoryPositionList mypositions
List of positions of the current term.
InMemoryPostList(const InMemoryDatabase *db, const InMemoryTerm &imterm, std::string_view term_)
void get_docid_range(Xapian::docid &first, Xapian::docid &last) const
Get the bounds on the range of docids this PostList can return.
PostList * skip_to(Xapian::docid did, double w_min)
Skip forward to the specified docid.
Xapian::termcount get_wdf() const
Return the wdf for the document at the current position.
Xapian::termcount get_wdfdocmax() const
std::vector< InMemoryPosting >::const_iterator pos
bool at_end() const
Return true if the current position is past the last entry in this list.
PositionList * open_position_list() const
Read the position list for the term in the current document and return a pointer to it (not owned by ...
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
PositionList * read_position_list()
Read the position list for the term in the current document and return a pointer to it (owned by the ...
Xapian::termcount get_wdf_upper_bound() const
std::vector< InMemoryPosting >::const_iterator end
int operator()(const InMemoryPosting &p1, const InMemoryPosting &p2) const
void add_position(Xapian::termpos pos)
Xapian::VecCOW< Xapian::termpos > positions
Xapian::docid did
Xapian::termcount wdf
int operator()(const InMemoryTermEntry &p1, const InMemoryTermEntry &p2) const
void add_position(Xapian::termpos pos)
Xapian::VecCOW< Xapian::termpos > positions
Xapian::termcount wdf
InMemoryTermList(Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db, Xapian::docid did, const InMemoryDoc &doc, Xapian::termcount len)
Xapian::termcount get_approx_size() const
Return approximate size of this termlist.
Xapian::termcount get_wdf() const
Return the wdf for the term at the current position.
void accumulate_stats(Xapian::Internal::ExpandStats &stats) const
Collate weighting information for the current term.
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
TermList * next()
Advance the current position to the next term in the termlist.
std::vector< InMemoryTermEntry >::const_iterator pos
Xapian::termcount terms
TermList * skip_to(std::string_view term)
Skip forward to the specified term.
Xapian::termcount document_length
std::vector< InMemoryTermEntry >::const_iterator end
Xapian::termcount positionlist_count() const
Return the length of the position list for the current position.
PositionList * positionlist_begin() const
Return PositionList for the current position.
Xapian::termcount collection_freq
void add_posting(Xapian::docid did, Xapian::termcount wdf, Xapian::termpos position, bool use_position)
Xapian::termcount term_freq
std::vector< InMemoryPosting > docs
Abstract base class for leaf postlists.
Definition: leafpostlist.h:40
Virtual base class for Database internals.
virtual void replace_document(docid did, const Document &document)
virtual void delete_document(docid did)
Abstract base class for a document.
Xapian::docid did
The document ID this document came from in database.
Class representing a document.
Definition: document.h:64
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
Abstract base class for postlists.
Definition: postlist.h:40
PostList * next()
Advance the current position to the next document in the postlist.
Definition: postlist.h:168
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:32
Abstract base class for termlists.
Definition: termlist.h:42
Suitable for "simple" type T.
Definition: smallvector.h:62
string term
PositionList * p
Xapian::termpos pos
Virtual base class for Database internals.
Class representing a document.
#define false
Definition: header.h:9
PositionList from an InMemory DB or a Document object.
Types used internally.
Abstract base class for leaf postlists.
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:64
unsigned valueno
The number for a value slot in a document.
Definition: types.h:90
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:37
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:51
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:75
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:114
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Custom vector implementations using small vector optimisation.
Abstract base class for termlists.
Statistics about values.