xapian-core  1.4.26
inmemory_database.h
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015 Olly Betts
7  * Copyright 2006,2009 Lemur Consulting Ltd
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22  * USA
23  */
24 
25 #ifndef OM_HGUARD_INMEMORY_DATABASE_H
26 #define OM_HGUARD_INMEMORY_DATABASE_H
27 
28 #include "api/leafpostlist.h"
29 #include "api/termlist.h"
30 #include "backends/backends.h"
31 #include "backends/database.h"
32 #include "backends/valuestats.h"
33 #include <map>
34 #include <vector>
35 #include <algorithm>
36 #include <xapian/document.h>
37 #include "inmemory_positionlist.h"
38 #include "internaltypes.h"
39 #include "omassert.h"
40 #include "noreturn.h"
41 
42 using namespace std;
43 
44 // Class representing a posting (a term/doc pair, and
45 // all the relevant positional information, is a single posting)
47  public:
49  bool valid;
50  vector<Xapian::termpos> positions; // Sorted vector of positions
52 
53  // Add new position entry preserving sorted order.
55  auto p = std::lower_bound(positions.begin(), positions.end(), pos);
56  Assert(p == positions.end() || *p != pos);
57  positions.insert(p, pos);
58  }
59 };
60 
62  public:
63  string tname;
64  vector<Xapian::termpos> positions; // Sorted vector of positions
66 
67  // Add new position entry preserving sorted order.
69  auto p = std::lower_bound(positions.begin(), positions.end(), pos);
70  Assert(p == positions.end() || *p != pos);
71  positions.insert(p, pos);
72  }
73 };
74 
75 // Compare by document ID
77  public:
78  int operator() (const InMemoryPosting &p1,
79  const InMemoryPosting &p2) const
80  {
81  return p1.did < p2.did;
82  }
83 };
84 
85 // Compare by termname
87  public:
88  int operator() (const InMemoryTermEntry&p1,
89  const InMemoryTermEntry&p2) const
90  {
91  return p1.tname < p2.tname;
92  }
93 };
94 
95 // Class representing a term and the documents indexing it
96 class InMemoryTerm {
97  public:
98  // Sorted list of documents indexing this term.
99  vector<InMemoryPosting> docs;
100 
103 
104  InMemoryTerm() : term_freq(0), collection_freq(0) {}
105 
106  void add_posting(Xapian::docid did,
107  Xapian::termcount wdf,
108  Xapian::termpos position,
109  bool use_position);
110 };
111 
113 class InMemoryDoc {
114  public:
115  bool is_valid;
116  // Sorted list of terms indexing this document.
117  vector<InMemoryTermEntry> terms;
118 
119  /* Initialise invalid by default, so that resizing the termlist array
120  * doesn't create valid documents. */
121  InMemoryDoc() : is_valid(false) {}
122 
123  // Initialise specifying validity.
124  explicit InMemoryDoc(bool is_valid_) : is_valid(is_valid_) {}
125 
126  void add_posting(const std::string& tname,
127  Xapian::termcount wdf,
128  Xapian::termpos position,
129  bool use_position);
130 };
131 
132 class InMemoryDatabase;
133 
137  friend class InMemoryDatabase;
138 
139  private:
140  vector<InMemoryPosting>::const_iterator pos;
141  vector<InMemoryPosting>::const_iterator end;
143  bool started;
144 
149 
151 
153 
155  const InMemoryTerm & imterm, const std::string & term_);
156  public:
157  Xapian::doccount get_termfreq() const;
158 
159  Xapian::docid get_docid() const; // Gets current docid
160  Xapian::termcount get_doclength() const; // Length of current document
161  Xapian::termcount get_unique_terms() const; // number of terms in current document
162  Xapian::termcount get_wdf() const; // Within Document Frequency
163  PositionList * read_position_list();
165 
166  PostList *next(double w_min); // Moves to next docid
167 
168  // Moves to next docid >= specified docid
169  PostList *skip_to(Xapian::docid did, double w_min);
170 
171  // True if we're off the end of the list.
172  bool at_end() const;
173 
175 
176  string get_description() const;
177 };
178 
182  friend class InMemoryDatabase;
183 
184  private:
186 
188 
190 
191  public:
192  Xapian::doccount get_termfreq() const;
193 
194  Xapian::docid get_docid() const; // Gets current docid
195  Xapian::termcount get_doclength() const; // Length of current document
196  // number of terms in current document
198  Xapian::termcount get_wdf() const; // Within Document Frequency
199  PositionList * read_position_list();
201 
202  PostList *next(double w_min); // Moves to next docid
203 
204  // Moves to next docid >= specified docid
205  PostList *skip_to(Xapian::docid did, double w_min);
206 
207  // True if we're off the end of the list
208  bool at_end() const;
209 
211 
212  string get_description() const;
213 };
214 
215 // Term List
216 class InMemoryTermList : public TermList {
217  friend class InMemoryDatabase;
218 
219  private:
220  vector<InMemoryTermEntry>::const_iterator pos;
221  vector<InMemoryTermEntry>::const_iterator end;
223  bool started;
224 
228 
230  Xapian::docid did,
231  const InMemoryDoc & doc,
232  Xapian::termcount len);
233 
234  public:
235  Xapian::termcount get_approx_size() const;
236 
238  void accumulate_stats(Xapian::Internal::ExpandStats & stats) const;
239 
240  string get_termname() const;
241  // Number of occurrences of term in current doc
242  Xapian::termcount get_wdf() const;
243  Xapian::doccount get_termfreq() const; // Number of docs indexed by term
244  TermList * next();
245  TermList * skip_to(const std::string & term);
246  bool at_end() const;
248  Xapian::PositionIterator positionlist_begin() const;
249 };
250 
251 class InMemoryDocument;
252 
259  friend class InMemoryDocument;
260 
261  map<string, InMemoryTerm> postlists;
262  vector<InMemoryDoc> termlists;
263  vector<std::string> doclists;
264  vector<std::map<Xapian::valueno, string>> valuelists;
265  std::map<Xapian::valueno, ValueStats> valuestats;
266 
267  vector<Xapian::termcount> doclengths;
268 
269  std::map<string, string> metadata;
270 
272 
274 
276 
277  // Flag, true if the db has been closed.
278  bool closed;
279 
280  // Stop copy / assignment being allowed
283 
284  void make_term(const string & tname);
285 
286  bool doc_exists(Xapian::docid did) const;
287  Xapian::docid make_doc(const string & docdata);
288 
289  /* The common parts of add_doc and replace_doc */
290  void finish_add_doc(Xapian::docid did, const Xapian::Document& document);
291  void add_values(Xapian::docid did,
292  const map<Xapian::valueno, string>& values_);
293 
294  void make_posting(InMemoryDoc * doc,
295  const string & tname,
297  Xapian::termpos position,
298  Xapian::termcount wdf,
299  bool use_position = true);
300 
302 
304  void commit();
305  void cancel();
306 
307  Xapian::docid add_document(const Xapian::Document & document);
308  // Stop the default implementation of delete_document(term) and
309  // replace_document(term) from being hidden. This isn't really
310  // a problem as we only try to call them through the base class
311  // (where they aren't hidden) but some compilers generate a warning
312  // about the hiding.
315  void delete_document(Xapian::docid did);
316  void replace_document(Xapian::docid did, const Xapian::Document & document);
318 
319  public:
325 
326  ~InMemoryDatabase();
327 
328  bool reopen();
329  void close();
330  bool is_closed() const { return closed; }
331 
332  Xapian::doccount get_doccount() const;
333 
334  Xapian::docid get_lastdocid() const;
335 
336  Xapian::totallength get_total_length() const;
337  Xapian::termcount get_doclength(Xapian::docid did) const;
338  Xapian::termcount get_unique_terms(Xapian::docid did) const;
339 
340  void get_freqs(const string & term,
341  Xapian::doccount * termfreq_ptr,
342  Xapian::termcount * collfreq_ptr) const;
343  Xapian::doccount get_value_freq(Xapian::valueno slot) const;
344  std::string get_value_lower_bound(Xapian::valueno slot) const;
345  std::string get_value_upper_bound(Xapian::valueno slot) const;
346  bool term_exists(const string & tname) const;
347  bool has_positions() const;
348 
349  LeafPostList * open_post_list(const string & tname) const;
351  Xapian::Document::Internal* open_document(Xapian::docid did,
352  bool lazy) const;
353 
354  std::string get_metadata(const std::string & key) const;
355  TermList * open_metadata_keylist(const std::string &prefix) const;
356  void set_metadata(const std::string & key, const std::string & value);
357 
358  Xapian::termcount positionlist_count(Xapian::docid did,
359  const string & tname) const;
360  PositionList * open_position_list(Xapian::docid did,
361  const string & tname) const;
362  TermList * open_allterms(const string & prefix) const;
363 
364  XAPIAN_NORETURN(static void throw_database_closed());
365 
366  int get_backend_info(string * path) const {
367  if (path) *path = string();
368  return BACKEND_INMEMORY;
369  }
370 
371  bool locked() const { return !closed; }
372 };
373 
374 #endif /* OM_HGUARD_INMEMORY_DATABASE_H */
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
int close(FD &fd)
Definition: fd.h:63
PositionList from an InMemory DB or a Document object.
vector< InMemoryPosting >::const_iterator end
#define Assert(COND)
Definition: omassert.h:122
vector< InMemoryTermEntry >::const_iterator end
Define the XAPIAN_NORETURN macro.
virtual void replace_document(Xapian::docid did, const Xapian::Document &document)
Replace a given document in the database.
Definition: database.cc:186
Abstract base class for postlists.
Definition: postlist.h:37
Statistics about values.
vector< Xapian::termpos > positions
Xapian::docid did
The document ID of the document in that database.
Definition: document.h:90
A database held entirely in memory.
Xapian::termcount wdf_upper_bound
Base class for databases.
Definition: database.h:57
int get_backend_info(string *path) const
Get backend information about this database.
friend class InMemoryAllDocsPostList
bool is_closed() const
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
TermList * open_term_list() const
Open a term list.
Definition: omdocument.cc:429
Xapian::totallength totlen
virtual Xapian::termcount get_wdf_upper_bound(const std::string &term) const
Get an upper bound on the wdf of term term.
Definition: database.cc:75
vector< Xapian::termcount > doclengths
A document in the database, possibly plus modifications.
Definition: document.h:43
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Xapian::termcount terms
vector< InMemoryTermEntry >::const_iterator pos
Abstract base class for leaf postlists.
Definition: leafpostlist.h:39
static void throw_database_closed()
#define false
Definition: header.h:9
Abstract base class for leaf postlists.
vector< Xapian::termpos > positions
Xapian::termcount document_length
Xapian::termcount wdf
virtual void delete_document(Xapian::docid did)
Delete a document in the database.
Definition: database.cc:169
PositionList * open_position_list(Xapian::docid did, const string &tname) const
Open a position list for the given term in the given document.
vector< InMemoryPosting > docs
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
Class representing a document and the terms indexing it.
InMemoryPositionList mypositions
List of positions of the current term.
BACKEND_* constants.
friend class InMemoryDatabase
InMemoryDatabase::open_document() needs to call our private constructor.
vector< InMemoryTermEntry > terms
Xapian::termcount collection_freq
InMemoryDoc(bool is_valid_)
std::map< string, string > metadata
Xapian::doccount totdocs
A PostList in an inmemory database.
void add_position(Xapian::termpos pos)
Xapian::termcount wdf
A position list in an inmemory database.
std::map< Xapian::valueno, ValueStats > valuestats
Class for iterating over term positions.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
vector< std::map< Xapian::valueno, string > > valuelists
Xapian::doccount termfreq
A document read from an InMemoryDatabase.
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in a document.
void operator=(const InMemoryDocument &)
Don't allow assignment.
vector< std::string > doclists
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount positionlist_count(Xapian::docid did, const string &tname) const
map< string, InMemoryTerm > postlists
bool locked() const
Return true if the database is open for writing.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a given document.
Abstract base class for termlists.
vector< InMemoryPosting >::const_iterator pos
void add_position(Xapian::termpos pos)
Xapian::docid did
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
API for working with documents.
A handle representing a document in a Xapian database.
Definition: document.h:61
Types used internally.
A PostList over all docs in an inmemory database.
Xapian::termcount term_freq
vector< InMemoryDoc > termlists