xapian-core  1.4.19
inmemory_database.h
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015 Olly Betts
7  * Copyright 2006,2009 Lemur Consulting Ltd
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22  * USA
23  */
24 
25 #ifndef OM_HGUARD_INMEMORY_DATABASE_H
26 #define OM_HGUARD_INMEMORY_DATABASE_H
27 
28 #include "api/leafpostlist.h"
29 #include "api/termlist.h"
30 #include "backends/backends.h"
31 #include "backends/database.h"
32 #include "backends/valuestats.h"
33 #include <map>
34 #include <vector>
35 #include <algorithm>
36 #include <xapian/document.h>
37 #include "inmemory_positionlist.h"
38 #include "internaltypes.h"
39 #include "omassert.h"
40 #include "noreturn.h"
41 
42 using namespace std;
43 
44 // Class representing a posting (a term/doc pair, and
45 // all the relevant positional information, is a single posting)
47  public:
49  bool valid;
50  vector<Xapian::termpos> positions; // Sorted vector of positions
52 
53  // Merge two postings (same term/doc pair, new positional info)
54  void merge(const InMemoryPosting & post) { // Fold post's positions into this posting's positions
55  Assert(did == post.did); // Only postings with the same docid may be merged
56 
57  positions.insert(positions.end(),
58  post.positions.begin(),
59  post.positions.end());
60  // FIXME - inefficient: the whole vector is re-sorted after appending - use merge (and list<>)?
61  sort(positions.begin(), positions.end()); // Restores sorted order; duplicate positions are kept
62  }
63 };
64 
66  public:
67  string tname;
68  vector<Xapian::termpos> positions; // Sorted vector of positions
70 
71  // Merge two term entries (same term/doc pair, new positional info)
72  void merge(const InMemoryTermEntry & post) { // Fold post's positions into this entry's positions
73  Assert(tname == post.tname); // Only entries with the same term name may be merged
74 
75  positions.insert(positions.end(),
76  post.positions.begin(),
77  post.positions.end());
78  // FIXME - inefficient: the whole vector is re-sorted after appending - use merge (and list<>)?
79  sort(positions.begin(), positions.end()); // Restores sorted order; duplicate positions are kept
80  }
81 };
82 
83 // Compare by document ID
85  public:
86  int operator() (const InMemoryPosting &p1, // Ordering predicate over postings
87  const InMemoryPosting &p2) const
88  {
89  return p1.did < p2.did; // Non-zero iff p1's docid is smaller than p2's
90  }
91 };
92 
93 // Compare by termname
95  public:
96  int operator() (const InMemoryTermEntry&p1, // Ordering predicate over term entries
97  const InMemoryTermEntry&p2) const
98  {
99  return p1.tname < p2.tname; // Non-zero iff p1's term name compares less than p2's
100  }
101 };
102 
103 // Class representing a term and the documents indexing it
105  public:
106  // Sorted list of documents indexing this term.
107  vector<InMemoryPosting> docs;
108 
111 
112  InMemoryTerm() : term_freq(0), collection_freq(0) {}
113 
114  void add_posting(const InMemoryPosting & post);
115 };
116 
118 class InMemoryDoc { // Class representing a document and the terms indexing it
119  public:
120  bool is_valid; // False for placeholder entries (see the default constructor below)
121  // Sorted list of terms indexing this document.
122  vector<InMemoryTermEntry> terms;
123 
124  /* Initialise invalid by default, so that resizing the termlist array
125  * doesn't create valid documents. */
126  InMemoryDoc() : is_valid(false) {}
127 
128  // Initialise specifying validity.
129  explicit InMemoryDoc(bool is_valid_) : is_valid(is_valid_) {} // explicit: no implicit bool -> InMemoryDoc conversion
130 
131  void add_posting(const InMemoryTermEntry& post); // Declared only; the definition is not in this header
132 };
133 
134 class InMemoryDatabase;
135 
139  friend class InMemoryDatabase;
140 
141  private:
142  vector<InMemoryPosting>::const_iterator pos;
143  vector<InMemoryPosting>::const_iterator end;
145  bool started;
146 
151 
153 
155 
157  const InMemoryTerm & imterm, const std::string & term_);
158  public:
159  Xapian::doccount get_termfreq() const;
160 
161  Xapian::docid get_docid() const; // Gets current docid
162  Xapian::termcount get_doclength() const; // Length of current document
163  Xapian::termcount get_unique_terms() const; // number of terms in current document
164  Xapian::termcount get_wdf() const; // Within Document Frequency
165  PositionList * read_position_list();
167 
168  PostList *next(double w_min); // Moves to next docid
169 
170  // Moves to next docid >= specified docid
171  PostList *skip_to(Xapian::docid did, double w_min);
172 
173  // True if we're off the end of the list.
174  bool at_end() const;
175 
177 
178  string get_description() const;
179 };
180 
184  friend class InMemoryDatabase;
185 
186  private:
188 
190 
192 
193  public:
194  Xapian::doccount get_termfreq() const;
195 
196  Xapian::docid get_docid() const; // Gets current docid
197  Xapian::termcount get_doclength() const; // Length of current document
198  // number of terms in current document
200  Xapian::termcount get_wdf() const; // Within Document Frequency
201  PositionList * read_position_list();
203 
204  PostList *next(double w_min); // Moves to next docid
205 
206  // Moves to next docid >= specified docid
207  PostList *skip_to(Xapian::docid did, double w_min);
208 
209  // True if we're off the end of the list
210  bool at_end() const;
211 
213 
214  string get_description() const;
215 };
216 
217 // Term List
218 class InMemoryTermList : public TermList {
219  friend class InMemoryDatabase;
220 
221  private:
222  vector<InMemoryTermEntry>::const_iterator pos;
223  vector<InMemoryTermEntry>::const_iterator end;
225  bool started;
226 
230 
232  Xapian::docid did,
233  const InMemoryDoc & doc,
234  Xapian::termcount len);
235 
236  public:
237  Xapian::termcount get_approx_size() const;
238 
240  void accumulate_stats(Xapian::Internal::ExpandStats & stats) const;
241 
242  string get_termname() const;
243  // Number of occurrences of term in current doc
244  Xapian::termcount get_wdf() const;
245  Xapian::doccount get_termfreq() const; // Number of docs indexed by term
246  TermList * next();
247  TermList * skip_to(const std::string & term);
248  bool at_end() const;
250  Xapian::PositionIterator positionlist_begin() const;
251 };
252 
253 class InMemoryDocument;
254 
261  friend class InMemoryDocument;
262 
263  map<string, InMemoryTerm> postlists;
264  vector<InMemoryDoc> termlists;
265  vector<std::string> doclists;
266  vector<std::map<Xapian::valueno, string>> valuelists;
267  std::map<Xapian::valueno, ValueStats> valuestats;
268 
269  vector<Xapian::termcount> doclengths;
270 
271  std::map<string, string> metadata;
272 
274 
276 
278 
279  // Flag, true if the db has been closed.
280  bool closed;
281 
282  // Stop copy / assignment being allowed
285 
286  void make_term(const string & tname);
287 
288  bool doc_exists(Xapian::docid did) const;
289  Xapian::docid make_doc(const string & docdata);
290 
291  /* The common parts of add_doc and replace_doc */
292  void finish_add_doc(Xapian::docid did, const Xapian::Document& document);
293  void add_values(Xapian::docid did,
294  const map<Xapian::valueno, string>& values_);
295 
296  void make_posting(InMemoryDoc * doc,
297  const string & tname,
299  Xapian::termpos position,
300  Xapian::termcount wdf,
301  bool use_position = true);
302 
304 
306  void commit();
307  void cancel();
308 
309  Xapian::docid add_document(const Xapian::Document & document);
310  // Stop the default implementation of delete_document(term) and
311  // replace_document(term) from being hidden. This isn't really
312  // a problem as we only try to call them through the base class
313  // (where they aren't hidden) but some compilers generate a warning
314  // about the hiding.
317  void delete_document(Xapian::docid did);
318  void replace_document(Xapian::docid did, const Xapian::Document & document);
320 
321  public:
327 
328  ~InMemoryDatabase();
329 
330  bool reopen();
331  void close();
332  bool is_closed() const { return closed; } // Reports the flag that is true once the db has been closed
333 
334  Xapian::doccount get_doccount() const;
335 
336  Xapian::docid get_lastdocid() const;
337 
338  Xapian::totallength get_total_length() const;
339  Xapian::termcount get_doclength(Xapian::docid did) const;
340  Xapian::termcount get_unique_terms(Xapian::docid did) const;
341 
342  void get_freqs(const string & term,
343  Xapian::doccount * termfreq_ptr,
344  Xapian::termcount * collfreq_ptr) const;
345  Xapian::doccount get_value_freq(Xapian::valueno slot) const;
346  std::string get_value_lower_bound(Xapian::valueno slot) const;
347  std::string get_value_upper_bound(Xapian::valueno slot) const;
348  bool term_exists(const string & tname) const;
349  bool has_positions() const;
350 
351  LeafPostList * open_post_list(const string & tname) const;
353  Xapian::Document::Internal* open_document(Xapian::docid did,
354  bool lazy) const;
355 
356  std::string get_metadata(const std::string & key) const;
357  TermList * open_metadata_keylist(const std::string &prefix) const;
358  void set_metadata(const std::string & key, const std::string & value);
359 
360  Xapian::termcount positionlist_count(Xapian::docid did,
361  const string & tname) const;
362  PositionList * open_position_list(Xapian::docid did,
363  const string & tname) const;
364  TermList * open_allterms(const string & prefix) const;
365 
366  XAPIAN_NORETURN(static void throw_database_closed());
367 
368  int get_backend_info(string * path) const { // Get backend information about this database
369  if (path) *path = string(); // path is optional; when supplied it is set to the empty string
370  return BACKEND_INMEMORY; // Backend identifier constant (a BACKEND_* value)
371  }
372 
373  bool locked() const { return !closed; } // True for as long as the database has not been closed
374 };
375 
376 #endif /* OM_HGUARD_INMEMORY_DATABASE_H */
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
int close(FD &fd)
Definition: fd.h:63
PositionList from an InMemory DB or a Document object.
vector< InMemoryPosting >::const_iterator end
#define Assert(COND)
Definition: omassert.h:122
vector< InMemoryTermEntry >::const_iterator end
Define the XAPIAN_NORETURN macro.
virtual void replace_document(Xapian::docid did, const Xapian::Document &document)
Replace a given document in the database.
Definition: database.cc:186
Abstract base class for postlists.
Definition: postlist.h:37
Statistics about values.
vector< Xapian::termpos > positions
Xapian::docid did
The document ID of the document in that database.
Definition: document.h:88
A database held entirely in memory.
void merge(const InMemoryPosting &post)
Xapian::termcount wdf_upper_bound
Base class for databases.
Definition: database.h:56
int get_backend_info(string *path) const
Get backend information about this database.
friend class InMemoryAllDocsPostList
bool is_closed() const
XAPIAN_TOTALLENGTH_TYPE totallength
The total length of all documents in a database.
Definition: types.h:139
TermList * open_term_list() const
Open a term list.
Definition: omdocument.cc:429
Xapian::totallength totlen
virtual Xapian::termcount get_wdf_upper_bound(const std::string &term) const
Get an upper bound on the wdf of term term.
Definition: database.cc:75
vector< Xapian::termcount > doclengths
A document in the database, possibly plus modifications.
Definition: document.h:41
Abstract base class for termlists.
Definition: termlist.h:39
STL namespace.
Xapian::termcount terms
vector< InMemoryTermEntry >::const_iterator pos
Abstract base class for leaf postlists.
Definition: leafpostlist.h:38
static void throw_database_closed()
#define false
Definition: header.h:9
Abstract base class for leaf postlists.
vector< Xapian::termpos > positions
Xapian::termcount document_length
Xapian::termcount wdf
virtual void delete_document(Xapian::docid did)
Delete a document in the database.
Definition: database.cc:169
PositionList * open_position_list(Xapian::docid did, const string &tname) const
Open a position list for the given term in the given document.
vector< InMemoryPosting > docs
unsigned XAPIAN_TERMCOUNT_BASE_TYPE termcount
A count of terms.
Definition: types.h:72
void merge(const InMemoryTermEntry &post)
Class representing a document and the terms indexing it.
InMemoryPositionList mypositions
List of positions of the current term.
BACKEND_* constants.
friend class InMemoryDatabase
InMemoryDatabase::open_document() needs to call our private constructor.
vector< InMemoryTermEntry > terms
Xapian::termcount collection_freq
InMemoryDoc(bool is_valid_)
std::map< string, string > metadata
Xapian::doccount totdocs
A PostList in an inmemory database.
Xapian::termcount wdf
A position list in an inmemory database.
std::map< Xapian::valueno, ValueStats > valuestats
Class for iterating over term positions.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
vector< std::map< Xapian::valueno, string > > valuelists
Xapian::doccount termfreq
A document read from an InMemoryDatabase.
Xapian::termcount get_unique_terms(Xapian::docid did) const
Get the number of unique terms in a document.
void operator=(const InMemoryDocument &)
Don't allow assignment.
vector< std::string > doclists
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
Xapian::termcount positionlist_count(Xapian::docid did, const string &tname) const
map< string, InMemoryTerm > postlists
bool locked() const
Return true if the database is open for writing.
Xapian::Internal::intrusive_ptr< const InMemoryDatabase > db
Collates statistics while calculating term weight in an ESet.
Definition: expandweight.h:37
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
Xapian::termcount get_doclength(Xapian::docid did) const
Get the length of a given document.
Abstract base class for termlists.
vector< InMemoryPosting >::const_iterator pos
Xapian::docid did
Various assertion macros.
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
Abstract base class for iterating term positions in a document.
Definition: positionlist.h:31
API for working with documents.
A handle representing a document in a Xapian database.
Definition: document.h:61
Types used internally.
A PostList over all docs in an inmemory database.
Xapian::termcount term_freq
vector< InMemoryDoc > termlists