xapian-core  1.4.26
copydatabase.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2006-2022 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian.h>
24 
25 #include <initializer_list>
26 #include <iomanip>
27 #include <iostream>
28 
29 #include <cmath> // For log10().
30 #include <cstdlib> // For exit().
31 #include <cstring> // For strcmp() and strrchr().
32 
33 using namespace std;
34 
35 #define PROG_NAME "copydatabase"
36 #define PROG_DESC "Perform a document-by-document copy of one or more Xapian databases"
37 
38 static void
39 show_usage(int rc)
40 {
41  cout << "Usage: " PROG_NAME " SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
42 "Options:\n"
43 " --no-renumber Preserve the numbering of document ids (useful if you have\n"
44 " external references to them, or have set them to match\n"
45 " unique ids from an external source). If multiple source\n"
46 " databases are specified and the same docid occurs in more\n"
47 " one, the last occurrence will be the one which ends up in\n"
48 " the destination database.\n"
49 " --help display this help and exit\n"
50 " --version output version information and exit\n";
51  exit(rc);
52 }
53 
54 int
55 main(int argc, char **argv)
56 try {
57  bool renumber = true;
58  if (argc > 1 && argv[1][0] == '-') {
59  if (strcmp(argv[1], "--help") == 0) {
60  cout << PROG_NAME " - " PROG_DESC "\n\n";
61  show_usage(0);
62  }
63  if (strcmp(argv[1], "--version") == 0) {
64  cout << PROG_NAME " - " PACKAGE_STRING "\n";
65  exit(0);
66  }
67  if (strcmp(argv[1], "--no-renumber") == 0) {
68  renumber = false;
69  argv[1] = argv[0];
70  ++argv;
71  --argc;
72  }
73  }
74 
75  // We expect two or more arguments: at least one source database path
76  // followed by the destination database path.
77  if (argc < 3) show_usage(1);
78 
79  // Create the destination database, using DB_CREATE so that we don't
80  // try to overwrite or update an existing database in case the user
81  // got the command line argument order wrong.
82  const char *dest = argv[argc - 1];
84 
85  for (int i = 1; i < argc - 1; ++i) {
86  string src = argv[i];
87  if (!src.empty()) {
88  // Remove any trailing directory separator.
89  char ch = src.back();
90  for (char dir_sep : DIR_SEPS_LIST) {
91  if (ch == dir_sep) {
92  src.resize(src.size() - 1);
93  break;
94  }
95  }
96  }
97 
98  // Open the source database.
99  Xapian::Database db_in(src);
100 
101  // Find the leaf-name of the database path for reporting progress.
102  //
103  // If we found a directory separator, + 1 advances to the next
104  // character; If we didn't, incrementing string::npos will give us 0,
105  // so we use the whole of src as the leaf-name.
106  const char * leaf = src.c_str() + (src.find_last_of(DIR_SEPS) + 1);
107 
108  // Iterate over all the documents in db_in, copying each to db_out.
109  Xapian::doccount dbsize = db_in.get_doccount();
110  if (dbsize == 0) {
111  cout << leaf << ": empty!\n";
112  } else {
113  // Calculate how many decimal digits there are in dbsize.
114  int width = static_cast<int>(log10(double(dbsize))) + 1;
115 
116  Xapian::doccount c = 0;
117  Xapian::PostingIterator it = db_in.postlist_begin(string());
118  while (it != db_in.postlist_end(string())) {
119  Xapian::docid did = *it;
120  if (renumber) {
121  db_out.add_document(db_in.get_document(did));
122  } else {
123  db_out.replace_document(did, db_in.get_document(did));
124  }
125 
126  // Update for the first 10, and then every 13th document
127  // counting back from the end (this means that all the
128  // digits "rotate" and the counter ends up on the exact
129  // total.
130  ++c;
131  if (c <= 10 || (dbsize - c) % 13 == 0) {
132  cout << '\r' << leaf << ": ";
133  cout << setw(width) << c << '/' << dbsize << flush;
134  }
135 
136  ++it;
137  }
138 
139  cout << '\n';
140  }
141 
142  cout << "Copying spelling data..." << flush;
143  Xapian::TermIterator spellword = db_in.spellings_begin();
144  while (spellword != db_in.spellings_end()) {
145  db_out.add_spelling(*spellword, spellword.get_termfreq());
146  ++spellword;
147  }
148  cout << " done.\n";
149 
150  cout << "Copying synonym data..." << flush;
151  Xapian::TermIterator synkey = db_in.synonym_keys_begin();
152  while (synkey != db_in.synonym_keys_end()) {
153  string key = *synkey;
154  Xapian::TermIterator syn = db_in.synonyms_begin(key);
155  while (syn != db_in.synonyms_end(key)) {
156  db_out.add_synonym(key, *syn);
157  ++syn;
158  }
159  ++synkey;
160  }
161  cout << " done.\n";
162 
163  cout << "Copying user metadata..." << flush;
164  Xapian::TermIterator metakey = db_in.metadata_keys_begin();
165  while (metakey != db_in.metadata_keys_end()) {
166  string key = *metakey;
167  db_out.set_metadata(key, db_in.get_metadata(key));
168  ++metakey;
169  }
170  cout << " done.\n";
171  }
172 
173  cout << "Committing..." << flush;
174  // Commit explicitly so that any error is reported.
175  db_out.commit();
176  cout << " done.\n";
177 } catch (const Xapian::Error & e) {
178  cerr << '\n' << argv[0] << ": " << e.get_description() << '\n';
179  exit(1);
180 }
Xapian::Document get_document(Xapian::docid did) const
Get a document from the database, given its document id.
Definition: omdatabase.cc:490
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
Definition: omdatabase.cc:902
#define DIR_SEPS_LIST
Definition: config.h:11
This class is used to access a database, or a group of databases.
Definition: database.h:68
const int DB_CREATE
Create a new database.
Definition: constants.h:44
#define DIR_SEPS
Definition: config.h:8
Xapian::doccount get_termfreq() const
Return the term frequency for the term at the current position.
STL namespace.
Xapian::TermIterator synonyms_end(const std::string &) const
Corresponding end iterator to synonyms_begin(term).
Definition: database.h:447
void replace_document(Xapian::docid did, const Xapian::Document &document)
Replace a given document in the database.
Definition: omdatabase.cc:952
void set_metadata(const std::string &key, const std::string &metadata)
Set the user-specified metadata associated with a given key.
Definition: omdatabase.cc:1064
Xapian::doccount get_doccount() const
Get the number of documents in the database.
Definition: omdatabase.cc:267
std::string get_metadata(const std::string &key) const
Get the user-specified metadata associated with a given key.
Definition: omdatabase.cc:758
Class for iterating over a list of terms.
Definition: termiterator.h:41
Class for iterating over a list of terms.
This class provides read/write access to a database.
Definition: database.h:789
Xapian::TermIterator spellings_begin() const
An iterator which returns all the spelling correction targets.
Definition: omdatabase.cc:704
Public interfaces for the Xapian library.
Xapian::TermIterator synonym_keys_begin(const std::string &prefix=std::string()) const
An iterator which returns all terms which have synonyms.
Definition: omdatabase.cc:740
Xapian::TermIterator synonym_keys_end(const std::string &=std::string()) const
Corresponding end iterator to synonym_keys_begin(prefix).
Definition: database.h:459
void commit()
Commit any pending modifications made to the database.
Definition: omdatabase.cc:857
#define PROG_NAME
Definition: copydatabase.cc:35
Xapian::TermIterator synonyms_begin(const std::string &term) const
An iterator which returns all the synonyms for a given term.
Definition: omdatabase.cc:722
#define PROG_DESC
Definition: copydatabase.cc:36
std::string get_description() const
Return a string describing this object.
Definition: error.cc:93
int main(int argc, char **argv)
Definition: copydatabase.cc:55
static void show_usage(int rc)
Definition: copydatabase.cc:39
unsigned XAPIAN_DOCID_BASE_TYPE doccount
A count of documents.
Definition: types.h:38
All exceptions thrown by Xapian are subclasses of Xapian::Error.
Definition: error.h:43
#define PACKAGE_STRING
Definition: config.h:337
Xapian::TermIterator metadata_keys_begin(const std::string &prefix=std::string()) const
An iterator which returns all user-specified metadata keys.
Definition: omdatabase.cc:768
Xapian::TermIterator spellings_end() const
Corresponding end iterator to spellings_begin().
Definition: database.h:436
void add_synonym(const std::string &term, const std::string &synonym) const
Add a synonym for a term.
Definition: omdatabase.cc:1028
unsigned XAPIAN_DOCID_BASE_TYPE docid
A unique identifier for a document.
Definition: types.h:52
PostingIterator postlist_end(const std::string &) const
Corresponding end iterator to postlist_begin().
Definition: database.h:225
void add_spelling(const std::string &word, Xapian::termcount freqinc=1) const
Add a word to the spelling dictionary.
Definition: omdatabase.cc:1004
PostingIterator postlist_begin(const std::string &tname) const
An iterator pointing to the start of the postlist for a given term.
Definition: omdatabase.cc:162
Xapian::TermIterator metadata_keys_end(const std::string &=std::string()) const
Corresponding end iterator to metadata_keys_begin().
Definition: database.h:510