00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <config.h>
00023
00024 #include "safeerrno.h"
00025
00026 #include <iostream>
00027
00028 #include <cstdio>
00029 #include <cstdlib>
00030 #include <cstring>
00031 #include "safesysstat.h"
00032 #include <sys/types.h>
00033 #include "utils.h"
00034
00035 #include "chert_table.h"
00036
00037 #include "flint_table.h"
00038 #include "flint_cursor.h"
00039 #include "flint_utils.h"
00040 #include "pack.h"
00041
00042 #include "safeunistd.h"
00043 #include "safefcntl.h"
00044
00045 #ifdef __WIN32__
00046 # include "safewindows.h"
00047 #endif
00048
00049 #include "stringutils.h"
00050
00051 #include <xapian.h>
00052
00053 #include "gnu_getopt.h"
00054
00055 using namespace std;
00056
// Program name and one-line description, spliced into the usage, version
// and error messages at compile time.
#define PROG_NAME "chert-update"
#define PROG_DESC "Update a chert database to the new format keys"

// Codes returned by the option parser for options which only have a long
// form (see long_opts in main()).  OPT_NO_RENUMBER is not referenced by the
// code visible in this file.
#define OPT_HELP 1
#define OPT_VERSION 2
#define OPT_NO_RENUMBER 3

/** Write the command-line synopsis and option summary to standard output.
 *
 *  The literals and the PROG_NAME macro are separated by whitespace: since
 *  C++11 a string literal immediately followed by an identifier is lexed as
 *  a user-defined literal, so "..."PROG_NAME no longer concatenates portably.
 *  The text written is unchanged.
 */
static void show_usage() {
    std::cout << "Usage: " PROG_NAME " [OPTIONS] SOURCE_DATABASE DESTINATION_DATABASE\n\n"
"Options:\n"
" -b, --blocksize Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
" (must be between 2K and 64K and a power of 2, default 8K)\n"
" --help display this help and exit\n"
" --version output version information and exit" << std::endl;
}
00072
00074 static bool
00075 append_filename_argument(string & cmd, const string & arg) {
00076 #ifdef __WIN32__
00077 cmd.reserve(cmd.size() + arg.size() + 3);
00078 cmd += " \"";
00079 for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
00080 if (*i == '/') {
00081
00082
00083
00084 cmd += '\\';
00085 } else if (*i < 32 || strchr("<>\"|*?", *i)) {
00086
00087 return false;
00088 } else {
00089 cmd += *i;
00090 }
00091 }
00092 cmd += '"';
00093 #else
00094
00095 cmd.reserve(cmd.size() + arg.size() + 10);
00096
00097
00098
00099 if (arg[0] == '-')
00100 cmd += " ./";
00101 else
00102 cmd += ' ';
00103
00104 for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
00105
00106 if (!C_isalnum(*i) && strchr("/._-", *i) == NULL) {
00107 cmd += '\\';
00108 }
00109 cmd += *i;
00110 }
00111 #endif
00112 return true;
00113 }
00114
#ifdef __WIN32__
/** Report whether the program is running on a Windows 9x family platform.
 *
 *  The answer from GetVersionEx() is cached in a function-local static after
 *  the first successful query.
 *
 *  @return true only if GetVersionEx() succeeded and reported
 *          VER_PLATFORM_WIN32_WINDOWS; false otherwise, including when the
 *          version query fails.
 */
static bool running_on_win9x() {
    // -1 means "not determined yet"; otherwise holds 0 or 1.
    static int win9x = -1;
    if (win9x == -1) {
        OSVERSIONINFO info;
        memset(&info, 0, sizeof(OSVERSIONINFO));
        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
        if (GetVersionEx(&info)) {
            win9x = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
        }
    }
    // Compare explicitly: if the query failed the cache still holds -1,
    // and a plain int-to-bool conversion would report that as "Win9x".
    return win9x == 1;
}
#endif
00129
00131 static void rm_rf(const string &filename) {
00132
00133 struct stat sb;
00134 if (filename.empty() || stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode))
00135 return;
00136
00137 #ifdef __WIN32__
00138 string cmd;
00139 if (running_on_win9x()) {
00140
00141 cmd = "deltree /y";
00142 } else {
00143
00144 cmd = "rd /s /q";
00145 }
00146 #else
00147 string cmd("rm -rf");
00148 #endif
00149 if (!append_filename_argument(cmd, filename)) return;
00150 system(cmd);
00151 }
00152
00153 static void
00154 copy_position(FlintTable &in, ChertTable *out)
00155 {
00156 in.open();
00157 if (in.empty()) return;
00158
00159 FlintCursor cur(&in);
00160 cur.find_entry(string());
00161
00162 string newkey;
00163 while (cur.next()) {
00164 const string & key = cur.current_key;
00165 const char * d = key.data();
00166 const char * e = d + key.size();
00167 Xapian::docid did;
00168 if (!F_unpack_uint_preserving_sort(&d, e, &did) || d == e)
00169 throw Xapian::DatabaseCorruptError("Bad docid key");
00170 newkey.resize(0);
00171 pack_uint_preserving_sort(newkey, did);
00172 newkey.append(d, e - d);
00173 bool compressed = cur.read_tag(true);
00174 out->add(newkey, cur.current_tag, compressed);
00175 }
00176 }
00177
00178 static void
00179 copy_postlist(FlintTable &in, ChertTable *out)
00180 {
00181 const string firstvaluechunk("\0\xd8", 2);
00182 const string firstdoclenchunk("\0\xe0", 2);
00183 const string firstchunk("\0\xff", 2);
00184
00185 in.open();
00186 if (in.empty()) return;
00187
00188
00189 FlintCursor cur(&in);
00190 cur.find_entry(string());
00191 while (true) {
00192 if (!cur.next()) return;
00193 if (cur.current_key >= firstvaluechunk) break;
00194 bool compressed = cur.read_tag(true);
00195 out->add(cur.current_key, cur.current_tag, compressed);
00196 }
00197
00198
00199 string newkey;
00200 do {
00201 const string & key = cur.current_key;
00202 const char * d = key.data();
00203 const char * d_orig = d;
00204 const char * e = d + key.size();
00205 d += 2;
00206 Xapian::valueno slot;
00207 if (!unpack_uint(&d, e, &slot))
00208 throw Xapian::DatabaseCorruptError("Bad value chunk key (no slot)");
00209 newkey.assign(d_orig, d - d_orig);
00210 Xapian::docid did;
00211 if (!F_unpack_uint_preserving_sort(&d, e, &did))
00212 throw Xapian::DatabaseCorruptError("Bad value chunk key (no docid)");
00213 if (d != e)
00214 throw Xapian::DatabaseCorruptError("Bad value chunk key (trailing junk)");
00215 pack_uint_preserving_sort(newkey, did);
00216 bool compressed = cur.read_tag(true);
00217 out->add(newkey, cur.current_tag, compressed);
00218 if (!cur.next()) return;
00219 } while (cur.current_key < firstdoclenchunk);
00220
00221
00222 do {
00223 const string & key = cur.current_key;
00224 const char * d = key.data();
00225 const char * e = d + key.size();
00226 newkey.assign(d, 2);
00227 d += 2;
00228 if (d != e) {
00229 Xapian::docid did;
00230 if (!F_unpack_uint_preserving_sort(&d, e, &did))
00231 throw Xapian::DatabaseCorruptError("Bad doclen chunk key (no docid)");
00232 if (d != e)
00233 throw Xapian::DatabaseCorruptError("Bad doclen chunk key (trailing junk)");
00234 pack_uint_preserving_sort(newkey, did);
00235 }
00236 bool compressed = cur.read_tag(true);
00237 out->add(newkey, cur.current_tag, compressed);
00238 if (!cur.next()) return;
00239 } while (cur.current_key < firstchunk);
00240
00241 do {
00242 const string & key = cur.current_key;
00243 const char * d = key.data();
00244 const char * e = d + key.size();
00245 string term;
00246 if (!F_unpack_string_preserving_sort(&d, e, term))
00247 throw Xapian::DatabaseCorruptError("Bad postlist key");
00248 if (d == e) {
00249
00250 newkey = pack_chert_postlist_key(term);
00251 } else {
00252
00253 Xapian::docid firstdid;
00254 if (!F_unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00255 throw Xapian::DatabaseCorruptError("Bad postlist key");
00256 newkey = pack_chert_postlist_key(term, firstdid);
00257 }
00258 bool compressed = cur.read_tag(true);
00259 out->add(newkey, cur.current_tag, compressed);
00260 } while (cur.next());
00261 }
00262
00263 static void
00264 copy_unchanged(FlintTable &in, ChertTable *out)
00265 {
00266 in.open();
00267 if (in.empty()) return;
00268
00269 FlintCursor cur(&in);
00270 cur.find_entry(string());
00271 while (cur.next()) {
00272 bool compressed = cur.read_tag(true);
00273 out->add(cur.current_key, cur.current_tag, compressed);
00274 }
00275 }
00276
00277 static void
00278 copy_termlist(FlintTable &in, ChertTable *out)
00279 {
00280 in.open();
00281 if (in.empty()) return;
00282
00283 FlintCursor cur(&in);
00284 cur.find_entry(string());
00285
00286 string newkey;
00287 while (cur.next()) {
00288 const string & key = cur.current_key;
00289 const char * d = key.data();
00290 const char * e = d + key.size();
00291 Xapian::docid did;
00292 if (!F_unpack_uint_preserving_sort(&d, e, &did))
00293 throw Xapian::DatabaseCorruptError("Bad termlist key");
00294 newkey.resize(0);
00295 pack_uint_preserving_sort(newkey, did);
00296 if (d != e) {
00297
00298 if (*d++ != '\0' || d != e)
00299 throw Xapian::DatabaseCorruptError("Bad termlist key");
00300 newkey.append(1, '\0');
00301 }
00302 bool compressed = cur.read_tag(true);
00303 out->add(newkey, cur.current_tag, compressed);
00304 }
00305 }
00306
00307 static void
00308 copy_docid_keyed(FlintTable &in, ChertTable *out)
00309 {
00310 in.open();
00311 if (in.empty()) return;
00312
00313 FlintCursor cur(&in);
00314 cur.find_entry(string());
00315
00316 string newkey;
00317 while (cur.next()) {
00318 const string & key = cur.current_key;
00319 const char * d = key.data();
00320 const char * e = d + key.size();
00321 Xapian::docid did;
00322 if (!F_unpack_uint_preserving_sort(&d, e, &did) || d != e)
00323 throw Xapian::DatabaseCorruptError("Bad docid key");
00324 newkey.resize(0);
00325 pack_uint_preserving_sort(newkey, did);
00326 bool compressed = cur.read_tag(true);
00327 out->add(newkey, cur.current_tag, compressed);
00328 }
00329 }
00330
00331 int
00332 main(int argc, char **argv)
00333 {
00334 const char * opts = "b";
00335 const struct option long_opts[] = {
00336 {"help", no_argument, 0, OPT_HELP},
00337 {"version", no_argument, 0, OPT_VERSION},
00338 {NULL, 0, 0, 0}
00339 };
00340
00341 size_t block_size = 8192;
00342
00343 int c;
00344 while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
00345 switch (c) {
00346 case 'b': {
00347 char *p;
00348 block_size = strtoul(optarg, &p, 10);
00349 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00350 ++p;
00351 block_size *= 1024;
00352 }
00353 if (*p || block_size < 2048 || block_size > 65536 ||
00354 (block_size & (block_size - 1)) != 0) {
00355 cerr << PROG_NAME": Bad value '" << optarg
00356 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00357 << endl;
00358 exit(1);
00359 }
00360 break;
00361 }
00362 case OPT_HELP:
00363 cout << PROG_NAME" - "PROG_DESC"\n\n";
00364 show_usage();
00365 exit(0);
00366 case OPT_VERSION:
00367 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00368 exit(0);
00369 default:
00370 show_usage();
00371 exit(1);
00372 }
00373 }
00374
00375 if (argc - optind != 2) {
00376 show_usage();
00377 exit(1);
00378 }
00379
00380
00381 const char *destdir = argv[argc - 1];
00382
00383 try {
00384 const char *srcdir = argv[optind];
00385
00386 if (strcmp(srcdir, destdir) == 0) {
00387 cout << argv[0]
00388 << ": destination may not be the same as the source directory."
00389 << endl;
00390 exit(1);
00391 }
00392
00393 {
00394 struct stat sb;
00395 if (stat(string(srcdir) + "/iamchert", &sb) != 0) {
00396 cout << argv[0] << ": '" << srcdir
00397 << "' is not a chert database directory" << endl;
00398 exit(1);
00399 }
00400 try {
00401
00402 Xapian::Database db(srcdir);
00403 cout << argv[0] << ": '" << srcdir
00404 << "' is already the latest chert format" << endl;
00405 exit(1);
00406 } catch (const Xapian::DatabaseVersionError &) {
00407
00408
00409 }
00410 }
00411
00412
00413 if (mkdir(destdir, 0755) < 0) {
00414
00415
00416
00417 if (errno == EEXIST) {
00418 struct stat sb;
00419 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00420 errno = 0;
00421 else
00422 errno = EEXIST;
00423 }
00424 if (errno) {
00425 cerr << argv[0] << ": cannot create directory '"
00426 << destdir << "': " << strerror(errno) << endl;
00427 exit(1);
00428 }
00429 }
00430
00431 enum table_type {
00432 POSTLIST, RECORD, TERMLIST, POSITION, SPELLING, SYNONYM
00433 };
00434 struct table_list {
00435
00436 const char * name;
00437
00438 table_type type;
00439
00440 int compress_strategy;
00441
00442 bool lazy;
00443 };
00444
00445 static const table_list tables[] = {
00446
00447 { "postlist", POSTLIST, DONT_COMPRESS, false },
00448 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
00449 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
00450 { "position", POSITION, DONT_COMPRESS, true },
00451 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
00452 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
00453 };
00454 const table_list * tables_end = tables +
00455 (sizeof(tables) / sizeof(tables[0]));
00456
00457 for (const table_list * t = tables; t < tables_end; ++t) {
00458 bool bad_stat = false;
00459 off_t in_size = 0;
00460
00461
00462
00463
00464 cout << t->name << " ..." << flush;
00465
00466 string s(srcdir);
00467 s += '/';
00468 s += t->name;
00469 s += '.';
00470 {
00471 struct stat sb;
00472 if (stat(s + "DB", &sb) == 0) {
00473 in_size += sb.st_size / 1024;
00474 } else if (errno != ENOENT) {
00475
00476 bad_stat = true;
00477 } else if (t->type == TERMLIST) {
00478 cout << '\r' << t->name << ": doesn't exist" << endl;
00479 continue;
00480 }
00481 }
00482
00483 FlintTable in(t->name, s, true, DONT_COMPRESS, t->lazy);
00484
00485 string dest = destdir;
00486 dest += '/';
00487 dest += t->name;
00488 dest += '.';
00489
00490 ChertTable out(t->name, dest, false, t->compress_strategy, t->lazy);
00491 if (!t->lazy) {
00492 out.create_and_open(block_size);
00493 } else {
00494 out.erase();
00495 out.set_block_size(block_size);
00496 }
00497
00498 out.set_full_compaction(true);
00499
00500
00501
00502
00503
00504 switch (t->type) {
00505 case POSITION:
00506 copy_position(in, &out);
00507 break;
00508 case POSTLIST:
00509 copy_postlist(in, &out);
00510 break;
00511 case SPELLING: case SYNONYM:
00512 copy_unchanged(in, &out);
00513 break;
00514 case TERMLIST:
00515 copy_termlist(in, &out);
00516 break;
00517 default:
00518
00519 copy_docid_keyed(in, &out);
00520 break;
00521 }
00522
00523
00524 out.flush_db();
00525 out.commit(1);
00526
00527 cout << '\r' << t->name << ": ";
00528 off_t out_size = 0;
00529 if (!bad_stat) {
00530 struct stat sb;
00531 if (stat(dest + "DB", &sb) == 0) {
00532 out_size = sb.st_size / 1024;
00533 } else {
00534 bad_stat = (errno != ENOENT);
00535 }
00536 }
00537 if (bad_stat) {
00538 cout << "Done (couldn't stat all the DB files)";
00539 } else {
00540 if (out_size == in_size) {
00541 cout << "Size unchanged (";
00542 } else if (out_size < in_size) {
00543 cout << "Reduced by "
00544 << 100 * double(in_size - out_size) / in_size << "% "
00545 << in_size - out_size << "K (" << in_size << "K -> ";
00546 } else {
00547 cout << "INCREASED by "
00548 << 100 * double(out_size - in_size) / in_size << "% "
00549 << out_size - in_size << "K (" << in_size << "K -> ";
00550 }
00551 cout << out_size << "K)";
00552 }
00553 cout << endl;
00554 }
00555
00556
00557
00558
00559
00560
00561
00562 string donor = destdir;
00563 donor += "/donor.tmp";
00564
00565 (void)Xapian::Chert::open(donor, Xapian::DB_CREATE_OR_OVERWRITE);
00566 string from = donor;
00567 from += "/iamchert";
00568 string to(destdir);
00569 to += "/iamchert";
00570 if (rename(from.c_str(), to.c_str()) == -1) {
00571 cerr << argv[0] << ": cannot rename '" << from << "' to '"
00572 << to << "': " << strerror(errno) << endl;
00573 exit(1);
00574 }
00575
00576 rm_rf(donor);
00577 } catch (const Xapian::Error &error) {
00578 cerr << argv[0] << ": " << error.get_description() << endl;
00579 exit(1);
00580 } catch (const char * msg) {
00581 cerr << argv[0] << ": " << msg << endl;
00582 exit(1);
00583 }
00584 }