net/serialise.cc

Go to the documentation of this file.
00001 /* @file serialise.cc
00002  * @brief functions to convert Xapian objects to strings and back
00003  */
00004 /* Copyright (C) 2006,2007,2010 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/document.h>
00024 #include <xapian/error.h>
00025 #include <xapian/positioniterator.h>
00026 #include <xapian/termiterator.h>
00027 #include <xapian/valueiterator.h>
00028 
00029 #include "omassert.h"
00030 #include "omenquireinternal.h"
00031 #include "serialise.h"
00032 #include "serialise-double.h"
00033 #include "stats.h"
00034 #include "utils.h"
00035 
00036 #include <string>
00037 #include <string.h>
00038 
00039 using namespace std;
00040 
/** Encode an unsigned length as a compact byte string.
 *
 *  Values below 255 are stored as a single byte holding the value itself.
 *  Anything larger is stored as a 0xff marker byte followed by (len - 255)
 *  split into 7-bit groups, least significant group first; only the last
 *  group has its top bit set.
 *
 *  @param len  The value to encode.
 *  @return     The encoded bytes.
 */
std::string
encode_length(size_t len)
{
    if (len < 255) {
        // Short form: the value is the whole encoding.
        return std::string(1, static_cast<char>(static_cast<unsigned char>(len)));
    }

    std::string encoded(1, '\xff');
    size_t remaining = len - 255;
    for (;;) {
        const unsigned char low7 = static_cast<unsigned char>(remaining & 0x7f);
        remaining >>= 7;
        if (remaining == 0) {
            // Final group: mark it by setting the top bit.
            encoded += static_cast<char>(static_cast<unsigned char>(low7 | 0x80));
            return encoded;
        }
        encoded += static_cast<char>(low7);
    }
}
00062 
00063 size_t
00064 decode_length(const char ** p, const char *end, bool check_remaining)
00065 {
00066     if (*p == end) {
00067         throw Xapian::NetworkError("Bad encoded length: no data");
00068     }
00069 
00070     size_t len = static_cast<unsigned char>(*(*p)++);
00071     if (len == 0xff) {
00072         len = 0;
00073         unsigned char ch;
00074         int shift = 0;
00075         do {
00076             if (*p == end || shift > 28)
00077                 throw Xapian::NetworkError("Bad encoded length: insufficient data");
00078             ch = *(*p)++;
00079             len |= size_t(ch & 0x7f) << shift;
00080             shift += 7;
00081         } while ((ch & 0x80) == 0);
00082         len += 255;
00083     }
00084     if (check_remaining && len > size_t(end - *p)) {
00085         throw Xapian::NetworkError("Bad encoded length: length greater than data");
00086     }
00087     return len;
00088 }
00089 
00090 string
00091 serialise_error(const Xapian::Error &e)
00092 {
00093     string result;
00094     result += encode_length(strlen(e.get_type()));
00095     result += e.get_type();
00096     result += encode_length(e.get_context().length());
00097     result += e.get_context();
00098     result += encode_length(e.get_msg().length());
00099     result += e.get_msg();
00100     // The "error string" goes last so we don't need to store its length.
00101     const char * err = e.get_error_string();
00102     if (err) result += err;
00103     return result;
00104 }
00105 
/** Decode a serialised error and throw it.
 *
 *  The input is read as three length-prefixed fields (type, context,
 *  message) followed by an optional error string that runs to the end.
 *
 *  @param serialised_error  The serialised error data.
 *  @param prefix            Text placed at the start of the decoded message.
 *  @param new_context       If this and the decoded context are both
 *                           non-empty, the decoded context is appended to
 *                           the message and replaced by this value.
 *
 *  The code visible here ends in an unconditional throw: a type of
 *  "UNKNOWN" gives Xapian::InternalError("UNKNOWN"), and anything reaching
 *  the end gives Xapian::InternalError naming the unrecognised type.
 *  decode_length() may also throw on malformed data.
 */
00106 void
00107 unserialise_error(const string &serialised_error, const string &prefix,
00108                   const string &new_context)
00109 {
00110     // Use c_str() so last string is nul-terminated.
          // (The trailing error string is later used as a plain const char *.)
00111     const char * p = serialised_error.c_str();
00112     const char * end = p + serialised_error.size();
00113     size_t len;
          // Field 1: exception type name.
00114     len = decode_length(&p, end, true);
00115     if (len == 7 && memcmp(p, "UNKNOWN", 7) == 0) {
00116         throw Xapian::InternalError("UNKNOWN");
00117     }
00118     string type(p, len);
00119     p += len;
00120 
          // Field 2: context.
00121     len = decode_length(&p, end, true);
00122     string context(p, len);
00123     p += len;
00124 
          // Field 3: message, with the caller's prefix in front.
00125     len = decode_length(&p, end, true);
00126     string msg(prefix);
00127     msg.append(p, len);
00128     p += len;
00129 
          // Whatever is left is the error string; NULL when nothing is left.
00130     const char * error_string = (p == end) ? NULL : p;
00131 
          // Keep the decoded context in the message before overriding it.
00132     if (!context.empty() && !new_context.empty()) {
00133         msg += "; context was: ";
00134         msg += context;
00135         context = new_context;
00136     }
00137 
          // NOTE(review): this header is textually included inside the
          // function body; presumably it expands to code which throws the
          // exception class matching `type`, using the locals `msg`,
          // `context` and `error_string` - confirm against the header before
          // renaming or reordering any of the locals above.
00138 #include <xapian/errordispatch.h>
00139 
          // Reached only if nothing above threw: report the unrecognised type.
00140     string newmsg = "Unknown remote exception type ";
00141     newmsg += type;
00142     newmsg += ": ";
00143     newmsg += msg;
00144     throw Xapian::InternalError(newmsg, context);
00145 }
00146 
00147 string serialise_stats(const Stats &stats)
00148 {
00149     string result;
00150 
00151     result += encode_length(stats.collection_size);
00152     result += encode_length(stats.rset_size);
00153     result += serialise_double(stats.average_length);
00154 
00155     map<string, Xapian::doccount>::const_iterator i;
00156 
00157     result += encode_length(stats.termfreq.size());
00158     for (i = stats.termfreq.begin(); i != stats.termfreq.end(); ++i) {
00159         result += encode_length(i->first.size());
00160         result += i->first;
00161         result += encode_length(i->second);
00162     }
00163 
00164     for (i = stats.reltermfreq.begin(); i != stats.reltermfreq.end(); ++i) {
00165         result += encode_length(i->first.size());
00166         result += i->first;
00167         result += encode_length(i->second);
00168     }
00169 
00170     return result;
00171 }
00172 
00173 Stats
00174 unserialise_stats(const string &s)
00175 {
00176     const char * p = s.data();
00177     const char * p_end = p + s.size();
00178 
00179     Stats stat;
00180 
00181     stat.collection_size = decode_length(&p, p_end, false);
00182     stat.rset_size = decode_length(&p, p_end, false);
00183     stat.average_length = unserialise_double(&p, p_end);
00184 
00185     size_t n = decode_length(&p, p_end, false);
00186     while (n--) {
00187         size_t len = decode_length(&p, p_end, true);
00188         string term(p, len);
00189         p += len;
00190         stat.termfreq.insert(make_pair(term, decode_length(&p, p_end, false)));
00191     }
00192 
00193     while (p != p_end) {
00194         size_t len = decode_length(&p, p_end, true);
00195         string term(p, len);
00196         p += len;
00197         stat.reltermfreq.insert(make_pair(term, decode_length(&p, p_end, false)));
00198     }
00199 
00200     return stat;
00201 }
00202 
00203 string
00204 serialise_mset_pre_30_5(const Xapian::MSet &mset)
00205 {
00206     string result;
00207 
00208     result += encode_length(mset.get_firstitem());
00209     result += encode_length(mset.get_matches_lower_bound());
00210     result += encode_length(mset.get_matches_estimated());
00211     result += encode_length(mset.get_matches_upper_bound());
00212     result += serialise_double(mset.get_max_possible());
00213     result += serialise_double(mset.get_max_attained());
00214     result += encode_length(mset.size());
00215     for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
00216         result += serialise_double(i.get_weight());
00217         result += encode_length(*i);
00218         result += encode_length(i.get_collapse_key().size());
00219         result += i.get_collapse_key();
00220         result += encode_length(i.get_collapse_count());
00221     }
00222 
00223     const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts
00224         = mset.internal->termfreqandwts;
00225 
00226     map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j;
00227     for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) {
00228         result += encode_length(j->first.size());
00229         result += j->first;
00230         result += encode_length(j->second.termfreq);
00231         result += serialise_double(j->second.termweight);
00232     }
00233 
00234     return result;
00235 }
00236 
00237 string
00238 serialise_mset(const Xapian::MSet &mset)
00239 {
00240     string result;
00241 
00242     result += encode_length(mset.get_firstitem());
00243     result += encode_length(mset.get_matches_lower_bound());
00244     result += encode_length(mset.get_matches_estimated());
00245     result += encode_length(mset.get_matches_upper_bound());
00246     result += serialise_double(mset.get_max_possible());
00247     result += serialise_double(mset.get_max_attained());
00248 
00249     result += serialise_double(mset.internal->percent_factor);
00250 
00251     result += encode_length(mset.size());
00252     for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
00253         result += serialise_double(i.get_weight());
00254         result += encode_length(*i);
00255         result += encode_length(i.get_collapse_key().size());
00256         result += i.get_collapse_key();
00257         result += encode_length(i.get_collapse_count());
00258     }
00259 
00260     const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts
00261         = mset.internal->termfreqandwts;
00262 
00263     map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j;
00264     for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) {
00265         result += encode_length(j->first.size());
00266         result += j->first;
00267         result += encode_length(j->second.termfreq);
00268         result += serialise_double(j->second.termweight);
00269     }
00270 
00271     return result;
00272 }
00273 
00274 Xapian::MSet
00275 unserialise_mset(const string &s)
00276 {
00277     const char * p = s.data();
00278     const char * p_end = p + s.size();
00279 
00280     Xapian::doccount firstitem = decode_length(&p, p_end, false);
00281     Xapian::doccount matches_lower_bound = decode_length(&p, p_end, false);
00282     Xapian::doccount matches_estimated = decode_length(&p, p_end, false);
00283     Xapian::doccount matches_upper_bound = decode_length(&p, p_end, false);
00284     Xapian::weight max_possible = unserialise_double(&p, p_end);
00285     Xapian::weight max_attained = unserialise_double(&p, p_end);
00286 
00287     double percent_factor = unserialise_double(&p, p_end);
00288 
00289     vector<Xapian::Internal::MSetItem> items;
00290     size_t msize = decode_length(&p, p_end, false);
00291     while (msize-- > 0) {
00292         Xapian::weight wt = unserialise_double(&p, p_end);
00293         Xapian::docid did = decode_length(&p, p_end, false);
00294         size_t len = decode_length(&p, p_end, true);
00295         string key(p, len);
00296         p += len;
00297         items.push_back(Xapian::Internal::MSetItem(wt, did, key,
00298                                                    decode_length(&p, p_end, false)));
00299     }
00300 
00301     map<string, Xapian::MSet::Internal::TermFreqAndWeight> terminfo;
00302     while (p != p_end) {
00303         Xapian::MSet::Internal::TermFreqAndWeight tfaw;
00304         size_t len = decode_length(&p, p_end, true);
00305         string term(p, len);
00306         p += len;
00307         tfaw.termfreq = decode_length(&p, p_end, false);
00308         tfaw.termweight = unserialise_double(&p, p_end);
00309         terminfo.insert(make_pair(term, tfaw));
00310     }
00311 
00312     return Xapian::MSet(new Xapian::MSet::Internal(
00313                                        firstitem,
00314                                        matches_upper_bound,
00315                                        matches_lower_bound,
00316                                        matches_estimated,
00317                                        max_possible, max_attained,
00318                                        items, terminfo, percent_factor));
00319 }
00320 
00321 string
00322 serialise_rset(const Xapian::RSet &rset)
00323 {
00324     const set<Xapian::docid> & items = rset.internal->get_items();
00325     string result;
00326     set<Xapian::docid>::const_iterator i;
00327     Xapian::docid lastdid = 0;
00328     for (i = items.begin(); i != items.end(); ++i) {
00329         Xapian::docid did = *i;
00330         result += encode_length(did - lastdid - 1);
00331         lastdid = did;
00332     }
00333     return result;
00334 }
00335 
00336 Xapian::RSet
00337 unserialise_rset(const string &s)
00338 {
00339     Xapian::RSet rset;
00340 
00341     const char * p = s.data();
00342     const char * p_end = p + s.size();
00343 
00344     Xapian::docid did = 0;
00345     while (p != p_end) {
00346         did += decode_length(&p, p_end, false) + 1;
00347         rset.add_document(did);
00348     }
00349 
00350     return rset;
00351 }
00352 
00353 string
00354 serialise_document(const Xapian::Document &doc)
00355 {
00356     string result;
00357 
00358     size_t n = doc.values_count();
00359     result += encode_length(n);
00360     Xapian::ValueIterator value;
00361     for (value = doc.values_begin(); value != doc.values_end(); ++value) {
00362         result += encode_length(value.get_valueno());
00363         result += encode_length((*value).size());
00364         result += *value;
00365         --n;
00366     }
00367     Assert(n == 0);
00368 
00369     n = doc.termlist_count();
00370     result += encode_length(n);
00371     Xapian::TermIterator term;
00372     for (term = doc.termlist_begin(); term != doc.termlist_end(); ++term) {
00373         result += encode_length((*term).size());
00374         result += *term;
00375         result += encode_length(term.get_wdf());
00376 
00377         size_t x = term.positionlist_count();
00378         result += encode_length(x);
00379         Xapian::PositionIterator pos;
00380         Xapian::termpos oldpos = 0;
00381         for (pos = term.positionlist_begin(); pos != term.positionlist_end(); ++pos) {
00382             Xapian::termpos diff = *pos - oldpos;
00383             string delta = encode_length(diff);
00384             result += delta;
00385             oldpos = *pos;
00386             --x;
00387         }
00388         Assert(x == 0);
00389         --n;
00390     }
00391     Assert(n == 0);
00392 
00393     result += doc.get_data();
00394     return result;
00395 }
00396 
00397 Xapian::Document
00398 unserialise_document(const string &s)
00399 {
00400     Xapian::Document doc;
00401     const char * p = s.data();
00402     const char * p_end = p + s.size();
00403 
00404     size_t n_values = decode_length(&p, p_end, false);
00405     while (n_values--) {
00406         Xapian::valueno valno = decode_length(&p, p_end, false);
00407         size_t len = decode_length(&p, p_end, true);
00408         doc.add_value(valno, string(p, len));
00409         p += len;
00410     }
00411 
00412     size_t n_terms = decode_length(&p, p_end, false);
00413     while (n_terms--) {
00414         size_t len = decode_length(&p, p_end, true);
00415         string term(p, len);
00416         p += len;
00417 
00418         // Set all the wdf using add_term, then pass wdf_inc 0 to add_posting.
00419         Xapian::termcount wdf = decode_length(&p, p_end, false);
00420         doc.add_term(term, wdf);
00421 
00422         size_t n_pos = decode_length(&p, p_end, false);
00423         Xapian::termpos pos = 0;
00424         while (n_pos--) {
00425             pos += decode_length(&p, p_end, false);
00426             doc.add_posting(term, pos, 0);
00427         }
00428     }
00429 
00430     doc.set_data(string(p, p_end - p));
00431     return doc;
00432 }

Documentation for Xapian (version 1.0.20).
Generated on 28 Apr 2010 by Doxygen 1.5.2.