xapian-core  2.0.0
prefix_compressed_strings.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2018,2024 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
22 #define XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
23 
24 #include <xapian/error.h>
25 
26 #include <algorithm>
27 #include <string>
28 
29 #include "honey/honey_spelling.h"
30 #include "stringutils.h"
31 
32 // We XOR the length values with this so that they are more likely to coincide
33 // with lower case ASCII letters, which are likely to be common. This means
34 // that zlib should do a better job of compressing tag values - in tests, this
35 // gave 5% better compression.
36 #define MAGIC_XOR_VALUE 96
37 
39  const unsigned char * p;
40  size_t left;
41  std::string current;
42 
49  int tail = 0;
50 
52  : p(o.p), left(o.left), current(o.current), tail(o.tail) {}
53 
55  : p(o.p), left(o.left), current(std::move(o.current)), tail(o.tail) {}
56 
57  public:
62  explicit PrefixCompressedStringItor(const std::string& s)
63  : p(reinterpret_cast<const unsigned char *>(s.data())),
64  left(s.size()) {
65  if (left) {
66  operator++();
67  } else {
68  p = NULL;
69  }
70  }
71 
77  PrefixCompressedStringItor(const std::string& s,
78  const std::string& key)
79  : p(reinterpret_cast<const unsigned char *>(s.data())),
80  left(s.size()) {
81  Assert(!key.empty());
82  unsigned char first_ch = key[0];
83  AssertRel(first_ch, <, Honey::KEY_PREFIX_WORD);
84  switch (first_ch) {
86  tail = -1;
87  break;
89  tail = -2;
90  break;
92  tail = 2;
93  break;
94  }
95  if (tail != 0)
96  current.assign(key, 1, 2);
97  if (left) {
98  operator++();
99  } else {
100  p = NULL;
101  }
102  }
103 
104  const std::string & operator*() const {
105  return current;
106  }
107 
109  PrefixCompressedStringItor old(*this);
110  operator++();
111  return old;
112  }
113 
115  if (left == 0) {
116  p = NULL;
117  } else {
118  size_t keep = 0;
119  if (rare(tail < 0)) {
120  tail += 2;
121  keep = current.size() - tail;
122  } else if (usual(!current.empty())) {
123  keep = *p++ ^ MAGIC_XOR_VALUE;
124  --left;
125  }
126  size_t add;
127  if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
128  throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
129  current.replace(keep, current.size() - tail - keep,
130  reinterpret_cast<const char *>(p + 1), add);
131  p += add + 1;
132  left -= add + 1;
133  }
134  return *this;
135  }
136 
137  bool at_end() const {
138  return p == NULL;
139  }
140 };
141 
143  std::string current;
144  std::string & out;
145 
146  int tail = 0;
147 
148  public:
153  explicit PrefixCompressedStringWriter(std::string& out_) : out(out_) { }
154 
160  PrefixCompressedStringWriter(std::string& out_,
161  const std::string& key)
162  : out(out_) {
163  Assert(!key.empty());
164  unsigned char first_ch = key[0];
165  AssertRel(first_ch, <, Honey::KEY_PREFIX_WORD);
166  switch (first_ch) {
168  tail = -1;
169  break;
171  tail = -2;
172  break;
174  tail = 2;
175  break;
176  }
177  if (tail != 0)
178  current.assign(key, 1, 2);
179  }
180 
181  void append(const std::string & word) {
182  // If this isn't the first entry, see how much of the previous one
183  // we can reuse.
184  if (rare(tail < 0)) {
185  // First entry for BOOKEND or HEAD (tail is -1 or -2).
186  AssertRel(tail, >=, -2);
187  AssertEq(current[0], word[0]);
188  if (tail == -2) {
189  AssertEq(current[1], word[1]);
190  } else {
191  AssertEq(current.back(), word.back());
192  }
193  out += char((word.size() - 2) ^ MAGIC_XOR_VALUE);
194  out.append(word, -tail, word.size() - 2);
195  tail += 2;
196  } else if (usual(!current.empty())) {
197  // Incremental change.
198  if (tail)
199  AssertEq(current[current.size() - 1], word[word.size() - 1]);
200  if (tail > 1)
201  AssertEq(current[current.size() - 2], word[word.size() - 2]);
202  size_t i = common_prefix_length(current, word);
203  // Don't allow the reused prefix to overlap with tail
204  i = std::min(i, word.size() - tail);
205  out += char(i ^ MAGIC_XOR_VALUE);
206  size_t add = word.size() - i - tail;
207  out += char(add ^ MAGIC_XOR_VALUE);
208  out.append(word.data() + i, add);
209  } else {
210  // First entry for MIDDLE or TAIL (tail is 0 or 2).
211  if (tail) {
212  AssertEq(current[current.size() - 1], word[word.size() - 1]);
213  AssertEq(current[current.size() - 2], word[word.size() - 2]);
214  }
215  out += char((word.size() - tail) ^ MAGIC_XOR_VALUE);
216  out.append(word, 0, word.size() - tail);
217  }
218  current = word;
219  }
220 };
221 
225  const PrefixCompressedStringItor *b) const {
226  return (**a > **b);
227  }
228 };
229 
230 #endif // XAPIAN_INCLUDED_PREFIX_COMPRESSED_STRINGS_H
PrefixCompressedStringItor & operator++()
PrefixCompressedStringItor(const std::string &s, const std::string &key)
Construct for honey.
const std::string & operator*() const
PrefixCompressedStringItor(PrefixCompressedStringItor &&o)
PrefixCompressedStringItor operator++(int)
int tail
Number of constant characters on the end of the value.
PrefixCompressedStringItor(const std::string &s)
Construct for glass.
PrefixCompressedStringItor(PrefixCompressedStringItor &o)
PrefixCompressedStringWriter(std::string &out_, const std::string &key)
Construct for honey.
void append(const std::string &word)
PrefixCompressedStringWriter(std::string &out_)
Construct for glass.
DatabaseCorruptError indicates database corruption was detected.
Definition: error.h:397
#define usual(COND)
Definition: config.h:608
#define rare(COND)
Definition: config.h:607
Hierarchy of classes which Xapian can throw as exceptions.
Spelling correction data for a honey database.
const unsigned KEY_PREFIX_WORD
const unsigned KEY_PREFIX_TAIL
const unsigned KEY_PREFIX_BOOKEND
const unsigned KEY_PREFIX_HEAD
#define AssertEq(A, B)
Definition: omassert.h:124
#define AssertRel(A, REL, B)
Definition: omassert.h:123
#define Assert(COND)
Definition: omassert.h:122
#define MAGIC_XOR_VALUE
Various handy string-related helpers.
std::string::size_type common_prefix_length(std::string_view a, std::string_view b)
Definition: stringutils.h:128
bool operator()(const PrefixCompressedStringItor *a, const PrefixCompressedStringItor *b) const
Return true if and only if a's string is strictly greater than b's.