xapian-core  1.4.18
valuerangeproc.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007,2008,2009,2010,2012,2016,2018 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/queryparser.h>
24 
25 #include <cerrno>
26 #include <cstdlib> // For atoi().
27 
28 #include <string>
29 #include "stringutils.h"
30 
31 using namespace std;
32 
33 namespace Xapian {
34 
36 StringValueRangeProcessor::operator()(string &begin, string &end)
37 {
38  if (str.size()) {
39  if (prefix) {
40  // If there's a prefix, require it on the start of the range.
41  if (!startswith(begin, str)) {
42  // Prefix not given.
43  return Xapian::BAD_VALUENO;
44  }
45  begin.erase(0, str.size());
46  // But it's optional on the end of the range, e.g. $10..50
47  if (startswith(end, str)) {
48  end.erase(0, str.size());
49  }
50  } else {
51  // If there's a suffix, require it on the end of the range.
52  if (!endswith(end, str)) {
53  // Suffix not given.
54  return Xapian::BAD_VALUENO;
55  }
56  end.resize(end.size() - str.size());
57  // But it's optional on the start of the range, e.g. 10..50kg
58  if (endswith(begin, str)) {
59  begin.resize(begin.size() - str.size());
60  }
61  }
62  }
63  return valno;
64 }
65 
/** Decode a date-like string of the form X1<sep>X2<sep>Y.
 *
 *  X1 and X2 are one or two decimal digits each (value 1..31), Y is one to
 *  four decimal digits, and each separator is one of '/', '-' or '.'.
 *
 *  An empty string is accepted and reported as x1 == x2 == y == -1, so that
 *  callers can treat an open-ended range uniformly.
 *
 *  @param s   The string to decode.
 *  @param x1  Set to the first field.
 *  @param x2  Set to the second field.
 *  @param y   Set to the third (year) field.
 *
 *  @return  true if @a s is empty or has the expected shape, false otherwise.
 *           On failure the output parameters may have been partly written.
 */
static bool
decode_xxy(const std::string & s, int & x1, int &x2, int &y)
{
    if (s.empty()) {
        x1 = x2 = y = -1;
        return true;
    }
    if (s.size() < 5 || s.size() > 10) return false;
    // If no separator is found, i is npos and the range test rejects it
    // before s[i] is evaluated.
    size_t i = s.find_first_not_of("0123456789");
    if (i < 1 || i > 2 || !(s[i] == '/' || s[i] == '-' || s[i] == '.'))
        return false;
    // Likewise an npos j makes the unsigned difference huge, so it is
    // rejected before s[j] is evaluated.
    size_t j = s.find_first_not_of("0123456789", i + 1);
    if (j - (i + 1) < 1 || j - (i + 1) > 2 ||
        !(s[j] == '/' || s[j] == '-' || s[j] == '.'))
        return false;
    // Require 1 to 4 year digits.  Without the lower bound, input such as
    // "01/1/" would be accepted with y == 0.
    const size_t year_len = s.size() - (j + 1);
    if (year_len < 1 || year_len > 4) return false;
    if (s.find_first_not_of("0123456789", j + 1) != std::string::npos)
        return false;
    x1 = atoi(s.c_str());
    if (x1 < 1 || x1 > 31) return false;
    x2 = atoi(s.c_str() + i + 1);
    if (x2 < 1 || x2 > 31) return false;
    y = atoi(s.c_str() + j + 1);
    return true;
}
91 
// Upper bound on the day number for each month (index 0 is January).
//
// This is only used to decide whether an ambiguous aa/bb/cc date could be in
// a particular format, so February is simply allowed 29 days regardless of
// the year.  The most useful part of the check is that the month is <= 12.
static const char max_month_length[12] = {
    31, 29, 31, 30, 31, 30,
    31, 31, 30, 31, 30, 31
};

// Return true if day d is plausible for month m.
//
// A month of -1 (which decode_xxy() reports for an empty string) is always
// accepted.
static bool
vet_dm(int d, int m)
{
    if (m == -1) return true;
    const bool month_ok = (m >= 1 && m <= 12);
    // Short-circuiting keeps the table lookup in bounds.
    return month_ok && d >= 1 && d <= max_month_length[m - 1];
}
108 
// Return true if s has the form YYYY<sep>MM<sep>DD, where both separators are
// the same character and that character is one of '-', '.' or '/'.
//
// The length is checked here too, so the function is safe to call on a
// string of any length.
static bool
is_yyyy_mm_dd(const std::string &s)
{
    return (s.size() == 10 &&
            s.find_first_not_of("0123456789") == 4 &&
            s.find_first_not_of("0123456789", 5) == 7 &&
            s.find_first_not_of("0123456789", 8) == std::string::npos &&
            s[4] == s[7] &&
            (s[4] == '-' || s[4] == '.' || s[4] == '/'));
}
119 
// Store the decimal representation of v into p[0] .. p[w - 1].
//
// Exactly w characters are written and no terminator is added.  Values with
// fewer than w digits are left padded with '0'; values with more than w
// digits lose their most significant digits (i.e. v % pow(10, w) is shown).
static void
format_int_fixed_width(char * p, int v, int w)
{
    // Fill from the least significant digit backwards.
    for (int pos = w - 1; pos >= 0; --pos) {
        p[pos] = '0' + (v % 10);
        v /= 10;
    }
}
134 
135 static void
136 format_yyyymmdd(char * p, int y, int m, int d)
137 {
138  format_int_fixed_width(p, y, 4);
139  format_int_fixed_width(p + 4, m, 2);
140  format_int_fixed_width(p + 6, d, 2);
141 }
142 
144 DateValueRangeProcessor::operator()(string &begin, string &end)
145 {
146  if (StringValueRangeProcessor::operator()(begin, end) == BAD_VALUENO)
147  return BAD_VALUENO;
148 
149  if ((begin.size() == 8 || begin.size() == 0) &&
150  (end.size() == 8 || end.size() == 0) &&
151  begin.find_first_not_of("0123456789") == string::npos &&
152  end.find_first_not_of("0123456789") == string::npos) {
153  // YYYYMMDD
154  return valno;
155  }
156  if ((begin.size() == 10 || begin.size() == 0) &&
157  (end.size() == 10 || end.size() == 0)) {
158  if ((begin.empty() || is_yyyy_mm_dd(begin)) &&
159  (end.empty() || is_yyyy_mm_dd(end))) {
160  // YYYY-MM-DD
161  if (!begin.empty()) {
162  begin.erase(7, 1);
163  begin.erase(4, 1);
164  }
165  if (!end.empty()) {
166  end.erase(7, 1);
167  end.erase(4, 1);
168  }
169  return valno;
170  }
171  }
172 
173  int b_d, b_m, b_y;
174  int e_d, e_m, e_y;
175  if (!decode_xxy(begin, b_d, b_m, b_y) || !decode_xxy(end, e_d, e_m, e_y))
176  return Xapian::BAD_VALUENO;
177 
178  // Check that the month and day are within range. Also assume "start" <=
179  // "end" to help decide ambiguous cases.
180  if (!prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
181  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
182  // OK.
183  } else if (vet_dm(b_m, b_d) && vet_dm(e_m, e_d) &&
184  (b_y != e_y || b_d < e_d || (b_d == e_d && b_m <= e_m))) {
185  swap(b_m, b_d);
186  swap(e_m, e_d);
187  } else if (prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
188  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
189  // OK.
190  } else {
191  return Xapian::BAD_VALUENO;
192  }
193 
194  char buf[8];
195  if (!begin.empty()) {
196  if (b_y < 100) {
197  b_y += 1900;
198  if (b_y < epoch_year) b_y += 100;
199  }
200  format_yyyymmdd(buf, b_y, b_m, b_d);
201  begin.assign(buf, 8);
202  }
203  if (!end.empty()) {
204  if (e_y < 100) {
205  e_y += 1900;
206  if (e_y < epoch_year) e_y += 100;
207  }
208  format_yyyymmdd(buf, e_y, e_m, e_d);
209  end.assign(buf, 8);
210  }
211  return valno;
212 }
213 
215 NumberValueRangeProcessor::operator()(string &begin, string &end)
216 {
217  if (StringValueRangeProcessor::operator()(begin, end) == BAD_VALUENO)
218  return BAD_VALUENO;
219 
220  // Parse the numbers to floating point.
221  double beginnum;
222 
223  if (!begin.empty()) {
224  errno = 0;
225  const char * startptr = begin.c_str();
226  char * endptr;
227  beginnum = strtod(startptr, &endptr);
228  if (endptr != startptr + begin.size())
229  // Invalid characters in string
230  return Xapian::BAD_VALUENO;
231  if (errno)
232  // Overflow or underflow
233  return Xapian::BAD_VALUENO;
234  } else {
235  // Silence GCC warning.
236  beginnum = 0.0;
237  }
238 
239  if (!end.empty()) {
240  errno = 0;
241  const char * startptr = end.c_str();
242  char * endptr;
243  double endnum = strtod(startptr, &endptr);
244  if (endptr != startptr + end.size())
245  // Invalid characters in string
246  return Xapian::BAD_VALUENO;
247  if (errno)
248  // Overflow or underflow
249  return Xapian::BAD_VALUENO;
250  end.assign(Xapian::sortable_serialise(endnum));
251  }
252 
253  if (!begin.empty()) {
254  begin.assign(Xapian::sortable_serialise(beginnum));
255  }
256 
257  return valno;
258 }
259 
261 RangeProcessor::check_range(const string& b, const string& e)
262 {
263  if (str.empty())
264  return operator()(b, e);
265 
266  size_t off_b = 0, len_b = string::npos;
267  size_t off_e = 0, len_e = string::npos;
268 
269  bool prefix = !(flags & Xapian::RP_SUFFIX);
270  bool repeated = (flags & Xapian::RP_REPEATED);
271 
272  if (prefix) {
273  // If there's a prefix, require it on the start of the range.
274  if (!startswith(b, str)) {
275  // Prefix not given.
276  goto not_our_range;
277  }
278  off_b = str.size();
279  // Optionally allow it on the end of the range, e.g. $10..50
280  if (repeated && startswith(e, str)) {
281  off_e = off_b;
282  }
283  } else {
284  // If there's a suffix, require it on the end of the range.
285  if (!endswith(e, str)) {
286  // Suffix not given.
287  goto not_our_range;
288  }
289  len_e = e.size() - str.size();
290  // Optionally allow it on the start of the range, e.g. 10..50kg
291  if (repeated && endswith(b, str)) {
292  len_b = b.size() - str.size();
293  }
294  }
295 
296  return operator()(string(b, off_b, len_b), string(e, off_e, len_e));
297 
298 not_our_range:
300 }
301 
303 RangeProcessor::operator()(const string& b, const string& e)
304 {
305  if (e.empty())
306  return Xapian::Query(Xapian::Query::OP_VALUE_GE, slot, b);
307  return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, slot, b, e);
308 }
309 
311 DateRangeProcessor::operator()(const string& b, const string& e)
312 {
313  if ((b.size() == 8 || b.size() == 0) &&
314  (e.size() == 8 || e.size() == 0) &&
315  b.find_first_not_of("0123456789") == string::npos &&
316  e.find_first_not_of("0123456789") == string::npos) {
317  // YYYYMMDD
318  return RangeProcessor::operator()(b, e);
319  }
320  if ((b.size() == 10 || b.size() == 0) &&
321  (e.size() == 10 || e.size() == 0)) {
322  if ((b.empty() || is_yyyy_mm_dd(b)) &&
323  (e.empty() || is_yyyy_mm_dd(e))) {
324  string begin = b, end = e;
325  // YYYY-MM-DD
326  if (!begin.empty()) {
327  begin.erase(7, 1);
328  begin.erase(4, 1);
329  }
330  if (!end.empty()) {
331  end.erase(7, 1);
332  end.erase(4, 1);
333  }
334  return RangeProcessor::operator()(begin, end);
335  }
336  }
337 
338  bool prefer_mdy = (flags & Xapian::RP_DATE_PREFER_MDY);
339  int b_d, b_m, b_y;
340  int e_d, e_m, e_y;
341  if (!decode_xxy(b, b_d, b_m, b_y) || !decode_xxy(e, e_d, e_m, e_y))
342  goto not_our_range;
343 
344  // Check that the month and day are within range. Also assume "start" <=
345  // "e" to help decide ambiguous cases.
346  if (!prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
347  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
348  // OK.
349  } else if (vet_dm(b_m, b_d) && vet_dm(e_m, e_d) &&
350  (b_y != e_y || b_d < e_d || (b_d == e_d && b_m <= e_m))) {
351  swap(b_m, b_d);
352  swap(e_m, e_d);
353  } else if (prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
354  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
355  // OK.
356  } else {
357  goto not_our_range;
358  }
359 
360  {
361  char buf_b[8], buf_e[8];
362  size_t len_b = 0, len_e = 0;
363  if (!b.empty()) {
364  if (b_y < 100) {
365  b_y += 1900;
366  if (b_y < epoch_year) b_y += 100;
367  }
368  format_yyyymmdd(buf_b, b_y, b_m, b_d);
369  len_b = 8;
370  }
371  if (!e.empty()) {
372  if (e_y < 100) {
373  e_y += 1900;
374  if (e_y < epoch_year) e_y += 100;
375  }
376  format_yyyymmdd(buf_e, e_y, e_m, e_d);
377  len_e = 8;
378  }
379  return RangeProcessor::operator()(string(buf_b, len_b),
380  string(buf_e, len_e));
381  }
382 
383 not_our_range:
385 }
386 
388 NumberRangeProcessor::operator()(const string& b, const string& e)
389 {
390  // Parse the numbers to floating point.
391  double num_b, num_e;
392 
393  if (!b.empty()) {
394  errno = 0;
395  const char * startptr = b.c_str();
396  char * endptr;
397  num_b = strtod(startptr, &endptr);
398  if (endptr != startptr + b.size() || errno) {
399  // Invalid characters in string || overflow or underflow.
400  goto not_our_range;
401  }
402  } else {
403  // Silence GCC warning.
404  num_b = 0.0;
405  }
406 
407  if (!e.empty()) {
408  errno = 0;
409  const char * startptr = e.c_str();
410  char * endptr;
411  num_e = strtod(startptr, &endptr);
412  if (endptr != startptr + e.size() || errno) {
413  // Invalid characters in string || overflow or underflow.
414  goto not_our_range;
415  }
416  } else {
417  // Silence GCC warning.
418  num_e = 0.0;
419  }
420 
421  return RangeProcessor::operator()(
422  b.empty() ? b : Xapian::sortable_serialise(num_b),
423  e.empty() ? e : Xapian::sortable_serialise(num_e));
424 
425 not_our_range:
427 }
428 
429 }
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:80
bool endswith(const std::string &s, char sfx)
Definition: stringutils.h:70
static void format_yyyymmdd(char *p, int y, int m, int d)
static const char max_month_length[12]
static bool is_yyyy_mm_dd(const string &s)
static void format_int_fixed_width(char *p, int v, int w)
STL namespace.
std::string sortable_serialise(double value)
Convert a floating point number to a string, preserving sort order.
Definition: queryparser.h:1320
Match only documents where a value slot is >= a given value.
Definition: query.h:214
Match only documents where a value slot is within a given range.
Definition: query.h:158
string str(int value)
Convert int to std::string.
Definition: str.cc:90
bool startswith(const std::string &s, char pfx)
Definition: stringutils.h:46
Construct an invalid query.
Definition: query.h:254
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
Various handy helpers which std::string really should provide.
static bool vet_dm(int d, int m)
Class representing a query.
Definition: query.h:46
const valueno BAD_VALUENO
Reserved value to indicate "no valueno".
Definition: types.h:125
static bool decode_xxy(const string &s, int &x1, int &x2, int &y)
parsing a user query string to build a Xapian::Query object