xapian-core  2.0.0
valuerangeproc.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2026 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see
18  * <https://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/queryparser.h>
24 
25 #include <cerrno>
26 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
27 # include <charconv>
28 #else
29 # include <cstdlib> // For strtod().
30 #endif
31 
32 #include <string>
33 #include "stringutils.h"
34 
35 using namespace std;
36 
37 namespace Xapian {
38 
39 static bool
40 decode_xxy(const string & s, int & x1, int &x2, int &y)
41 {
42  if (s.size() == 0) {
43  x1 = x2 = y = -1;
44  return true;
45  }
46  if (s.size() < 5 || s.size() > 10) return false;
47  const char* p = s.c_str();
48  if (!C_isdigit(*p)) return false;
49  x1 = *p++ - '0';
50  if (C_isdigit(*p)) {
51  x1 = x1 * 10 + (*p++ - '0');
52  }
53  if (x1 < 1 || x1 > 31) return false;
54  char sep = *p++;
55  if (sep != '/' && sep != '-' && sep != '.') return false;
56  if (!C_isdigit(*p)) return false;
57  x2 = *p++ - '0';
58  if (C_isdigit(*p)) {
59  x2 = x2 * 10 + (*p++ - '0');
60  }
61  if (x2 < 1 || x2 > 31) return false;
62  if (*p++ != sep) return false;
63  if (s.size() - (p - s.c_str()) > 4) return false;
64  y = *p++ - '0';
65  while (C_isdigit(*p)) {
66  y = y * 10 + (*p++ - '0');
67  }
68  return size_t(p - s.c_str()) == s.size();
69 }
70 
// Upper bound on the day number for each month, starting with January.
//
// This is only used to decide whether an ambiguous aa/bb/cc date could be in
// a particular format, so February is simply allowed 29 days whatever the
// year.  The most useful part of the check is that the month field is <= 12,
// so just checking the day is <= 31 would really be enough.
static const char max_month_length[12] = {
    31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
};

// Check whether day d and month m are plausible as a date.
//
// A month of -1 is always accepted without looking at d.
static bool
vet_dm(int d, int m)
{
    if (m == -1) return true;
    const bool month_ok = (m >= 1 && m <= 12);
    // Only index the table once the month is known to be in range.
    return month_ok && d >= 1 && d <= max_month_length[m - 1];
}
87 
// Test if s has the form YYYY<sep>MM<sep>DD where <sep> is '-', '.' or '/'
// (the same character in both positions).
//
// NB The caller must already have checked that s is 10 characters long.
static bool
is_yyyy_mm_dd(const string &s)
{
    static const char digits[] = "0123456789";
    // The first non-digit has to be the separator after a 4 digit year.
    if (s.find_first_not_of(digits) != 4) return false;
    const char sep = s[4];
    if (sep != '-' && sep != '.' && sep != '/') return false;
    // Then a 2 digit month followed by the same separator again.
    if (s.find_first_not_of(digits, 5) != 7) return false;
    if (s[7] != sep) return false;
    // Everything which remains must be digits.
    return s.find_first_not_of(digits, 8) == string::npos;
}
98 
// Store the decimal representation of v as exactly w characters starting
// at p (no terminating nul is written).
//
// Values with fewer than w digits are left padded with zeros.
//
// Values with more than w digits lose their most significant digits, i.e.
// what's written is v % pow(10, w).
static void
format_int_fixed_width(char * p, int v, int w)
{
    // Fill in from the least significant digit backwards.
    for (int pos = w - 1; pos >= 0; --pos) {
        const int digit = v % 10;
        v /= 10;
        p[pos] = '0' + digit;
    }
}
113 
114 static void
115 format_yyyymmdd(char * p, int y, int m, int d)
116 {
117  format_int_fixed_width(p, y, 4);
118  format_int_fixed_width(p + 4, m, 2);
119  format_int_fixed_width(p + 6, d, 2);
120 }
121 
123 RangeProcessor::check_range(const string& b, const string& e)
124 {
125  if (str.empty())
126  return operator()(b, e);
127 
128  size_t off_b = 0, len_b = string::npos;
129  size_t off_e = 0, len_e = string::npos;
130 
131  bool prefix = !(flags & Xapian::RP_SUFFIX);
132  bool repeated = (flags & Xapian::RP_REPEATED);
133 
134  if (repeated && prefix && b.empty()) {
135  // Handle empty start and prefix on end, e.g.: ..$20
136  if (!startswith(e, str)) {
137  goto not_our_range;
138  }
139  off_e = str.size();
140  } else if (repeated && !prefix && e.empty()) {
141  // Handle empty end and suffix on start, e.g.: 20kg..
142  if (!endswith(b, str)) {
143  goto not_our_range;
144  }
145  len_b = b.size() - str.size();
146  } else if (prefix) {
147  // If there's a prefix, require it on the start of the range.
148  if (!startswith(b, str)) {
149  // Prefix not given.
150  goto not_our_range;
151  }
152  off_b = str.size();
153  // Optionally allow it on the end of the range, e.g. $10..50
154  if (repeated && startswith(e, str)) {
155  off_e = off_b;
156  }
157  } else {
158  // If there's a suffix, require it on the end of the range.
159  if (!endswith(e, str)) {
160  // Suffix not given.
161  goto not_our_range;
162  }
163  len_e = e.size() - str.size();
164  // Optionally allow it on the start of the range, e.g. 10..50kg
165  if (repeated && endswith(b, str)) {
166  len_b = b.size() - str.size();
167  }
168  }
169 
170  return operator()(string(b, off_b, len_b), string(e, off_e, len_e));
171 
172 not_our_range:
174 }
175 
177 RangeProcessor::operator()(const string& b, const string& e)
178 {
179  if (e.empty())
180  return Xapian::Query(Xapian::Query::OP_VALUE_GE, slot, b);
181  return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, slot, b, e);
182 }
183 
185 DateRangeProcessor::operator()(const string& b, const string& e)
186 {
187  if ((b.size() == 8 || b.size() == 0) &&
188  (e.size() == 8 || e.size() == 0) &&
189  b.find_first_not_of("0123456789") == string::npos &&
190  e.find_first_not_of("0123456789") == string::npos) {
191  // YYYYMMDD
192  return RangeProcessor::operator()(b, e);
193  }
194  if ((b.size() == 10 || b.size() == 0) &&
195  (e.size() == 10 || e.size() == 0)) {
196  if ((b.empty() || is_yyyy_mm_dd(b)) &&
197  (e.empty() || is_yyyy_mm_dd(e))) {
198  string begin = b, end = e;
199  // YYYY-MM-DD
200  if (!begin.empty()) {
201  begin.erase(7, 1);
202  begin.erase(4, 1);
203  }
204  if (!end.empty()) {
205  end.erase(7, 1);
206  end.erase(4, 1);
207  }
208  return RangeProcessor::operator()(begin, end);
209  }
210  }
211 
212  bool prefer_mdy = (flags & Xapian::RP_DATE_PREFER_MDY);
213  int b_d, b_m, b_y;
214  int e_d, e_m, e_y;
215  if (!decode_xxy(b, b_d, b_m, b_y) || !decode_xxy(e, e_d, e_m, e_y))
216  goto not_our_range;
217 
218  // Check that the month and day are within range. Also assume "start" <=
219  // "e" to help decide ambiguous cases.
220  if (!prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
221  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
222  // OK.
223  } else if (vet_dm(b_m, b_d) && vet_dm(e_m, e_d) &&
224  (b_y != e_y || b_d < e_d || (b_d == e_d && b_m <= e_m))) {
225  swap(b_m, b_d);
226  swap(e_m, e_d);
227  } else if (prefer_mdy && vet_dm(b_d, b_m) && vet_dm(e_d, e_m) &&
228  (b_y != e_y || b_m < e_m || (b_m == e_m && b_d <= e_d))) {
229  // OK.
230  } else {
231  goto not_our_range;
232  }
233 
234  {
235  char buf_b[8], buf_e[8];
236  size_t len_b = 0, len_e = 0;
237  if (!b.empty()) {
238  if (b_y < 100) {
239  b_y += 1900;
240  if (b_y < epoch_year) b_y += 100;
241  }
242  format_yyyymmdd(buf_b, b_y, b_m, b_d);
243  len_b = 8;
244  }
245  if (!e.empty()) {
246  if (e_y < 100) {
247  e_y += 1900;
248  if (e_y < epoch_year) e_y += 100;
249  }
250  format_yyyymmdd(buf_e, e_y, e_m, e_d);
251  len_e = 8;
252  }
253  return RangeProcessor::operator()(string(buf_b, len_b),
254  string(buf_e, len_e));
255  }
256 
257 not_our_range:
259 }
260 
262 NumberRangeProcessor::operator()(const string& b, const string& e)
263 {
264  // Parse the numbers to floating point.
265  double num_b, num_e;
266 
267  if (!b.empty()) {
268 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
269  const char* startptr = b.data();
270  const char* endptr = startptr + b.size();
271  const auto& r = from_chars(startptr, endptr, num_b);
272  if (r.ec != std::errc() || r.ptr != endptr) {
273  // Invalid characters in string || overflow or underflow.
274  goto not_our_range;
275  }
276 #else
277  errno = 0;
278  const char * startptr = b.c_str();
279  char * endptr;
280  num_b = strtod(startptr, &endptr);
281  if (endptr != startptr + b.size() || errno) {
282  // Invalid characters in string || overflow or underflow.
283  goto not_our_range;
284  }
285 #endif
286  } else {
287  // Silence GCC warning.
288  num_b = 0.0;
289  }
290 
291  if (!e.empty()) {
292 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
293  const char* startptr = e.data();
294  const char* endptr = startptr + e.size();
295  const auto& r = from_chars(startptr, endptr, num_e);
296  if (r.ec != std::errc() || r.ptr != endptr) {
297  // Invalid characters in string || overflow or underflow.
298  goto not_our_range;
299  }
300 #else
301  errno = 0;
302  const char * startptr = e.c_str();
303  char * endptr;
304  num_e = strtod(startptr, &endptr);
305  if (endptr != startptr + e.size() || errno) {
306  // Invalid characters in string || overflow or underflow.
307  goto not_our_range;
308  }
309 #endif
310  } else {
311  // Silence GCC warning.
312  num_e = 0.0;
313  }
314 
315  return RangeProcessor::operator()(
316  b.empty() ? b : Xapian::sortable_serialise(num_b),
317  e.empty() ? e : Xapian::sortable_serialise(num_e));
318 
319 not_our_range:
321 }
322 
323 static const char byte_units[4][2] = {
324  "B", "K", "M", "G"
325 };
326 
327 // Return factor for byte unit
328 // if string is a valid byte unit
329 // else return -1
330 static double
331 check_byte_unit(const string &s) {
332  double factor = 1;
333  for (int i = 0; i < 4; ++i) {
334  if (endswith(s, byte_units[i])) {
335  return factor;
336  }
337  factor *= 1024;
338  }
339 
340  return -1;
341 }
342 
344 UnitRangeProcessor::operator()(const string& b, const string& e)
345 {
346  // Parse the numbers to floating point.
347  double num_b, num_e;
348 
349  // True if b has unit, e.g. 20K..
350  bool b_has_unit = false;
351 
352  if (!b.empty()) {
353 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
354  const char* startptr = b.data();
355  const char* endptr = startptr + b.size();
356  const auto& r = from_chars(startptr, endptr, num_b);
357  if (r.ec != std::errc()) {
358  // Invalid characters in string || overflow or underflow.
359  goto not_our_range;
360  }
361  endptr = r.ptr;
362 #else
363  errno = 0;
364  const char * startptr = b.c_str();
365  char * endptr;
366  num_b = strtod(startptr, &endptr);
367 
368  if (errno) {
369  // overflow or underflow
370  goto not_our_range;
371  }
372 #endif
373 
374  // For lower range having a unit, e.g. 100K..
375  if (endptr == startptr + b.size() - 1) {
376  double factor_b = check_byte_unit(b);
377  if (factor_b == -1) {
378  // Not a valid byte unit
379  goto not_our_range;
380  }
381  b_has_unit = true;
382  num_b *= factor_b;
383  }
384  } else {
385  // Silence GCC warning.
386  num_b = 0.0;
387  }
388 
389  if (!e.empty()) {
390 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
391  const char* startptr = e.data();
392  const char* endptr = startptr + e.size();
393  const auto& r = from_chars(startptr, endptr, num_e);
394  if (r.ec != std::errc()) {
395  // Invalid characters in string || overflow or underflow.
396  goto not_our_range;
397  }
398  endptr = r.ptr;
399 #else
400  errno = 0;
401  const char * startptr = e.c_str();
402  char * endptr;
403  num_e = strtod(startptr, &endptr);
404 
405  if (errno) {
406  // overflow or underflow
407  goto not_our_range;
408  }
409 #endif
410 
411  // For upper range having a unit, e.g. ..100K
412  if (endptr == startptr + e.size() - 1) {
413  double factor_e = check_byte_unit(e);
414  if (factor_e == -1) {
415  // Not a valid byte unit
416  goto not_our_range;
417  }
418  num_e *= factor_e;
419 
420  // When lower range is not empty and
421  // only upper range unit, e.g. 20..100K
422  if (!b.empty() && !b_has_unit) {
423  num_b *= factor_e;
424  }
425  } else {
426  // When lower range has no unit
427  goto not_our_range;
428  }
429  } else {
430  // Silence GCC warning.
431  num_e = 0.0;
432 
433  // Fail case when lower range
434  // has no unit, e.g. 200..
435  if (!b.empty() && !b_has_unit) {
436  goto not_our_range;
437  }
438  }
439 
440  return RangeProcessor::operator()(
441  b.empty() ? b : Xapian::sortable_serialise(num_b),
442  e.empty() ? e : Xapian::sortable_serialise(num_e));
443 
444 not_our_range:
446 }
447 
448 }
Class representing a query.
Definition: query.h:45
@ OP_VALUE_RANGE
Match only documents where a value slot is within a given range.
Definition: query.h:158
@ OP_VALUE_GE
Match only documents where a value slot is >= a given value.
Definition: query.h:223
@ OP_INVALID
Construct an invalid query.
Definition: query.h:277
PositionList * p
string str(int value)
Convert int to std::string.
Definition: str.cc:91
The Xapian namespace contains public interfaces for the Xapian library.
Definition: compactor.cc:82
std::string sortable_serialise(double value)
Convert a floating point number to a string, preserving sort order.
Definition: queryparser.h:1229
static const char byte_units[4][2]
static bool decode_xxy(const string &s, int &x1, int &x2, int &y)
static double check_byte_unit(const string &s)
@ RP_DATE_PREFER_MDY
Definition: queryparser.h:135
@ RP_REPEATED
Definition: queryparser.h:134
@ RP_SUFFIX
Definition: queryparser.h:133
static bool vet_dm(int d, int m)
static const char max_month_length[12]
static void format_int_fixed_width(char *p, int v, int w)
static void format_yyyymmdd(char *p, int y, int m, int d)
static bool is_yyyy_mm_dd(const string &s)
parsing a user query string to build a Xapian::Query object
Various handy string-related helpers.
bool endswith(std::string_view s, char sfx)
Definition: stringutils.h:80
bool startswith(std::string_view s, char pfx)
Definition: stringutils.h:56
bool C_isdigit(char ch)
Definition: stringutils.h:182