xapian-core  2.0.0
editdistance.h
Go to the documentation of this file.
1 
4 /* Copyright (C) 2003 Richard Boulton
5  * Copyright (C) 2007,2008,2017,2019,2024 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see
19  * <https://www.gnu.org/licenses/>.
20  */
21 
22 #ifndef XAPIAN_INCLUDED_EDITDISTANCE_H
23 #define XAPIAN_INCLUDED_EDITDISTANCE_H
24 
25 #include <cstdlib>
26 #include <climits>
27 #include <vector>
28 
29 #include "omassert.h"
30 #include "xapian/unicode.h"
31 
46 
49 
51  std::vector<unsigned> target;
52 
53  size_t target_bytes;
54 
56  mutable std::vector<unsigned> utf32;
57 
58  mutable int* array = nullptr;
59 
74  typedef unsigned long long freqs_bitmap;
75 
87 
100 
101  static constexpr unsigned FREQS_MASK = sizeof(freqs_bitmap) * 8 - 1;
102 
107  int calc(const unsigned* ptr, int len, int max_distance) const;
108 
109  public:
114  explicit
115  EditDistanceCalculator(std::string_view target_)
116  : target_bytes(target_.size()) {
117  using Xapian::Utf8Iterator;
118  for (Utf8Iterator it(target_); it != Utf8Iterator(); ++it) {
119  unsigned ch = *it;
120  target.push_back(ch);
121  auto bit = freqs_bitmap(1) << (ch & FREQS_MASK);
122  target_freqs2 |= (target_freqs & bit);
123  target_freqs |= bit;
124  }
125  }
126 
128  delete [] array;
129  }
130 
145  int operator()(const std::string& candidate, int max_distance) const {
146  // We have the UTF-32 target in target.
147  size_t target_utf32_len = target.size();
148 
149  // We can quickly rule out some candidates just by comparing
150  // lengths since each edit can change the number of UTF-32 characters
151  // by at most 1. But first we check the encoded UTF-8 length of the
152  // candidate since we can do that without having to convert it to
153  // UTF-32.
154 
155  // Each Unicode codepoint is 1-4 bytes in UTF-8 and one word in UTF-32,
156  // so the number of UTF-32 characters in candidate must be <= the number
157  // of bytes of UTF-8.
158  if (target_utf32_len > candidate.size() + max_distance) {
159  // Candidate too short.
160  return INT_MAX;
161  }
162 
163  // Each edit can change the number of UTF-8 bytes by up to 4 (addition
164  // or deletion of any character which needs 4 bytes in UTF-8), which
165  // gives us an alternative lower bound (which is sometimes tighter and
166  // sometimes not) and the tightest upper bound.
167  if (target_bytes > candidate.size() + 4 * max_distance) {
168  // Candidate too short.
169  return INT_MAX;
170  }
171  if (target_bytes + 4 * max_distance < candidate.size()) {
172  // Candidate too long.
173  return INT_MAX;
174  }
175 
176  // Now convert to UTF-32.
177  utf32.assign(Xapian::Utf8Iterator(candidate), Xapian::Utf8Iterator());
178 
179  // Check a cheap length-based lower bound based on UTF-32 lengths.
180  int lb = std::abs(int(utf32.size()) - int(target_utf32_len));
181  if (lb > max_distance) {
182  return lb;
183  }
184 
185  // Actually calculate the edit distance.
186  return calc(&utf32[0], utf32.size(), max_distance);
187  }
188 };
189 
190 #endif // XAPIAN_INCLUDED_EDITDISTANCE_H
Calculate edit distances to a target string.
Definition: editdistance.h:43
std::vector< unsigned > target
Target in UTF-32.
Definition: editdistance.h:51
EditDistanceCalculator & operator=(const EditDistanceCalculator &)=delete
Don't allow assignment.
int operator()(const std::string &candidate, int max_distance) const
Calculate edit distance for a sequence.
Definition: editdistance.h:145
freqs_bitmap target_freqs
Occurrence bitmap for target sequence.
Definition: editdistance.h:86
std::vector< unsigned > utf32
Current candidate in UTF-32.
Definition: editdistance.h:56
int calc(const unsigned *ptr, int len, int max_distance) const
Calculate edit distance.
unsigned long long freqs_bitmap
The type to use for the occurrence bitmaps.
Definition: editdistance.h:74
EditDistanceCalculator(std::string_view target_)
Constructor.
Definition: editdistance.h:115
static constexpr unsigned FREQS_MASK
Definition: editdistance.h:101
freqs_bitmap target_freqs2
Second occurrence bitmap for target sequence.
Definition: editdistance.h:99
EditDistanceCalculator(const EditDistanceCalculator &)=delete
Don't allow copying.
An iterator which returns Unicode character values from a UTF-8 encoded string.
Definition: unicode.h:39
Various assertion macros.
Unicode and UTF-8 related classes and functions.