xapian-core  2.0.0
editdistance.cc
Go to the documentation of this file.
1 
12 /* Copyright (C) 2003 Richard Boulton
13  * Copyright (C) 2007,2008,2009,2017,2019,2020 Olly Betts
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2 of the License, or
18  * (at your option) any later version.
19  *
20  * This program is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23  * GNU General Public License for more details.
24  *
25  * You should have received a copy of the GNU General Public License
26  * along with this program; if not, see
27  * <https://www.gnu.org/licenses/>.
28  */
29 
30 #include <config.h>
31 
32 #include "editdistance.h"
33 
34 #include "omassert.h"
35 #include "popcount.h"
36 
37 #include <algorithm>
38 #include <climits>
39 #include <cstdlib>
40 #include <cstring>
41 
42 using namespace std;
43 
44 template<class Char>
45 struct edist_seq {
46  edist_seq(const Char* ptr_, int len_) : ptr(ptr_), len(len_) { }
47  const Char* ptr;
48  int len;
49 };
50 
51 template<class Char>
52 class edist_state {
54  edist_state& operator=(const edist_state&) = delete;
55 
57  edist_state(const edist_state&) = delete;
58 
61 
62  /* Array of f(k,p) values, where f(k,p) = the largest index i such that
63  * d(i,j) = p and d(i,j) is on diagonal k.
64  * ie: f(k,p) = largest i s.t. d(i,k+i) = p
65  * Where: d(i,j) = edit distance between substrings of length i and j.
66  */
67  int* fkp;
68  int fkp_rows;
69 
70  /* Maximum possible edit distance (this is referred to as ZERO_K in
71  * the algorithm description by Berghel and Roach). */
72  int maxdist;
73 
74  int calc_index(int k, int p) const {
75  return k + maxdist + fkp_rows * (p + 1);
76  }
77 
78  public:
79  edist_state(const Char* ptr1, int len1, const Char* ptr2, int len2,
80  int* fkp_)
81  : seq1(ptr1, len1), seq2(ptr2, len2), fkp(fkp_), maxdist(len2) {
82  Assert(len2 >= len1);
83  // fkp is stored as a rectangular array, column by column. Each entry
84  // represents a value of p, from -1 to maxdist or a special value
85  // close-ish to INT_MIN.
86  fkp_rows = 2 * maxdist + 1;
87  // It's significantly faster to memset() than std::fill_n() with an int
88  // value, so fill with the msb of INT_MIN, which for 32-bit 2's
89  // complement int means -2139062144 instead of -2147483648, which is
90  // fine what we need here.
91  memset(fkp, unsigned(INT_MIN) >> (8 * (sizeof(int) - 1)),
92  sizeof(int) * (calc_index(maxdist, maxdist - 2) + 1));
93  set_f_kp(0, -1, -1);
94  for (int k = 1; k <= maxdist; ++k) {
95  set_f_kp(k, k - 1, -1);
96  set_f_kp(-k, k - 1, k - 1);
97  }
98  }
99 
100  int get_f_kp(int k, int p) const {
101  return fkp[calc_index(k, p)];
102  }
103 
104  void set_f_kp(int k, int p, int val) {
105  fkp[calc_index(k, p)] = val;
106  }
107 
108  bool is_transposed(int pos1, int pos2) const {
109  if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len)
110  return false;
111  return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
112  seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
113  }
114 
115  void edist_calc_f_kp(int k, int p);
116 };
117 
118 template<class Char>
120 {
121  int maxlen = get_f_kp(k, p - 1) + 1; /* dist if do substitute */
122  int maxlen2 = get_f_kp(k - 1, p - 1); /* dist if do insert */
123  int maxlen3 = get_f_kp(k + 1, p - 1) + 1; /* dist if delete */
124 
125  if (is_transposed(maxlen, maxlen + k)) {
126  // Transposition.
127  ++maxlen;
128  }
129 
130  if (maxlen >= maxlen2) {
131  if (maxlen >= maxlen3) {
132  // Transposition or Substitution.
133  } else {
134  // Deletion.
135  maxlen = maxlen3;
136  }
137  } else {
138  if (maxlen2 >= maxlen3) {
139  // Insertion.
140  maxlen = maxlen2;
141  } else {
142  // Deletion.
143  maxlen = maxlen3;
144  }
145  }
146 
147  /* Check for exact matches, and increase the length until we don't have
148  * one. */
149  while (maxlen < seq1.len &&
150  maxlen + k < seq2.len &&
151  seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
152  ++maxlen;
153  }
154  set_f_kp(k, p, maxlen);
155 }
156 
157 template<class Char>
158 static int
159 seqcmp_editdist(const Char* ptr1, int len1, const Char* ptr2, int len2,
160  int* fkp_, int max_distance)
161 {
162  int lendiff = len2 - len1;
163  /* Make sure second sequence is longer (or same length). */
164  if (lendiff < 0) {
165  lendiff = -lendiff;
166  swap(ptr1, ptr2);
167  swap(len1, len2);
168  }
169 
170  /* Special case for if one or both sequences are empty. */
171  if (len1 == 0) return len2;
172 
173  edist_state<Char> state(ptr1, len1, ptr2, len2, fkp_);
174 
175  int p = lendiff; /* This is the minimum possible edit distance. */
176  while (p <= max_distance) {
177  for (int temp_p = 0; temp_p != p; ++temp_p) {
178  int inc = p - temp_p;
179  if (abs(lendiff - inc) <= temp_p) {
180  state.edist_calc_f_kp(lendiff - inc, temp_p);
181  }
182  if (abs(lendiff + inc) <= temp_p) {
183  state.edist_calc_f_kp(lendiff + inc, temp_p);
184  }
185  }
186  state.edist_calc_f_kp(lendiff, p);
187 
188  if (state.get_f_kp(lendiff, p) == len1) break;
189  ++p;
190  }
191 
192  return p;
193 }
194 
195 int
196 EditDistanceCalculator::calc(const unsigned* ptr, int len,
197  int max_distance) const
198 {
199  // Calculate a cheap lower bound on the edit distance by considering
200  // frequency histograms.
201  freqs_bitmap freqs = 0;
202  freqs_bitmap freqs2 = 0;
203  for (int i = 0; i != len; ++i) {
204  unsigned ch = ptr[i];
205  auto bit = freqs_bitmap(1) << (ch & FREQS_MASK);
206  freqs2 |= (freqs & bit);
207  freqs |= bit;
208  }
209  // Each insertion or deletion adds at most 1 to total. Each transposition
210  // doesn't change it at all. But each substitution can change it by 2 so
211  // we need to divide it by 2. We round up since the unpaired change must
212  // be due to an actual edit.
213  unsigned bits = 1;
214  add_popcount(bits, freqs ^ target_freqs);
215  add_popcount(bits, freqs2 ^ target_freqs2);
216  int ed_lower_bound = bits / 2;
217  if (ed_lower_bound > max_distance) {
218  // It's OK to return any distance > max_distance if the true answer is
219  // > max_distance.
220  return ed_lower_bound;
221  }
222 
223  if (!array) {
224  // Allocate space for the largest case we need to consider, which is
225  // when the second sequence is len + max_distance long. Any second
226  // sequence which is longer must be more than max_distance edits
227  // away.
228  int maxdist = target.size() + max_distance;
229  int max_cols = maxdist * 2;
230  int max_rows = maxdist * 2 + 1;
231  array = new int[max_rows * max_cols];
232  }
233 
234  return seqcmp_editdist<unsigned>(ptr, len, &target[0], target.size(),
235  array, max_distance);
236 }
int calc(const unsigned *ptr, int len, int max_distance) const
Calculate edit distance.
unsigned long long freqs_bitmap
The type to use for the occurrence bitmaps.
Definition: editdistance.h:74
edist_state & operator=(const edist_state &)=delete
Don't allow assignment.
bool is_transposed(int pos1, int pos2) const
edist_seq< Char > seq1
Definition: editdistance.cc:59
void edist_calc_f_kp(int k, int p)
int get_f_kp(int k, int p) const
edist_seq< Char > seq2
Definition: editdistance.cc:60
int calc_index(int k, int p) const
Definition: editdistance.cc:74
void set_f_kp(int k, int p, int val)
edist_state(const edist_state &)=delete
Don't allow copying.
edist_state(const Char *ptr1, int len1, const Char *ptr2, int len2, int *fkp_)
Definition: editdistance.cc:79
PositionList * p
static int seqcmp_editdist(const Char *ptr1, int len1, const Char *ptr2, int len2, int *fkp_, int max_distance)
Edit distance calculation algorithm.
Various assertion macros.
#define Assert(COND)
Definition: omassert.h:122
Count the number of set bits in an integer type.
static void add_popcount(A &accumulator, V value)
Add the number of set bits in value to accumulator.
Definition: popcount.h:39
edist_seq(const Char *ptr_, int len_)
Definition: editdistance.cc:46
const Char * ptr
Definition: editdistance.cc:47