xapian-core  1.4.26
editdistance.cc
Go to the documentation of this file.
1 
12 /* Copyright (C) 2003 Richard Boulton
13  * Copyright (C) 2007,2008,2009 Olly Betts
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2 of the License, or
18  * (at your option) any later version.
19  *
20  * This program is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23  * GNU General Public License for more details.
24  *
25  * You should have received a copy of the GNU General Public License
26  * along with this program; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28  */
29 
30 #include <config.h>
31 
32 #include "editdistance.h"
33 
34 #include "omassert.h"
35 
36 #include <algorithm>
37 #include <cstdlib>
38 
39 using namespace std;
40 
41 template<class CHR>
42 struct edist_seq {
43  edist_seq(const CHR * ptr_, int len_) : ptr(ptr_), len(len_) { }
44  const CHR * ptr;
45  int len;
46 };
47 
48 template<class CHR>
49 class edist_state {
51  void operator=(const edist_state &);
52 
54  edist_state(const edist_state &);
55 
58 
59  /* Array of f(k,p) values, where f(k,p) = the largest index i such that
60  * d(i,j) = p and d(i,j) is on diagonal k.
61  * ie: f(k,p) = largest i s.t. d(i,k+i) = p
62  * Where: d(i,j) = edit distance between substrings of length i and j.
63  */
64  int * fkp;
65  int fkp_cols;
66 
67  /* Maximum possible edit distance (this is referred to as ZERO_K in
68  * the algorithm description by Berghel and Roach). */
69  int maxdist;
70 
71  int calc_index(int k, int p) const {
72  return (k + maxdist) * fkp_cols + p + 1;
73  }
74 
75  public:
76 
77  edist_state(const CHR * ptr1, int len1, const CHR * ptr2, int len2);
78 
79  ~edist_state();
80 
81  int get_f_kp(int k, int p) const {
82  return fkp[calc_index(k, p)];
83  }
84 
85  void set_f_kp(int k, int p, int val) {
86  fkp[calc_index(k, p)] = val;
87  }
88 
89  bool is_transposed(int pos1, int pos2) const {
90  if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len) return false;
91  return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
92  seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
93  }
94 
95  void edist_calc_f_kp(int k, int p);
96 };
97 
98 template<class CHR>
100 {
101  int maxlen = get_f_kp(k, p - 1) + 1; /* dist if do substitute */
102  int maxlen2 = get_f_kp(k - 1, p - 1); /* dist if do insert */
103  int maxlen3 = get_f_kp(k + 1, p - 1) + 1; /* dist if delete */
104 
105  if (is_transposed(maxlen, maxlen + k)) {
106  // Transposition.
107  ++maxlen;
108  }
109 
110  if (maxlen >= maxlen2) {
111  if (maxlen >= maxlen3) {
112  // Transposition or Substitution.
113  } else {
114  // Deletion.
115  maxlen = maxlen3;
116  }
117  } else {
118  if (maxlen2 >= maxlen3) {
119  // Insertion.
120  maxlen = maxlen2;
121  } else {
122  // Deletion.
123  maxlen = maxlen3;
124  }
125  }
126 
127  /* Check for exact matches, and increase the length until we don't have
128  * one. */
129  while (maxlen < seq1.len &&
130  maxlen + k < seq2.len &&
131  seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
132  ++maxlen;
133  }
134  set_f_kp(k, p, maxlen);
135 }
136 
137 #define INF 1000000
138 template<class CHR>
139 edist_state<CHR>::edist_state(const CHR * ptr1, int len1,
140  const CHR * ptr2, int len2)
141  : seq1(ptr1, len1), seq2(ptr2, len2), maxdist(len2)
142 {
143  Assert(len2 >= len1);
144  /* Each row represents a value of k, from -maxdist to maxdist. */
145  int fkp_rows = maxdist * 2 + 1;
146  /* Each column represents a value of p, from -1 to maxdist. */
147  fkp_cols = maxdist + 2;
148  /* fkp is stored as a rectangular array, row by row. */
149  fkp = new int[fkp_rows * fkp_cols];
150 
151  set_f_kp(0, -1, -1);
152  for (int k = 1; k <= maxdist; ++k) {
153  for (int p = -1; p < k - 1; ++p) {
154  set_f_kp(k, p, -INF);
155  set_f_kp(-k, p, -INF);
156  }
157  set_f_kp(k, k - 1, -1);
158  set_f_kp(-k, k - 1, k - 1);
159  }
160 }
161 
162 template<class CHR>
164  delete [] fkp;
165 }
166 
167 template<class CHR>
168 static int
169 seqcmp_editdist(const CHR * ptr1, int len1, const CHR * ptr2, int len2,
170  int max_distance)
171 {
172  int lendiff = len2 - len1;
173  /* Make sure second sequence is longer (or same length). */
174  if (lendiff < 0) {
175  lendiff = -lendiff;
176  swap(ptr1, ptr2);
177  swap(len1, len2);
178  }
179 
180  /* Special case for if one or both sequences are empty. */
181  if (len1 == 0) return len2;
182 
183  edist_state<CHR> state(ptr1, len1, ptr2, len2);
184 
185  int p = lendiff; /* This is the minimum possible edit distance. */
186  while (p <= max_distance) {
187  for (int temp_p = 0; temp_p != p; ++temp_p) {
188  int inc = p - temp_p;
189  if (abs(lendiff - inc) <= temp_p) {
190  state.edist_calc_f_kp(lendiff - inc, temp_p);
191  }
192  if (abs(lendiff + inc) <= temp_p) {
193  state.edist_calc_f_kp(lendiff + inc, temp_p);
194  }
195  }
196  state.edist_calc_f_kp(lendiff, p);
197 
198  if (state.get_f_kp(lendiff, p) == len1) break;
199  ++p;
200  }
201 
202  return p;
203 }
204 
205 int
206 edit_distance_unsigned(const unsigned * ptr1, int len1,
207  const unsigned * ptr2, int len2,
208  int max_distance)
209 {
210  return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
211 }
#define Assert(COND)
Definition: omassert.h:122
int calc_index(int k, int p) const
Definition: editdistance.cc:71
#define INF
STL namespace.
bool is_transposed(int pos1, int pos2) const
Definition: editdistance.cc:89
const CHR * ptr
Definition: editdistance.cc:44
Edit distance calculation algorithm.
static int seqcmp_editdist(const CHR *ptr1, int len1, const CHR *ptr2, int len2, int max_distance)
int get_f_kp(int k, int p) const
Definition: editdistance.cc:81
edist_seq< CHR > seq1
Definition: editdistance.cc:56
void edist_calc_f_kp(int k, int p)
Definition: editdistance.cc:99
void set_f_kp(int k, int p, int val)
Definition: editdistance.cc:85
edist_state(const edist_state &)
Don&#39;t allow copying.
edist_seq(const CHR *ptr_, int len_)
Definition: editdistance.cc:43
Various assertion macros.
int edit_distance_unsigned(const unsigned *ptr1, int len1, const unsigned *ptr2, int len2, int max_distance)
Calculate the edit distance between two sequences.
edist_seq< CHR > seq2
Definition: editdistance.cc:57