xapian-core  1.4.26
api_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2023 Olly Betts
5  * Copyright (C) 2007 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "api_spelling.h"
25 
26 #include <xapian.h>
27 
28 #include "apitest.h"
29 #include "testsuite.h"
30 #include "testutils.h"
31 
32 #include <string>
33 
34 using namespace std;
35 
36 // Test add_spelling() and remove_spelling(), which remote dbs support.
37 DEFINE_TESTCASE(spell0, (spelling || remote) && writable) {
39 
40  db.add_spelling("hello");
41  db.add_spelling("cell", 2);
42  db.commit();
43  db.add_spelling("zig");
44  db.add_spelling("ch");
45  db.add_spelling("hello", 2);
46  db.remove_spelling("hello", 2);
47  db.remove_spelling("cell", 6);
48  db.commit();
49  db.remove_spelling("hello");
50  db.remove_spelling("nonsuch");
51  db.remove_spelling("zzzzzzzzz", 1000000);
52  db.remove_spelling("aarvark");
53  db.remove_spelling("hello");
54  db.commit();
55  db.remove_spelling("hello");
56 }
57 
58 // Test basic spelling correction features.
59 DEFINE_TESTCASE(spell1, spelling && writable) {
61 
62  // Check that the more frequent term is chosen.
63  db.add_spelling("hello");
64  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
65  db.add_spelling("cell", 2);
66  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
67  db.commit();
69  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
70  TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "cell");
71 
72  // Check suggestions for single edit errors to "zig".
73  db.add_spelling("zig");
74  // Transpositions:
75  TEST_EQUAL(db.get_spelling_suggestion("izg"), "zig");
76  TEST_EQUAL(db.get_spelling_suggestion("zgi"), "zig");
77  // Substitutions:
78  TEST_EQUAL(db.get_spelling_suggestion("sig"), "zig");
79  TEST_EQUAL(db.get_spelling_suggestion("zog"), "zig");
80  TEST_EQUAL(db.get_spelling_suggestion("zif"), "zig");
81  // Deletions:
82  TEST_EQUAL(db.get_spelling_suggestion("ig"), "zig");
83  TEST_EQUAL(db.get_spelling_suggestion("zg"), "zig");
84  TEST_EQUAL(db.get_spelling_suggestion("zi"), "zig");
85  // Insertions:
86  TEST_EQUAL(db.get_spelling_suggestion("azig"), "zig");
87  TEST_EQUAL(db.get_spelling_suggestion("zaig"), "zig");
88  TEST_EQUAL(db.get_spelling_suggestion("ziag"), "zig");
89  TEST_EQUAL(db.get_spelling_suggestion("ziga"), "zig");
90 
91  // Check suggestions for single edit errors to "ch".
92  db.add_spelling("ch");
93  // Transpositions:
94  TEST_EQUAL(db.get_spelling_suggestion("hc"), "ch");
95  // Substitutions - we don't handle these for two character words:
96  TEST_EQUAL(db.get_spelling_suggestion("qh"), "");
97  TEST_EQUAL(db.get_spelling_suggestion("cq"), "");
98  // Deletions would leave a single character, and we don't handle those.
100  TEST_EQUAL(db.get_spelling_suggestion("h"), "");
101  // Insertions:
102  TEST_EQUAL(db.get_spelling_suggestion("qch"), "ch");
103  TEST_EQUAL(db.get_spelling_suggestion("cqh"), "ch");
104  TEST_EQUAL(db.get_spelling_suggestion("chq"), "ch");
105 
106  // Check assorted cases:
107  TEST_EQUAL(db.get_spelling_suggestion("shello"), "hello");
108  TEST_EQUAL(db.get_spelling_suggestion("hellot"), "hello");
109  TEST_EQUAL(db.get_spelling_suggestion("acell"), "cell");
110  TEST_EQUAL(db.get_spelling_suggestion("cella"), "cell");
111  TEST_EQUAL(db.get_spelling_suggestion("acella"), "cell");
112  TEST_EQUAL(db.get_spelling_suggestion("helo"), "hello");
113  TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
114  TEST_EQUAL(db.get_spelling_suggestion("helol"), "hello");
115  TEST_EQUAL(db.get_spelling_suggestion("clel"), "cell");
116  TEST_EQUAL(db.get_spelling_suggestion("ecll"), "cell");
117  TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
118 
119  // Check that edit distance 3 isn't found by default:
120  TEST_EQUAL(db.get_spelling_suggestion("shelolx"), "");
121  TEST_EQUAL(db.get_spelling_suggestion("celling"), "");
122  TEST_EQUAL(db.get_spelling_suggestion("dellin"), "");
123 
124  // Check that edit distance 3 is found if specified:
125  TEST_EQUAL(db.get_spelling_suggestion("shelolx", 3), "hello");
126  TEST_EQUAL(db.get_spelling_suggestion("celling", 3), "cell");
127  TEST_EQUAL(db.get_spelling_suggestion("dellin", 3), "cell");
128 
129  // Make "hello" more frequent than "cell" (3 vs 2).
130  db.add_spelling("hello", 2);
131  TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
132  db.commit();
133  TEST_EQUAL(db.get_spelling_suggestion("cello"), "hello");
134  db.remove_spelling("hello", 2);
135  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
136  // Test "over-removing".
137  db.remove_spelling("cell", 6);
138  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
139  db.commit();
140  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
141  db.remove_spelling("hello");
142  TEST_EQUAL(db.get_spelling_suggestion("cell"), "");
143 
144  // Test removing words not in the table.
145  db.remove_spelling("nonsuch");
146  db.remove_spelling("zzzzzzzzz", 1000000);
147  db.remove_spelling("aarvark");
148 
149  // Try removing word which was present but no longer is.
150  db.remove_spelling("hello");
151  db.commit();
152  db.remove_spelling("hello");
153 }
154 
155 // Test spelling correction for Unicode.
156 DEFINE_TESTCASE(spell2, spelling && writable) {
158 
159  // Check that a UTF-8 sequence counts as a single character.
160  db.add_spelling("h\xc3\xb6hle");
161  db.add_spelling("ascii");
162  TEST_EQUAL(db.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
163  TEST_EQUAL(db.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
164  TEST_EQUAL(db.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
165  TEST_EQUAL(db.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
166  TEST_EQUAL(db.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
167  TEST_EQUAL(db.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
168  db.commit();
170  TEST_EQUAL(dbr.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
171  TEST_EQUAL(dbr.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
172  TEST_EQUAL(dbr.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
173  TEST_EQUAL(dbr.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
174  TEST_EQUAL(dbr.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
175  TEST_EQUAL(dbr.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
176 }
177 
178 // Test spelling correction with multi databases
179 DEFINE_TESTCASE(spell3, spelling) {
180  Xapian::Database db1 = get_database("spell3a",
181  [](Xapian::WritableDatabase& wdb,
182  const string&) {
183  wdb.add_spelling("hello");
184  wdb.add_spelling("cell", 2);
185  });
186  Xapian::Database db2 = get_database("spell3b",
187  [](Xapian::WritableDatabase& wdb,
188  const string&) {
189  wdb.add_spelling("hello", 2);
190  wdb.add_spelling("helo");
191  });
192 
193  Xapian::Database db;
194  db.add_database(db1);
195  db.add_database(db2);
196 
197  TEST_EQUAL(db.get_spelling_suggestion("hello"), "");
198  TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
199  TEST_EQUAL(db1.get_spelling_suggestion("hell"), "cell");
200  TEST_EQUAL(db2.get_spelling_suggestion("hell"), "hello");
201 
202  // Test spelling iterator
204  TEST_EQUAL(*i, "cell");
205  TEST_EQUAL(i.get_termfreq(), 2);
206  ++i;
207  TEST_EQUAL(*i, "hello");
208  TEST_EQUAL(i.get_termfreq(), 1);
209  ++i;
210  TEST(i == db1.spellings_end());
211 
212  i = db2.spellings_begin();
213  TEST_EQUAL(*i, "hello");
214  TEST_EQUAL(i.get_termfreq(), 2);
215  ++i;
216  TEST_EQUAL(*i, "helo");
217  TEST_EQUAL(i.get_termfreq(), 1);
218  ++i;
219  TEST(i == db2.spellings_end());
220 
221  i = db.spellings_begin();
222  TEST_EQUAL(*i, "cell");
223  TEST_EQUAL(i.get_termfreq(), 2);
224  ++i;
225  TEST_EQUAL(*i, "hello");
226  TEST_EQUAL(i.get_termfreq(), 3);
227  ++i;
228  TEST_EQUAL(*i, "helo");
229  TEST_EQUAL(i.get_termfreq(), 1);
230  ++i;
231  TEST(i == db.spellings_end());
232 
233  // Regression test for TermIterator::skip_to() bug fixed in 1.4.19.
234  i = db.spellings_begin();
235  i.skip_to("helo");
236  TEST(i != db.spellings_end());
237  TEST_EQUAL(*i, "helo");
238  TEST_EQUAL(i.get_termfreq(), 1);
239  i.skip_to("help");
240  TEST(i == db.spellings_end());
241 }
242 
243 // Regression test - check that appending works correctly.
244 DEFINE_TESTCASE(spell4, spelling) {
245  Xapian::Database db = get_database("spell4",
246  [](Xapian::WritableDatabase& wdb,
247  const string&) {
248  wdb.add_spelling("check");
249  wdb.add_spelling("pecks", 2);
250  wdb.commit();
251  wdb.add_spelling("becky");
252  });
253 
254  TEST_EQUAL(db.get_spelling_suggestion("jeck", 2), "pecks");
255 }
256 
257 // Regression test - used to segfault with some input values.
258 DEFINE_TESTCASE(spell5, spelling) {
259  // Using constexpr instead of a macro fails with MSVC - it's not visible
260  // inside the lambda, and we can't explicitly capture it or else the lambda
261  // can't be passed as a function pointer.
262 #define TARGET "\xe4\xb8\x80\xe4\xba\x9b"
263  Xapian::Database db = get_database("spell5",
264  [](Xapian::WritableDatabase& wdb,
265  const string&) {
266  wdb.add_spelling(TARGET);
267  });
268 
269  string s = db.get_spelling_suggestion("\xe4\xb8\x8d", 3);
270  TEST_EQUAL(s, TARGET);
271 #undef TARGET
272 }
273 
274 // Test basic spelling correction features.
275 DEFINE_TESTCASE(spell6, spelling && writable) {
277 
278  // Check that the more frequent term is chosen.
279  db.add_spelling("hello", 2);
280  db.add_spelling("sell", 3);
281  TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
282  db.commit();
284  TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
285  TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "sell");
286 }
287 
288 // Test suggestions when there's an exact match.
289 DEFINE_TESTCASE(spell7, spelling) {
290  Xapian::Database db = get_database("spell7",
291  [](Xapian::WritableDatabase& wdb,
292  const string&) {
293  wdb.add_spelling("word", 57);
294  wdb.add_spelling("wrod", 3);
295  wdb.add_spelling("sword", 56);
296  wdb.add_spelling("words", 57);
297  wdb.add_spelling("ward", 58);
298  });
299 
300  // Check that the more frequent term is chosen.
301  TEST_EQUAL(db.get_spelling_suggestion("ward"), "");
302  TEST_EQUAL(db.get_spelling_suggestion("words"), "word");
303  TEST_EQUAL(db.get_spelling_suggestion("sword"), "word");
304  TEST_EQUAL(db.get_spelling_suggestion("wrod"), "word");
305 }
306 
308 DEFINE_TESTCASE(spell8, spelling) {
309  Xapian::Database db = get_database("spell8",
310  [](Xapian::WritableDatabase& wdb,
311  const string&) {
312  wdb.add_spelling("skinking", 2);
313  wdb.add_spelling("stinking", 1);
314  });
315 
316  // kin and kin used to cancel out in "skinking".
317  TEST_EQUAL(db.get_spelling_suggestion("scimkin", 3), "skinking");
318 }
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
This class is used to access a database, or a group of databases.
Definition: database.h:68
void remove_spelling(const std::string &word, Xapian::termcount freqdec=1) const
Remove a word from the spelling dictionary.
Definition: omdatabase.cc:1015
Xapian::WritableDatabase get_writable_database(const string &dbname)
Definition: apitest.cc:87
a generic test suite engine
STL namespace.
test functionality of the Xapian API
Class for iterating over a list of terms.
Definition: termiterator.h:41
This class provides read/write access to a database.
Definition: database.h:789
Xapian::TermIterator spellings_begin() const
An iterator which returns all the spelling correction targets.
Definition: omdatabase.cc:704
Public interfaces for the Xapian library.
void commit()
Commit any pending modifications made to the database.
Definition: omdatabase.cc:857
Xapian::Database get_writable_database_as_database()
Definition: apitest.cc:119
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
std::string get_spelling_suggestion(const std::string &word, unsigned max_edit_distance=2) const
Suggest a spelling correction.
Definition: omdatabase.cc:594
Xapian::Database get_database(const string &dbname)
Definition: apitest.cc:48
DEFINE_TESTCASE(spell0,(spelling||remote) &&writable)
Definition: api_spelling.cc:37
Xapian-specific test helper functions and macros.
#define TARGET
Xapian::TermIterator spellings_end() const
Corresponding end iterator to spellings_begin().
Definition: database.h:436
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
void add_spelling(const std::string &word, Xapian::termcount freqinc=1) const
Add a word to the spelling dictionary.
Definition: omdatabase.cc:1004