xapian-core  1.4.21
api_spelling.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2007-2021 Olly Betts
5  * Copyright (C) 2007 Lemur Consulting Ltd
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <config.h>
23 
24 #include "api_spelling.h"
25 
26 #include <xapian.h>
27 
28 #include "apitest.h"
29 #include "testsuite.h"
30 #include "testutils.h"
31 
32 #include <string>
33 
34 using namespace std;
35 
36 // Test add_spelling() and remove_spelling(), which remote dbs support.
37 DEFINE_TESTCASE(spell0, spelling || remote) {
39 
40  db.add_spelling("hello");
41  db.add_spelling("cell", 2);
42  db.commit();
43  db.add_spelling("zig");
44  db.add_spelling("ch");
45  db.add_spelling("hello", 2);
46  db.remove_spelling("hello", 2);
47  db.remove_spelling("cell", 6);
48  db.commit();
49  db.remove_spelling("hello");
50  db.remove_spelling("nonsuch");
51  db.remove_spelling("zzzzzzzzz", 1000000);
52  db.remove_spelling("aarvark");
53  db.remove_spelling("hello");
54  db.commit();
55  db.remove_spelling("hello");
56 }
57 
58 // Test basic spelling correction features.
59 DEFINE_TESTCASE(spell1, spelling) {
61 
62  // Check that the more frequent term is chosen.
63  db.add_spelling("hello");
64  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
65  db.add_spelling("cell", 2);
66  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
67  db.commit();
69  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
70  TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "cell");
71 
72  // Check suggestions for single edit errors to "zig".
73  db.add_spelling("zig");
74  // Transpositions:
75  TEST_EQUAL(db.get_spelling_suggestion("izg"), "zig");
76  TEST_EQUAL(db.get_spelling_suggestion("zgi"), "zig");
77  // Substitutions:
78  TEST_EQUAL(db.get_spelling_suggestion("sig"), "zig");
79  TEST_EQUAL(db.get_spelling_suggestion("zog"), "zig");
80  TEST_EQUAL(db.get_spelling_suggestion("zif"), "zig");
81  // Deletions:
82  TEST_EQUAL(db.get_spelling_suggestion("ig"), "zig");
83  TEST_EQUAL(db.get_spelling_suggestion("zg"), "zig");
84  TEST_EQUAL(db.get_spelling_suggestion("zi"), "zig");
85  // Insertions:
86  TEST_EQUAL(db.get_spelling_suggestion("azig"), "zig");
87  TEST_EQUAL(db.get_spelling_suggestion("zaig"), "zig");
88  TEST_EQUAL(db.get_spelling_suggestion("ziag"), "zig");
89  TEST_EQUAL(db.get_spelling_suggestion("ziga"), "zig");
90 
91  // Check suggestions for single edit errors to "ch".
92  db.add_spelling("ch");
93  // Transpositions:
94  TEST_EQUAL(db.get_spelling_suggestion("hc"), "ch");
95  // Substitutions - we don't handle these for two character words:
96  TEST_EQUAL(db.get_spelling_suggestion("qh"), "");
97  TEST_EQUAL(db.get_spelling_suggestion("cq"), "");
98  // Deletions would leave a single character, and we don't handle those.
100  TEST_EQUAL(db.get_spelling_suggestion("h"), "");
101  // Insertions:
102  TEST_EQUAL(db.get_spelling_suggestion("qch"), "ch");
103  TEST_EQUAL(db.get_spelling_suggestion("cqh"), "ch");
104  TEST_EQUAL(db.get_spelling_suggestion("chq"), "ch");
105 
106  // Check assorted cases:
107  TEST_EQUAL(db.get_spelling_suggestion("shello"), "hello");
108  TEST_EQUAL(db.get_spelling_suggestion("hellot"), "hello");
109  TEST_EQUAL(db.get_spelling_suggestion("acell"), "cell");
110  TEST_EQUAL(db.get_spelling_suggestion("cella"), "cell");
111  TEST_EQUAL(db.get_spelling_suggestion("acella"), "cell");
112  TEST_EQUAL(db.get_spelling_suggestion("helo"), "hello");
113  TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
114  TEST_EQUAL(db.get_spelling_suggestion("helol"), "hello");
115  TEST_EQUAL(db.get_spelling_suggestion("clel"), "cell");
116  TEST_EQUAL(db.get_spelling_suggestion("ecll"), "cell");
117  TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
118 
119  // Check that edit distance 3 isn't found by default:
120  TEST_EQUAL(db.get_spelling_suggestion("shelolx"), "");
121  TEST_EQUAL(db.get_spelling_suggestion("celling"), "");
122  TEST_EQUAL(db.get_spelling_suggestion("dellin"), "");
123 
124  // Check that edit distance 3 is found if specified:
125  TEST_EQUAL(db.get_spelling_suggestion("shelolx", 3), "hello");
126  TEST_EQUAL(db.get_spelling_suggestion("celling", 3), "cell");
127  TEST_EQUAL(db.get_spelling_suggestion("dellin", 3), "cell");
128 
129  // Make "hello" more frequent than "cell" (3 vs 2).
130  db.add_spelling("hello", 2);
131  TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
132  db.commit();
133  TEST_EQUAL(db.get_spelling_suggestion("cello"), "hello");
134  db.remove_spelling("hello", 2);
135  TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
136  // Test "over-removing".
137  db.remove_spelling("cell", 6);
138  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
139  db.commit();
140  TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
141  db.remove_spelling("hello");
142  TEST_EQUAL(db.get_spelling_suggestion("cell"), "");
143 
144  // Test removing words not in the table.
145  db.remove_spelling("nonsuch");
146  db.remove_spelling("zzzzzzzzz", 1000000);
147  db.remove_spelling("aarvark");
148 
149  // Try removing word which was present but no longer is.
150  db.remove_spelling("hello");
151  db.commit();
152  db.remove_spelling("hello");
153 }
154 
155 // Test spelling correction for Unicode.
156 DEFINE_TESTCASE(spell2, spelling) {
158 
159  // Check that a UTF-8 sequence counts as a single character.
160  db.add_spelling("h\xc3\xb6hle");
161  db.add_spelling("ascii");
162  TEST_EQUAL(db.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
163  TEST_EQUAL(db.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
164  TEST_EQUAL(db.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
165  TEST_EQUAL(db.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
166  TEST_EQUAL(db.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
167  TEST_EQUAL(db.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
168  db.commit();
170  TEST_EQUAL(dbr.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
171  TEST_EQUAL(dbr.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
172  TEST_EQUAL(dbr.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
173  TEST_EQUAL(dbr.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
174  TEST_EQUAL(dbr.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
175  TEST_EQUAL(dbr.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
176 }
177 
178 // Test spelling correction with multi databases
179 DEFINE_TESTCASE(spell3, spelling) {
181  // We can't just call get_writable_database() since it would delete db1
182  // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
183  // changes to db1 are committed.
185 
186  db1.add_spelling("hello");
187  db1.add_spelling("cell", 2);
188  db2.add_spelling("hello", 2);
189  db2.add_spelling("helo");
190 
191  Xapian::Database db;
192  db.add_database(db1);
193  db.add_database(db2);
194 
195  TEST_EQUAL(db.get_spelling_suggestion("hello"), "");
196  TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
197  TEST_EQUAL(db1.get_spelling_suggestion("hell"), "cell");
198  TEST_EQUAL(db2.get_spelling_suggestion("hell"), "hello");
199 
200  // Test spelling iterator
202  TEST_EQUAL(*i, "cell");
203  TEST_EQUAL(i.get_termfreq(), 2);
204  ++i;
205  TEST_EQUAL(*i, "hello");
206  TEST_EQUAL(i.get_termfreq(), 1);
207  ++i;
208  TEST(i == db1.spellings_end());
209 
210  i = db2.spellings_begin();
211  TEST_EQUAL(*i, "hello");
212  TEST_EQUAL(i.get_termfreq(), 2);
213  ++i;
214  TEST_EQUAL(*i, "helo");
215  TEST_EQUAL(i.get_termfreq(), 1);
216  ++i;
217  TEST(i == db2.spellings_end());
218 
219  i = db.spellings_begin();
220  TEST_EQUAL(*i, "cell");
221  TEST_EQUAL(i.get_termfreq(), 2);
222  ++i;
223  TEST_EQUAL(*i, "hello");
224  TEST_EQUAL(i.get_termfreq(), 3);
225  ++i;
226  TEST_EQUAL(*i, "helo");
227  TEST_EQUAL(i.get_termfreq(), 1);
228  ++i;
229  TEST(i == db.spellings_end());
230 
231  // Regression test for TermIterator::skip_to() bug fixed in 1.4.19.
232  i = db.spellings_begin();
233  i.skip_to("helo");
234  TEST(i != db.spellings_end());
235  TEST_EQUAL(*i, "helo");
236  TEST_EQUAL(i.get_termfreq(), 1);
237  i.skip_to("help");
238  TEST(i == db.spellings_end());
239 }
240 
241 // Regression test - check that appending works correctly.
242 DEFINE_TESTCASE(spell4, spelling) {
244 
245  db.add_spelling("check");
246  db.add_spelling("pecks", 2);
247  db.commit();
248  db.add_spelling("becky");
249  db.commit();
250 
251  TEST_EQUAL(db.get_spelling_suggestion("jeck", 2), "pecks");
252 }
253 
254 // Regression test - used to segfault with some input values.
255 DEFINE_TESTCASE(spell5, spelling) {
256  const char * target = "\xe4\xb8\x80\xe4\xba\x9b";
257 
259  db.add_spelling(target);
260  db.commit();
261 
262  string s = db.get_spelling_suggestion("\xe4\xb8\x8d", 3);
263  TEST_EQUAL(s, target);
264 }
265 
266 // Test basic spelling correction features.
267 DEFINE_TESTCASE(spell6, spelling) {
269 
270  // Check that the more frequent term is chosen.
271  db.add_spelling("hello", 2);
272  db.add_spelling("sell", 3);
273  TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
274  db.commit();
276  TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
277  TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "sell");
278 }
279 
280 // Test suggestions when there's an exact match.
281 DEFINE_TESTCASE(spell7, spelling) {
283 
284  // Check that the more frequent term is chosen.
285  db.add_spelling("word", 57);
286  db.add_spelling("wrod", 3);
287  db.add_spelling("sword", 56);
288  db.add_spelling("words", 57);
289  db.add_spelling("ward", 58);
290  db.commit();
291  TEST_EQUAL(db.get_spelling_suggestion("ward"), "");
292  TEST_EQUAL(db.get_spelling_suggestion("words"), "word");
293  TEST_EQUAL(db.get_spelling_suggestion("sword"), "word");
294  TEST_EQUAL(db.get_spelling_suggestion("wrod"), "word");
295 }
296 
298 DEFINE_TESTCASE(spell8, spelling) {
300 
301  // kin and kin used to cancel out in "skinking".
302  db.add_spelling("skinking", 2);
303  db.add_spelling("stinking", 1);
304  db.commit();
305  TEST_EQUAL(db.get_spelling_suggestion("scimkin", 3), "skinking");
306 }
#define TEST(a)
Test a condition, without an additional explanation for failure.
Definition: testsuite.h:275
This class is used to access a database, or a group of databases.
Definition: database.h:68
void remove_spelling(const std::string &word, Xapian::termcount freqdec=1) const
Remove a word from the spelling dictionary.
Definition: omdatabase.cc:1015
Xapian::WritableDatabase get_writable_database(const string &dbname)
Definition: apitest.cc:87
a generic test suite engine
void skip_to(const std::string &term)
Advance the iterator to term term.
STL namespace.
DEFINE_TESTCASE(spell0, spelling||remote)
Definition: api_spelling.cc:37
Xapian::WritableDatabase get_named_writable_database(const std::string &name, const std::string &source)
Definition: apitest.cc:93
test functionality of the Xapian API
Class for iterating over a list of terms.
Definition: termiterator.h:41
This class provides read/write access to a database.
Definition: database.h:785
Xapian::TermIterator spellings_begin() const
An iterator which returns all the spelling correction targets.
Definition: omdatabase.cc:704
Public interfaces for the Xapian library.
void commit()
Commit any pending modifications made to the database.
Definition: omdatabase.cc:857
Xapian::Database get_writable_database_as_database()
Definition: apitest.cc:119
void add_database(const Database &database)
Add an existing database (or group of databases) to those accessed by this object.
Definition: omdatabase.cc:148
std::string get_spelling_suggestion(const std::string &word, unsigned max_edit_distance=2) const
Suggest a spelling correction.
Definition: omdatabase.cc:594
Xapian-specific test helper functions and macros.
Xapian::TermIterator spellings_end() const
Corresponding end iterator to spellings_begin().
Definition: database.h:432
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278
void add_spelling(const std::string &word, Xapian::termcount freqinc=1) const
Add a word to the spelling dictionary.
Definition: omdatabase.cc:1004