xapian-core  1.4.29
stemtest.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2007,2008,2009,2012,2015,2025 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <cstdlib>
27 
28 #include <string>
29 #include <iostream>
30 
31 #include <zlib.h>
32 
33 #include <xapian.h>
34 #include "testsuite.h"
35 
36 using namespace std;
37 
38 static const int JUNKSIZE = 2 * 1048576;
39 
40 static string language;
41 
43 
44 static string srcdir;
45 
46 static int seed;
47 
48 // run stemmers on random text
49 static void
51 {
52  static const char wordchars[] =
53  "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
54 
55  tout << "Stemming random text... (seed " << seed << ")\n";
56  srand(seed);
57 
58  string word;
59  int stemmed_size = 0;
60  for (int c = JUNKSIZE; c; --c) {
61  char ch = wordchars[(rand() >> 8) % sizeof wordchars];
62  if (ch) {
63  word += ch;
64  continue;
65  }
66  stemmed_size += stemmer(word).length();
67  word.resize(0);
68  }
69  stemmed_size += stemmer(word).length();
70  tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
71  << '\n';
72 
73  if (stemmed_size > JUNKSIZE * 101 / 100) {
74  FAIL_TEST("Stemmed data is significantly bigger than input: "
75  << stemmed_size << " vs. " << JUNKSIZE);
76  }
77  if (stemmed_size < JUNKSIZE / 2) {
78  FAIL_TEST("Stemmed data is significantly smaller than input: "
79  << stemmed_size << " vs. " << JUNKSIZE);
80  }
81 }
82 
83 // run stemmers on random junk
84 static void
86 {
87  tout << "Stemming random junk... (seed " << seed << ")\n";
88  srand(seed);
89 
90  string word;
91  int stemmed_size = 0;
92  for (int c = JUNKSIZE; c; --c) {
93  char ch = char(rand() >> 8);
94  if (ch) {
95  word += ch;
96  continue;
97  }
98  stemmed_size += stemmer(word).length();
99  word.resize(0);
100  }
101  stemmed_size += stemmer(word).length();
102  tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
103  << '\n';
104 
105  if (stemmed_size > JUNKSIZE * 101 / 100) {
106  FAIL_TEST("Stemmed data is significantly bigger than input ("
107  << stemmed_size << " vs. " << JUNKSIZE);
108  }
109  if (stemmed_size < JUNKSIZE / 2) {
110  FAIL_TEST("Stemmed data is significantly smaller than input ("
111  << stemmed_size << " vs. " << JUNKSIZE);
112  }
113 }
114 
115 static void
117 {
118  string dir = srcdir + "/../../xapian-data/stemming/";
119 
120  gzFile voc = gzopen((dir + language + "/voc.txt").c_str(), "rb");
121  if (!voc) {
122  voc = gzopen((dir + language + "/voc.txt.gz").c_str(), "rb");
123  if (!voc) {
124  SKIP_TEST(language << "/voc.txt not found");
125  }
126  }
127 
128  gzFile st = gzopen((dir + language + "/output.txt").c_str(), "rb");
129  if (!st) {
130  st = gzopen((dir + language + "/output.txt.gz").c_str(), "rb");
131  if (!st) {
132  gzclose(voc);
133  FAIL_TEST(language << "/output.txt not found");
134  }
135  }
136 
137  tout << "Testing " << language << " with Snowball dictionary...\n";
138 
139  int pass = 1;
140  string word, expect;
141  while (true) {
142  while (!gzeof(voc) && !gzeof(st)) {
143  word.clear();
144  while (true) {
145  int ch = gzgetc(voc);
146  if (ch == EOF || ch == '\n') break;
147  word += ch;
148  }
149 
150  expect.clear();
151  while (true) {
152  int ch = gzgetc(st);
153  if (ch == EOF || ch == '\n') break;
154  expect += ch;
155  }
156 
157  string stem = stemmer(word);
158 
159  TEST_EQUAL(stem, expect);
160  }
161  gzclose(voc);
162  gzclose(st);
163 
164  if (pass == 2) break;
165 
166  voc = gzopen((dir + language + "/voc2.txt").c_str(), "rb");
167  if (!voc) break;
168 
169  st = gzopen((dir + language + "/output2.txt").c_str(), "rb");
170  if (!st) {
171  gzclose(voc);
172  FAIL_TEST(language << "/output2.txt not found");
173  }
174  tout << "Testing " << language << " with supplemental dictionary...\n";
175  ++pass;
176  }
177 }
178 
179 // ##################################################################
180 // # End of actual tests #
181 // ##################################################################
182 
184 static const test_desc tests[] = {
185  {"stemrandom", test_stemrandom},
186  {"stemjunk", test_stemjunk},
187  {"stemdict", test_stemdict},
188  {0, 0}
189 };
190 
191 int main(int argc, char **argv)
192 try {
193  string langs = Xapian::Stem::get_available_languages();
194  test_driver::add_command_line_option("languages", 'l', &langs);
195 
196  seed = 42;
197  string seed_str;
198  test_driver::add_command_line_option("seed", 's', &seed_str);
199 
202  int result = 0;
203 
204  if (!seed_str.empty()) seed = atoi(seed_str.c_str());
205  cout << "The random seed is " << seed << '\n';
206  cout << "Please report the seed when reporting a test failure.\n";
207 
208  string::size_type b = 0;
209  while (b != langs.size()) {
210  string::size_type a = b;
211  while (b < langs.size() && langs[b] != ' ') ++b;
212  language.assign(langs, a, b - a);
213  while (b < langs.size() && langs[b] == ' ') ++b;
214  cout << "Running tests with " << language << " stemmer...\n";
215  stemmer = Xapian::Stem(language);
216  result = max(result, test_driver::run(tests));
217  }
218  return result;
219 } catch (const char * e) {
220  cout << e << '\n';
221  return 1;
222 }
static const int JUNKSIZE
Definition: stemtest.cc:38
static void test_stemrandom()
Definition: stemtest.cc:50
static const test_desc tests[]
The list of tests to perform.
Definition: stemtest.cc:184
static void parse_command_line(int argc, char **argv)
Parse the command line arguments.
Definition: testsuite.cc:828
Class representing a stemming algorithm.
Definition: stem.h:62
static void test_stemjunk()
Definition: stemtest.cc:85
a generic test suite engine
STL namespace.
static Xapian::Stem stemmer
Definition: stemtest.cc:42
static std::string get_srcdir()
Read srcdir from environment and if not present, make a valiant attempt to guess a value...
Definition: testsuite.cc:129
static void add_command_line_option(const std::string &l, char s, std::string *arg)
Add a test-specific command line option.
Definition: testsuite.cc:817
static void test_stemdict()
Definition: stemtest.cc:116
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:104
Public interfaces for the Xapian library.
static string srcdir
Definition: stemtest.cc:44
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:185
int main(int argc, char **argv)
Definition: stemtest.cc:191
static int seed
Definition: stemtest.cc:46
#define FAIL_TEST(MSG)
Fail the current testcase with message MSG.
Definition: testsuite.h:68
#define SKIP_TEST(MSG)
Skip the current testcase with message MSG.
Definition: testsuite.h:74
static int run(const test_desc *tests)
Definition: testsuite.cc:913
static string language
Definition: stemtest.cc:40
Structure holding a description of a test.
Definition: testsuite.h:77
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278