xapian-core  1.4.18
stemtest.cc
Go to the documentation of this file.
1 
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2007,2008,2009,2012,2015 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <cstdlib>
27 
28 #include <string>
29 #include <fstream>
30 #include <iostream>
31 
32 #include <xapian.h>
33 #include "testsuite.h"
34 
35 using namespace std;
36 
37 static const int JUNKSIZE = 2 * 1048576;
38 
39 static string language;
40 
42 
43 static string srcdir;
44 
45 static int seed;
46 
47 // run stemmers on random text
48 static void
50 {
51  static const char wordchars[] =
52  "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
53 
54  tout << "Stemming random text... (seed " << seed << ")" << endl;
55  srand(seed);
56 
57  string word;
58  int stemmed_size = 0;
59  for (int c = JUNKSIZE; c; --c) {
60  char ch = wordchars[(rand() >> 8) % sizeof wordchars];
61  if (ch) {
62  word += ch;
63  continue;
64  }
65  stemmed_size += stemmer(word).length();
66  word.resize(0);
67  }
68  stemmed_size += stemmer(word).length();
69  tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
70  << endl;
71 
72  if (stemmed_size > JUNKSIZE * 101 / 100) {
73  FAIL_TEST("Stemmed data is significantly bigger than input: "
74  << stemmed_size << " vs. " << JUNKSIZE);
75  }
76  if (stemmed_size < JUNKSIZE / 2) {
77  FAIL_TEST("Stemmed data is significantly smaller than input: "
78  << stemmed_size << " vs. " << JUNKSIZE);
79  }
80 }
81 
82 // run stemmers on random junk
83 static void
85 {
86  tout << "Stemming random junk... (seed " << seed << ")" << endl;
87  srand(seed);
88 
89  string word;
90  int stemmed_size = 0;
91  for (int c = JUNKSIZE; c; --c) {
92  char ch = char(rand() >> 8);
93  if (ch) {
94  word += ch;
95  continue;
96  }
97  stemmed_size += stemmer(word).length();
98  word.resize(0);
99  }
100  stemmed_size += stemmer(word).length();
101  tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
102  << endl;
103 
104  if (stemmed_size > JUNKSIZE * 101 / 100) {
105  FAIL_TEST("Stemmed data is significantly bigger than input ("
106  << stemmed_size << " vs. " << JUNKSIZE);
107  }
108  if (stemmed_size < JUNKSIZE / 2) {
109  FAIL_TEST("Stemmed data is significantly smaller than input ("
110  << stemmed_size << " vs. " << JUNKSIZE);
111  }
112 }
113 
114 static void
116 {
117  string dir = srcdir + "/../../xapian-data/stemming/";
118 
119  ifstream voc((dir + language + "/voc.txt").c_str());
120  if (!voc.is_open()) {
121  SKIP_TEST(language << "/voc.txt not found");
122  }
123 
124  ifstream st((dir + language + "/output.txt").c_str());
125  if (!st.is_open()) {
126  voc.close();
127  FAIL_TEST(language << "/output.txt not found");
128  }
129 
130  tout << "Testing " << language << " with Snowball dictionary..." << endl;
131 
132  int pass = 1;
133  while (true) {
134  string word, stem, expect;
135  while (!voc.eof() && !st.eof()) {
136  getline(voc, word);
137  getline(st, expect);
138 
139  stem = stemmer(word);
140 
141  TEST_EQUAL(stem, expect);
142  }
143  voc.close();
144  st.close();
145 
146  if (pass == 2) break;
147 
148  voc.open((dir + language + "/voc2.txt").c_str());
149  if (!voc.is_open()) break;
150 
151  st.open((dir + language + "/output2.txt").c_str());
152  if (!st.is_open()) {
153  voc.close();
154  FAIL_TEST(language << "/output2.txt not found");
155  }
156  tout << "Testing " << language << " with supplemental dictionary..."
157  << endl;
158  ++pass;
159  }
160 }
161 
162 // ##################################################################
163 // # End of actual tests #
164 // ##################################################################
165 
167 static const test_desc tests[] = {
168  {"stemrandom", test_stemrandom},
169  {"stemjunk", test_stemjunk},
170  {"stemdict", test_stemdict},
171  {0, 0}
172 };
173 
174 int main(int argc, char **argv)
175 try {
176  string langs = Xapian::Stem::get_available_languages();
177  test_driver::add_command_line_option("languages", 'l', &langs);
178 
179  seed = 42;
180  string seed_str;
181  test_driver::add_command_line_option("seed", 's', &seed_str);
182 
185  int result = 0;
186 
187  if (!seed_str.empty()) seed = atoi(seed_str.c_str());
188  cout << "The random seed is " << seed << endl;
189  cout << "Please report the seed when reporting a test failure." << endl;
190 
191  string::size_type b = 0;
192  while (b != langs.size()) {
193  string::size_type a = b;
194  while (b < langs.size() && langs[b] != ' ') ++b;
195  language.assign(langs, a, b - a);
196  while (b < langs.size() && langs[b] == ' ') ++b;
197  cout << "Running tests with " << language << " stemmer..." << endl;
198  stemmer = Xapian::Stem(language);
199  result = max(result, test_driver::run(tests));
200  }
201  return result;
202 } catch (const char * e) {
203  cout << e << endl;
204  return 1;
205 }
static const int JUNKSIZE
Definition: stemtest.cc:37
static void test_stemrandom()
Definition: stemtest.cc:49
static const test_desc tests[]
The list of tests to perform.
Definition: stemtest.cc:167
static void parse_command_line(int argc, char **argv)
Parse the command line arguments.
Definition: testsuite.cc:799
Class representing a stemming algorithm.
Definition: stem.h:62
static void test_stemjunk()
Definition: stemtest.cc:84
a generic test suite engine
STL namespace.
static Xapian::Stem stemmer
Definition: stemtest.cc:41
static std::string get_srcdir()
Read srcdir from environment and if not present, make a valiant attempt to guess a value...
Definition: testsuite.cc:128
static void add_command_line_option(const std::string &l, char s, std::string *arg)
Add a test-specific command line option.
Definition: testsuite.cc:788
static void test_stemdict()
Definition: stemtest.cc:115
std::ostringstream tout
The debug printing stream.
Definition: testsuite.cc:103
Public interfaces for the Xapian library.
static string srcdir
Definition: stemtest.cc:43
static std::string get_available_languages()
Return a list of available languages.
Definition: stem.h:181
int main(int argc, char **argv)
Definition: stemtest.cc:174
static int seed
Definition: stemtest.cc:45
#define FAIL_TEST(MSG)
Fail the current testcase with message MSG.
Definition: testsuite.h:68
#define SKIP_TEST(MSG)
Skip the current testcase with message MSG.
Definition: testsuite.h:74
static int run(const test_desc *tests)
Definition: testsuite.cc:886
static string language
Definition: stemtest.cc:39
Structure holding a description of a test.
Definition: testsuite.h:77
#define TEST_EQUAL(a, b)
Test for equality of two things.
Definition: testsuite.h:278