00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <cstdlib>
00026
00027 #include <string>
00028 #include <fstream>
00029 #include <iostream>
00030
00031 #include <xapian/stem.h>
00032 #include "testsuite.h"
00033
00034 using namespace std;
00035
00036 static const int JUNKSIZE = 2 * 1048576;
00037
00038 static string language;
00039
00040 static Xapian::Stem stemmer;
00041
00042 static string srcdir;
00043
00044 static int seed;
00045
00046
00047 static bool
00048 test_stemrandom()
00049 {
00050 static const char wordchars[] =
00051 "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
00052
00053 tout << "Stemming random text... (seed " << seed << ")" << endl;
00054 srand(seed);
00055
00056 string word;
00057 int stemmed_size = 0;
00058 for (int c = JUNKSIZE; c; --c) {
00059 char ch = wordchars[(rand() >> 8) % sizeof wordchars];
00060 if (ch) {
00061 word += ch;
00062 continue;
00063 }
00064 stemmed_size += stemmer(word).length();
00065 word.resize(0);
00066 }
00067 stemmed_size += stemmer(word).length();
00068 tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00069 << endl;
00070
00071 if (stemmed_size > JUNKSIZE * 101 / 100) {
00072 FAIL_TEST("Stemmed data is significantly bigger than input: "
00073 << stemmed_size << " vs. " << JUNKSIZE);
00074 }
00075 if (stemmed_size < JUNKSIZE / 2) {
00076 FAIL_TEST("Stemmed data is significantly smaller than input: "
00077 << stemmed_size << " vs. " << JUNKSIZE);
00078 }
00079 return true;
00080 }
00081
00082
00083 static bool
00084 test_stemjunk()
00085 {
00086 tout << "Stemming random junk... (seed " << seed << ")" << endl;
00087 srand(seed);
00088
00089 string word;
00090 int stemmed_size = 0;
00091 for (int c = JUNKSIZE; c; --c) {
00092 char ch = rand() >> 8;
00093 if (ch) {
00094 word += ch;
00095 continue;
00096 }
00097 stemmed_size += stemmer(word).length();
00098 word.resize(0);
00099 }
00100 stemmed_size += stemmer(word).length();
00101 tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00102 << endl;
00103
00104 if (stemmed_size > JUNKSIZE * 101 / 100) {
00105 FAIL_TEST("Stemmed data is significantly bigger than input ("
00106 << stemmed_size << " vs. " << JUNKSIZE);
00107 }
00108 if (stemmed_size < JUNKSIZE / 2) {
00109 FAIL_TEST("Stemmed data is significantly smaller than input ("
00110 << stemmed_size << " vs. " << JUNKSIZE);
00111 }
00112 return true;
00113 }
00114
00115 static bool
00116 test_stemdict()
00117 {
00118 string dir = srcdir + "/../../xapian-data/stemming/";
00119
00120 ifstream voc((dir + language + "/voc.txt").c_str());
00121 if (!voc.is_open()) {
00122 SKIP_TEST(language + "/voc.txt not found");
00123 }
00124
00125 ifstream st((dir + language + "/output.txt").c_str());
00126 if (!st.is_open()) {
00127 voc.close();
00128 FAIL_TEST(language + "/output.txt not found");
00129 }
00130
00131 tout << "Testing " << language << " with Snowball dictionary..." << endl;
00132
00133 int pass = 1;
00134 while (true) {
00135 string word, stem, expect;
00136 while (!voc.eof() && !st.eof()) {
00137 getline(voc, word);
00138 getline(st, expect);
00139
00140 stem = stemmer(word);
00141
00142 TEST_EQUAL(stem, expect);
00143 }
00144 voc.close();
00145 st.close();
00146
00147 if (pass == 2) break;
00148
00149 voc.open((dir + language + "/voc2.txt").c_str());
00150 if (!voc.is_open()) break;
00151
00152 st.open((dir + language + "/output2.txt").c_str());
00153 if (!st.is_open()) {
00154 voc.close();
00155 FAIL_TEST(language + "/output2.txt not found");
00156 }
00157 tout << "Testing " << language << " with supplemental dictionary..."
00158 << endl;
00159 ++pass;
00160 }
00161
00162 return true;
00163 }
00164
00165
00166
00167
00168
00170 static const test_desc tests[] = {
00171 {"stemrandom", test_stemrandom},
00172 {"stemjunk", test_stemjunk},
00173 {"stemdict", test_stemdict},
00174 {0, 0}
00175 };
00176
00177 int main(int argc, char **argv)
00178 try {
00179 string langs = Xapian::Stem::get_available_languages();
00180 test_driver::add_command_line_option("languages", 'l', &langs);
00181
00182 seed = 42;
00183 string seed_str;
00184 test_driver::add_command_line_option("seed", 's', &seed_str);
00185
00186 test_driver::parse_command_line(argc, argv);
00187 srcdir = test_driver::get_srcdir();
00188 int result = 0;
00189
00190 if (!seed_str.empty()) seed = atoi(seed_str.c_str());
00191 cout << "The random seed is " << seed << endl;
00192 cout << "Please report the seed when reporting a test failure." << endl;
00193
00194 string::size_type b = 0;
00195 while (b != langs.size()) {
00196 string::size_type a = b;
00197 while (b < langs.size() && langs[b] != ' ') ++b;
00198 language = langs.substr(a, b - a);
00199 while (b < langs.size() && langs[b] == ' ') ++b;
00200 cout << "Running tests with " << language << " stemmer..." << endl;
00201 stemmer = Xapian::Stem(language);
00202 result = max(result, test_driver::run(tests));
00203 }
00204 return result;
00205 } catch (const char * e) {
00206 cout << e << endl;
00207 return 1;
00208 }