xapian-core  1.4.25
index_utils.cc
Go to the documentation of this file.
1 
4 /* Copyright (C) 2005,2007,2013 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include "index_utils.h"
24 
25 #include "errno_to_string.h"
26 #include "stringutils.h"
27 
28 #include <algorithm>
29 #include <cerrno>
30 #include <cstring>
31 #include <fstream>
32 
33 using namespace std;
34 
// Forward declaration: munge_term() is defined further down in this file but
// is called from the word-indexing loop which appears before its definition.
35 static string munge_term(const string &term);
36 
38 static string
40 {
41  string para, line;
42  while (true) {
43  getline(input, line);
44  if (find_if(line.begin(), line.end(), C_isnotspace) == line.end())
45  return para;
46  para += line;
47  para += '\n';
48  }
49 }
50 
51 void
53 {
54  Xapian::Stem stemmer("english");
55 
56  while (file != end || (input.is_open() && !input.eof())) {
57  if (input.eof()) next_file();
58 
59  Xapian::Document doc;
60  string para = get_paragraph(input);
61  doc.set_data(para);
62 
63  // Value 0 contains all possible character values so we can check that
64  // none of them cause problems.
65  string value0("X\0\0\0 \1\t"
66  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
67  "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
68  "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
69  "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
70  "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
71  "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
72  "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
73  "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
74  "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
75  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
76  "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
77  "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
78  "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
79  "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
80  "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
81  "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
82  7 + 256);
83  if (para.size() > 2) value0[0] = para[2];
84  value0 += para;
85  doc.add_value(0, value0);
86 
87  for (Xapian::valueno i = min(para.length(), size_t(10)); i >= 1; --i) {
88  doc.add_value(i, para.substr(i, 1));
89  }
90  // Value 11 is useful for tests of sorting
91  doc.add_value(11, Xapian::sortable_serialise(para.size()));
92 
93  // Value 12 is useful for tests of collapsing
94  doc.add_value(12, Xapian::sortable_serialise(para.size() % 5));
95 
96  // Value 13 contains the first 3 letters of the paragraph
97  doc.add_value(13, para.substr(0, 3));
98 
99  Xapian::termpos pos = 0;
100  string::const_iterator word_end = para.begin();
101  // Need a const_iterator version of para.end() for find_if.
102  const string::const_iterator para_end = para.end();
103  while (word_end != para_end) {
104  string::const_iterator word_start;
105  word_start = find_if(word_end, para_end, C_isnotspace);
106  word_end = find_if(word_start, para_end, C_isspace);
107  string word = stemmer(munge_term(string(word_start, word_end)));
108  if (!word.empty()) doc.add_posting(word, ++pos);
109  }
110 
111  db.add_document(doc);
112  }
113 }
114 
115 // Strip unwanted characters, force to lower case, and handle \ escapes.
116 static string
117 munge_term(const string &term)
118 {
119  string result;
120  for (string::const_iterator i = term.begin(); i != term.end(); ++i) {
121  char ch = *i;
122  if (C_isalnum(ch))
123  result += C_tolower(ch);
124  else if (ch == '\\') {
125  ++i;
126  if (i != term.end()) {
127  switch (*i) {
128  case '\\': ch = '\\'; break;
129  case '0': ch = '\0'; break;
130  case 'n': ch = '\n'; break;
131  case 'r': ch = '\r'; break;
132  case 't': ch = '\t'; break;
133  case 'x': {
134  // Check we can read the next two characters.
135  if (size_t(i - term.begin()) >= term.size() - 2) {
136  --i;
137  break;
138  }
139  string::const_iterator j = i;
140  char b = *++i;
141  char c = *++i;
142  if (!C_isxdigit(b) || !C_isxdigit(c)) {
143  i = j - 1;
144  } else {
145  ch = hex_decode(b, c);
146  }
147  break;
148  }
149  }
150  }
151  result += ch;
152  }
153  }
154  return result;
155 }
156 
157 void
159 {
160  if (input.is_open()) {
161  input.close();
162  // MSVC doesn't clear fail() on close() and re-open().
163  input.clear();
164  }
165 
166  // Find the next non-empty filename.
167  while (file != end && (*file).empty()) {
168  ++file;
169  }
170  if (file == end) return;
171 
172  string filename;
173  if (!datadir.empty()) {
174  filename = datadir;
175  bool need_slash = true;
176  for (char dir_sep : DIR_SEPS_LIST) {
177  if (filename.back() == dir_sep) {
178  need_slash = false;
179  break;
180  }
181  }
182  if (need_slash) filename += '/';
183  }
184  filename += *file++;
185  filename += ".txt";
186 
187  input.open(filename.c_str());
188  // Need to check is_open() - just using operator! fails with MSVC.
189  if (!input.is_open()) {
190  string msg = "Can't read file '";
191  msg += filename;
192  msg += "' for indexing (";
193  errno_to_string(errno, msg);
194  msg += ')';
195  throw msg;
196  }
197 }
Xapian::docid add_document(const Xapian::Document &document)
Add a new document to the database.
Definition: omdatabase.cc:902
void add_value(Xapian::valueno slot, const std::string &value)
Add a new value.
Definition: omdocument.cc:107
#define DIR_SEPS_LIST
Definition: config.h:11
char C_tolower(char ch)
Definition: stringutils.h:221
void next_file()
Definition: index_utils.cc:158
Class representing a stemming algorithm.
Definition: stem.h:62
static string munge_term(const string &term)
Definition: index_utils.cc:117
Definition: header.h:63
Convert errno value to std::string, thread-safe if possible.
STL namespace.
std::string sortable_serialise(double value)
Convert a floating point number to a string, preserving sort order.
Definition: queryparser.h:1365
static Xapian::Stem stemmer
Definition: stemtest.cc:41
static string get_paragraph(istream &input)
Read a paragraph from stream input.
Definition: index_utils.cc:39
This class provides read/write access to a database.
Definition: database.h:789
void errno_to_string(int e, string &s)
bool C_isspace(char ch)
Definition: stringutils.h:208
bool C_isalnum(char ch)
Definition: stringutils.h:203
void add_posting(const std::string &tname, Xapian::termpos tpos, Xapian::termcount wdfinc=1)
Add an occurrence of a term at a particular position.
Definition: omdocument.cc:128
unsigned valueno
The number for a value slot in a document.
Definition: types.h:108
char hex_decode(char ch1, char ch2)
Decode a pair of ASCII hex digits.
Definition: stringutils.h:243
unsigned XAPIAN_TERMPOS_BASE_TYPE termpos
A term position within a document or query.
Definition: types.h:83
void index_to(Xapian::WritableDatabase &db)
Definition: index_utils.cc:52
Various handy helpers which std::string really should provide.
bool C_isxdigit(char ch)
Definition: stringutils.h:182
utility functions for indexing testcase data
void set_data(const std::string &data)
Set data stored in the document.
Definition: omdocument.cc:78
bool C_isnotspace(char ch)
Definition: stringutils.h:219
A handle representing a document in a Xapian database.
Definition: document.h:61