00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifndef XAPIAN_INCLUDED_UNICODE_H
00022 #define XAPIAN_INCLUDED_UNICODE_H
00023
00024 #include <xapian/visibility.h>
00025
00026 #include <string>
00027
00028 namespace Xapian {
00029
/** Forward-only iterator over a buffer of UTF-8 encoded bytes.
 *
 *  The iterator walks a [start, end) byte range.  Once the last sequence has
 *  been stepped over, the current position is set to NULL; a
 *  default-constructed iterator is in that same state, so it serves as the
 *  "end" value to compare against.
 *
 *  The iterator does not own or copy the bytes it walks; the buffer passed
 *  to a constructor or to assign() must outlive the iterator.
 */
class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
    /// Current position in the buffer, or NULL once iteration has finished.
    const unsigned char *p;

    /// One past the last byte of the buffer.
    const unsigned char *end;

    /** Cached length in bytes of the sequence at @a p; 0 means "not yet
     *  calculated".  Mutable so that const members can fill it in lazily.
     */
    mutable unsigned seqlen;

    /** Defined out of line.  operator++ calls this when @a seqlen is 0 and
     *  then advances by @a seqlen, so it is expected to set @a seqlen for
     *  the sequence at @a p.
     */
    void calculate_sequence_length() const;

    /// Defined out of line.
    unsigned get_char() const;

    /// Build an iterator directly from its three fields (used by postfix ++).
    Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
        : p(p_), end(end_), seqlen(seqlen_) { }

  public:
    /** Return a pointer to the current position as plain chars.
     *
     *  Once iteration has finished this is the end of the buffer.  For a
     *  default-constructed iterator it is NULL.
     */
    const char * raw() const {
        return reinterpret_cast<const char *>(p ? p : end);
    }

    /// Return the number of bytes between the current position and the end.
    size_t left() const { return p ? end - p : 0; }

    /** Point this iterator at a new buffer.
     *
     *  @param p_   Start of the bytes to iterate.
     *  @param len  Number of bytes.  If zero the iterator is immediately in
     *              the finished state.
     *
     *  All three members are always written, so that raw() and left() are
     *  well defined even when @a len is zero (previously @a end and
     *  @a seqlen were left untouched in that case, which meant they were
     *  uninitialised when this was called from a constructor).
     */
    void assign(const char *p_, size_t len) {
        const unsigned char *start = reinterpret_cast<const unsigned char*>(p_);
        // Avoid pointer arithmetic entirely for the empty case so that a
        // NULL p_ with len == 0 is harmless.
        end = len ? start + len : start;
        seqlen = 0;
        p = len ? start : NULL;
    }

    /** Point this iterator at the bytes of @a s.
     *
     *  @param s  String to iterate; it must outlive the iterator.
     */
    void assign(const std::string &s) { assign(s.data(), s.size()); }

    /** Construct from a C string.  Defined out of line.
     *
     *  @param p_  Start of the string.
     */
    explicit Utf8Iterator(const char *p_);

    /** Construct from a pointer and a byte count.
     *
     *  @param p_   Start of the bytes to iterate.
     *  @param len  Number of bytes.
     */
    Utf8Iterator(const char *p_, size_t len)
        : p(NULL), end(NULL), seqlen(0) { assign(p_, len); }

    /** Construct from a std::string (implicit conversion is intentional and
     *  kept for existing callers).
     *
     *  @param s  String to iterate; it must outlive the iterator.
     */
    Utf8Iterator(const std::string &s)
        : p(NULL), end(NULL), seqlen(0) { assign(s.data(), s.size()); }

    /// Construct an iterator in the finished state (the "end" iterator).
    Utf8Iterator() : p(NULL), end(NULL), seqlen(0) { }

    /// Return the value at the current position.  Defined out of line.
    unsigned operator*() const;

    /** Postfix increment: advance and return a copy of the previous state.
     *
     *  The sequence length is resolved before the copy is taken so that the
     *  returned iterator carries the cached length, as it always has.
     */
    Utf8Iterator operator++(int) {
        if (seqlen == 0) calculate_sequence_length();
        Utf8Iterator previous(p, end, seqlen);
        ++*this;
        return previous;
    }

    /** Prefix increment: step over the sequence at the current position.
     *
     *  On reaching the end of the buffer the position becomes NULL so that
     *  the iterator compares equal to a default-constructed one.
     */
    Utf8Iterator & operator++() {
        if (seqlen == 0) calculate_sequence_length();
        p += seqlen;
        if (p == end) p = NULL;
        seqlen = 0;
        return *this;
    }

    /** Equality compares the current position only, so every finished
     *  iterator is equal to every other finished iterator.
     */
    bool operator==(const Utf8Iterator &other) const { return p == other.p; }

    /// Inequality; the negation of operator==.
    bool operator!=(const Utf8Iterator &other) const { return p != other.p; }

    // Member types expected by std::iterator_traits.
    // NOTE(review): difference_type is unsigned here whereas the standard
    // library expects a signed type; left unchanged because it is part of
    // the public interface.
    typedef std::input_iterator_tag iterator_category;
    typedef unsigned value_type;
    typedef size_t difference_type;
    typedef const unsigned * pointer;
    typedef const unsigned & reference;
};
00183
00185 namespace Unicode {
00186
/** Character categories.
 *
 *  The numeric values are significant: they are spelled out explicitly
 *  because elsewhere in this header they are used as bit positions
 *  (1 << VALUE) and are extracted from the low five bits of a packed
 *  integer, so every value must stay below 32 and must not be renumbered.
 */
typedef enum {
    UNASSIGNED                = 0,

    // Letters.
    UPPERCASE_LETTER          = 1,
    LOWERCASE_LETTER          = 2,
    TITLECASE_LETTER          = 3,
    MODIFIER_LETTER           = 4,
    OTHER_LETTER              = 5,

    // Marks.
    NON_SPACING_MARK          = 6,
    ENCLOSING_MARK            = 7,
    COMBINING_SPACING_MARK    = 8,

    // Numbers.
    DECIMAL_DIGIT_NUMBER      = 9,
    LETTER_NUMBER             = 10,
    OTHER_NUMBER              = 11,

    // Separators.
    SPACE_SEPARATOR           = 12,
    LINE_SEPARATOR            = 13,
    PARAGRAPH_SEPARATOR       = 14,

    // Other.
    CONTROL                   = 15,
    FORMAT                    = 16,
    PRIVATE_USE               = 17,
    SURROGATE                 = 18,

    // Punctuation.
    CONNECTOR_PUNCTUATION     = 19,
    DASH_PUNCTUATION          = 20,
    OPEN_PUNCTUATION          = 21,
    CLOSE_PUNCTUATION         = 22,
    INITIAL_QUOTE_PUNCTUATION = 23,
    FINAL_QUOTE_PUNCTUATION   = 24,
    OTHER_PUNCTUATION         = 25,

    // Symbols.
    MATH_SYMBOL               = 26,
    CURRENCY_SYMBOL           = 27,
    MODIFIER_SYMBOL           = 28,
    OTHER_SYMBOL              = 29
} category;
00220
00221 namespace Internal {
/** Look up the packed property word for the character @a ch.
 *
 *  Declared here and defined out of line.  The inline helpers in this
 *  namespace decode the returned value: get_category() reads the low five
 *  bits (info & 0x1f), get_case_type() reads bits 5-7 (info & 0xe0), and
 *  get_delta() reads everything from bit 15 upwards as a signed quantity.
 *
 *  NOTE(review): every inline caller in this header rejects
 *  ch >= 0x110000 before calling this, so presumably larger values are not
 *  valid input - confirm against the definition.
 *
 *  @param ch  Character value to look up.
 *  @return    Packed property word for @a ch.
 */
00227 XAPIAN_VISIBILITY_DEFAULT
00228 int get_character_info(unsigned ch);
00229
/** Extract the three-bit case-type field from a packed property word.
 *
 *  @param info  Value returned by get_character_info().
 *  @return      Bits 5-7 of @a info, shifted down into the range 0-7.
 */
inline int get_case_type(int info) {
    const int CASE_TYPE_MASK = 0xe0;
    const int CASE_TYPE_SHIFT = 5;
    // Mask first so the value being shifted is never negative.
    const int case_bits = info & CASE_TYPE_MASK;
    return case_bits >> CASE_TYPE_SHIFT;
}
00234
00236 inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
00237
/** Extract the signed delta field from a packed property word.
 *
 *  The delta occupies bit 15 upwards of @a info and may be negative, so an
 *  arithmetic (sign-preserving) shift is required.  Right-shifting a
 *  negative signed value is implementation-defined, so the plain shift is
 *  only used on its own for compilers that define __GNUC__; elsewhere
 *  negative values are complemented, shifted and complemented back, which
 *  gives the same result using only shifts of non-negative values.
 *
 *  @param info  Value returned by get_character_info().
 *  @return      @a info shifted right by 15 bits with the sign preserved.
 */
inline int get_delta(int info) {
#ifndef __GNUC__
    if (info < 0) {
        return ~(~info >> 15);
    }
#endif
    return info >> 15;
}
00258 }
00259
/** Out-of-line encoder that to_utf8() falls back to.
 *
 *  Declared here and defined elsewhere.  to_utf8() handles ch < 128 itself
 *  and forwards every other value to this function, returning its result
 *  unchanged.
 *
 *  @param ch   Character value to encode.
 *  @param buf  Destination buffer.  NOTE(review): append_utf8() passes a
 *              4-byte buffer, so presumably at most 4 bytes are written -
 *              confirm against the definition.
 *  @return     Presumably the number of bytes written to @a buf;
 *              append_utf8() uses the value as a byte count.
 */
00270 XAPIAN_VISIBILITY_DEFAULT
00271 unsigned nonascii_to_utf8(unsigned ch, char * buf);
00272
00281 inline unsigned to_utf8(unsigned ch, char *buf) {
00282 if (ch < 128) {
00283 *buf = static_cast<unsigned char>(ch);
00284 return 1;
00285 }
00286 return Xapian::Unicode::nonascii_to_utf8(ch, buf);
00287 }
00288
00292 inline void append_utf8(std::string &s, unsigned ch) {
00293 char buf[4];
00294 s.append(buf, to_utf8(ch, buf));
00295 }
00296
00298 inline category get_category(unsigned ch) {
00299
00300 if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED;
00301 return Internal::get_category(Internal::get_character_info(ch));
00302 }
00303
00305 inline bool is_wordchar(unsigned ch) {
00306 const unsigned int WORDCHAR_MASK =
00307 (1 << Xapian::Unicode::UPPERCASE_LETTER) |
00308 (1 << Xapian::Unicode::LOWERCASE_LETTER) |
00309 (1 << Xapian::Unicode::TITLECASE_LETTER) |
00310 (1 << Xapian::Unicode::MODIFIER_LETTER) |
00311 (1 << Xapian::Unicode::OTHER_LETTER) |
00312 (1 << Xapian::Unicode::NON_SPACING_MARK) |
00313 (1 << Xapian::Unicode::ENCLOSING_MARK) |
00314 (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
00315 (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
00316 (1 << Xapian::Unicode::LETTER_NUMBER) |
00317 (1 << Xapian::Unicode::OTHER_NUMBER) |
00318 (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
00319 return ((WORDCHAR_MASK >> get_category(ch)) & 1);
00320 }
00321
00323 inline bool is_whitespace(unsigned ch) {
00324 const unsigned int WHITESPACE_MASK =
00325 (1 << Xapian::Unicode::CONTROL) |
00326 (1 << Xapian::Unicode::SPACE_SEPARATOR) |
00327 (1 << Xapian::Unicode::LINE_SEPARATOR) |
00328 (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
00329 return ((WHITESPACE_MASK >> get_category(ch)) & 1);
00330 }
00331
00333 inline bool is_currency(unsigned ch) {
00334 return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
00335 }
00336
00338 inline unsigned tolower(unsigned ch) {
00339 int info;
00340
00341 if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2))
00342 return ch;
00343 return ch + Internal::get_delta(info);
00344 }
00345
00347 inline unsigned toupper(unsigned ch) {
00348 int info;
00349
00350 if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4))
00351 return ch;
00352 return ch - Internal::get_delta(info);
00353 }
00354
00356 inline std::string
00357 tolower(const std::string &term)
00358 {
00359 std::string result;
00360 result.reserve(term.size());
00361 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
00362 append_utf8(result, tolower(*i));
00363 }
00364 return result;
00365 }
00366
00368 inline std::string
00369 toupper(const std::string &term)
00370 {
00371 std::string result;
00372 result.reserve(term.size());
00373 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
00374 append_utf8(result, toupper(*i));
00375 }
00376 return result;
00377 }
00378
00379 }
00380
00381 }
00382
00383 #endif // XAPIAN_INCLUDED_UNICODE_H