00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <config.h>
00021
00022 #include <xapian/unicode.h>
00023
00024 #include <cstring>
00025
00026 using namespace std;
00027
/** Report whether a byte is NOT a UTF-8 continuation byte.
 *
 *  Continuation bytes have the bit pattern 10xxxxxx, so only the top two
 *  bits are inspected.
 *
 *  @param ch  The byte to examine.
 *  @return    true if the top two bits of ch are anything other than 10.
 */
inline bool bad_cont(unsigned char ch)
{
    const unsigned top_two_bits = ch & 0xc0;
    return top_two_bits != 0x80;
}
00029
00030 namespace Xapian {
00031
00032 namespace Unicode {
00033
00034
/** Encode a code point as a multi-byte UTF-8 sequence.
 *
 *  No check is made that ch is actually outside the ASCII range: a value
 *  below 0x80 is still written out in the two-byte form.
 *
 *  @param ch   The value to encode.
 *  @param buf  Destination for the encoded bytes; up to 4 bytes are written
 *              and no terminator is added.
 *  @return     The number of bytes stored in buf (2, 3 or 4), or 0 if ch is
 *              0x200000 or above, in which case buf is left untouched.
 */
unsigned
nonascii_to_utf8(unsigned ch, char * buf)
{
    // Choose the sequence length and the marker bits of its lead byte.
    unsigned len;
    unsigned lead_marker;
    if (ch < 0x800) {
        len = 2;
        lead_marker = 0xc0;
    } else if (ch < 0x10000) {
        len = 3;
        lead_marker = 0xe0;
    } else if (ch < 0x200000) {
        len = 4;
        lead_marker = 0xf0;
    } else {
        // Too large for a four-byte sequence: emit nothing.
        return 0;
    }

    // Fill in the continuation bytes from last to first, peeling six bits
    // off the value each time.
    unsigned remaining = ch;
    for (unsigned i = len - 1; i != 0; --i) {
        buf[i] = static_cast<char>(0x80 | (remaining & 0x3f));
        remaining >>= 6;
    }

    // Whatever is left over belongs in the lead byte.
    buf[0] = static_cast<char>(lead_marker | remaining);
    return len;
}
00061
00062 }
00063
00064 Utf8Iterator::Utf8Iterator(const char *p_)
00065 {
00066 assign(p_, strlen(p_));
00067 }
00068
00069 void
00070 Utf8Iterator::calculate_sequence_length() const
00071 {
00072
00073
00074
00075
00076
00077 unsigned char ch = *p;
00078
00079 seqlen = 1;
00080
00081
00082
00083
00084
00085 if (ch < 0xc2) return;
00086
00087 if (ch < 0xe0) {
00088 if (p + 1 == end ||
00089 bad_cont(p[1]))
00090 return;
00091 seqlen = 2;
00092 return;
00093 }
00094 if (ch < 0xf0) {
00095 if (end - p < 3 ||
00096 bad_cont(p[1]) || bad_cont(p[2]) ||
00097 (p[0] == 0xe0 && p[1] < 0xa0))
00098 return;
00099 seqlen = 3;
00100 return;
00101 }
00102 if (ch >= 0xf5 ||
00103 end - p < 4 ||
00104 bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) ||
00105 (p[0] == 0xf0 && p[1] < 0x90) ||
00106 (p[0] == 0xf4 && p[1] >= 0x90))
00107 return;
00108 seqlen = 4;
00109 return;
00110 }
00111
00112 unsigned Utf8Iterator::operator*() const {
00113 if (p == NULL) return unsigned(-1);
00114 if (seqlen == 0) calculate_sequence_length();
00115 unsigned char ch = *p;
00116 if (seqlen == 1) return ch;
00117 if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
00118 if (seqlen == 3)
00119 return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
00120 return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
00121 ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
00122 }
00123
00124 }