Kouhei Sutou
null+****@clear*****
Mon Jan 21 17:53:14 JST 2013
Kouhei Sutou 2013-01-21 17:53:14 +0900 (Mon, 21 Jan 2013) New Revision: 46c15627aa48627f5f24446006ca30dda4c2dae1 https://github.com/groonga/groonga/commit/46c15627aa48627f5f24446006ca30dda4c2dae1 Log: Remove deprecated grn_str_open() Modified files: lib/str.c plugins/suggest/suggest.c test/unit/util/test-string.c Modified: lib/str.c (+0 -1103) =================================================================== --- lib/str.c 2013-01-21 12:06:04 +0900 (a26e1b9) +++ lib/str.c 2013-01-21 17:53:14 +0900 (ec87c10) @@ -170,1109 +170,6 @@ grn_charlen(grn_ctx *ctx, const char *str, const char *end) return grn_charlen_(ctx, str, end, ctx->encoding); } -static unsigned char symbol[] = { - ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, - '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0 -}; - -inline static grn_rc -normalize_euc(grn_ctx *ctx, grn_str *nstr) -{ - static uint16_t hankana[] = { - 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, - 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, - 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, - 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, - 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, - 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, - 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, - 0xa1eb - }; - static unsigned char dakuten[] = { - 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, - 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, - 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0, - 0, 0xdc - }; - static unsigned char handaku[] = { - 0xd1, 
0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd - }; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_, b; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (((s + 1) < e) && (*(s + 1) & 0x80)) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - switch (c1 >> 4) { - case 0x08 : - if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { - uint16_t c = hankana[c2 - 0xa0]; - switch (c) { - case 0xa1ab : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0xa1eb : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = grn_str_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - case 0x09 : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - case 0x0a : - switch (c1 & 0x0f) { - case 1 : - switch 
(c2) { - case 0xbc : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 0xb9 : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - case 0xa1 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - break; - default : - if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { - *d = c3; - ctype = grn_str_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - } - break; - case 2 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - case 3 : - c3 = c2 - 0x80; - if ('a' <= c3 && c3 <= 'z') { - ctype = grn_str_alpha; - *d = c3; - } else if ('A' <= c3 && c3 <= 'Z') { - ctype = grn_str_alpha; - *d = c3 + 0x20; - } else if ('0' <= c3 && c3 <= '9') { - ctype = grn_str_digit; - *d = c3; - } else { - ctype = grn_str_others; - *d++ = c1; *d = c2; - } - break; - case 4 : - *d++ = c1; *d = c2; - ctype = grn_str_hiragana; - break; - case 5 : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 6 : - case 7 : - case 8 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - } - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - } - } else { - /* skip invalid character */ - continue; - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? 
c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} - -#ifdef WITH_NFKC -uint_least8_t grn_nfkc_ctype(const unsigned char *str); -const char *grn_nfkc_map1(const unsigned char *str); -const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix); - -inline static grn_rc -normalize_utf8(grn_ctx *ctx, grn_str *nstr) -{ - int16_t *ch; - const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; - unsigned char *d, *d_, *de; - uint_least8_t *cp; - size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(ds + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = nstr->ctypes; - d = (unsigned char *)nstr->norm; - de = d + ds; - d_ = NULL; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) { - if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { - break; - } - if ((p = 
(unsigned char *)grn_nfkc_map1(s))) { - pe = p + strlen((char *)p); - } else { - p = s; - pe = p + ls; - } - if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) { - p = p2; - pe = p + strlen((char *)p); - if (cp) { cp--; } - if (ch) { - ch -= (d - d_); - s_ = s__; - } - d = d_; - length--; - } - for (; ; p += lp) { - if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { - break; - } - if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { - if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - } else { - if (de <= d + lp) { - unsigned char *norm; - ds += (ds >> 1) + lp; - if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - de = norm + ds; - d = norm + (d - (unsigned char *)nstr->norm); - nstr->norm = norm; - if (ch) { - int16_t *checks; - if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - GRN_FREE(nstr->checks); nstr->checks = NULL; - GRN_FREE(nstr->norm); nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - ch = checks + (ch - nstr->checks); - nstr->checks = checks; - } - if (cp) { - uint_least8_t *ctypes; - if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { - GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - cp = ctypes + (cp - nstr->ctypes); - nstr->ctypes = ctypes; - } - } - memcpy(d, p, lp); - d_ = d; - d += lp; - length++; - if (cp) { *cp++ = grn_nfkc_ctype(p); } - if (ch) { - size_t i; - if (s_ == s + ls) { - *ch++ = -1; - } else { - *ch++ = (int16_t)(s + ls - s_); - s__ = s_; - s_ = s + ls; - } - for (i = lp; i > 1; i--) { *ch++ = 0; } - } - } - } - } - if (cp) { *cp = 
grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} -#endif /* WITH_NFKC */ - -inline static grn_rc -normalize_sjis(grn_ctx *ctx, grn_str *nstr) -{ - static uint16_t hankana[] = { - 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, - 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, - 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, - 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, - 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, - 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, - 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, - 0x814b - }; - static unsigned char dakuten[] = { - 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, - 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, - 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, - 0, 0x7b - }; - static unsigned char handaku[] = { - 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c - }; - int16_t *ch; - const unsigned char *s, *s_; - unsigned char *d, *d0, *d_, b, *e; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = ctypes = nstr->ctypes; - 
e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (0xa0 <= *s && *s <= 0xdf) { - uint16_t c = hankana[*s - 0xa0]; - switch (c) { - case 0x814a : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0x814b : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = grn_str_katakana; - } else { - if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - if (0x81 <= c1 && c1 <= 0x87) { - switch (c1 & 0x0f) { - case 1 : - switch (c2) { - case 0x5b : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 0x58 : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - case 0x40 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - break; - default : - if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { - *d = c3; - ctype = grn_str_symbol; - } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { - *d = c3; - ctype = grn_str_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - } - break; - case 2 : - c3 = c2 - 0x1f; - if (0x4f <= c2 && c2 <= 0x58) { - ctype = grn_str_digit; - *d = c2 - 0x1f; - } else if (0x60 <= c2 && c2 <= 0x79) { - ctype = grn_str_alpha; - *d = c2 + 0x01; - } else if (0x81 <= c2 && c2 <= 0x9a) { - ctype = grn_str_alpha; - *d = c2 - 0x20; - } else if (0x9f <= c2 && c2 <= 0xf1) { - *d++ = c1; *d = c2; - ctype = grn_str_hiragana; - } else { - *d++ = c1; *d = c2; - ctype = 
grn_str_others; - } - break; - case 3 : - if (0x40 <= c2 && c2 <= 0x96) { - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - } - break; - case 4 : - case 7 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - } - } else { - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - } - } else { - /* skip invalid character */ - continue; - } - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? 
grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} - -inline static grn_rc -normalize_none(grn_ctx *ctx, grn_str *nstr) -{ - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? 
grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} - -/* use cp1252 as latin1 */ -inline static grn_rc -normalize_latin1(grn_ctx *ctx, grn_str *nstr) -{ - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= 
GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - case 8 : - if (c == 0x8a || c == 0x8c || c == 0x8e) { - *d = c + 0x10; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 9 : - if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { - *d = (c == 0x9f) ? c + 0x60 : c; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 0x0c : - *d = c + 0x20; - ctype = grn_str_alpha; - break; - case 0x0d : - *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; - ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha; - break; - case 0x0e : - *d = c; - ctype = grn_str_alpha; - break; - case 0x0f : - *d = c; - ctype = (c == 0xf7) ? 
grn_str_symbol : grn_str_alpha; - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} - -inline static grn_rc -normalize_koi8r(grn_ctx *ctx, grn_str *nstr) -{ - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = strlen(nstr->orig), length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - return GRN_NO_MEMORY_AVAILABLE; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - return GRN_NO_MEMORY_AVAILABLE; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? 
grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - case 0x0a : - *d = c; - ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others; - break; - case 0x0b : - if (c == 0xb3) { - *d = c - 0x10; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_others; - } - break; - case 0x0c : - case 0x0d : - *d = c; - ctype = grn_str_alpha; - break; - case 0x0e : - case 0x0f : - *d = c - 0x20; - ctype = grn_str_alpha; - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return GRN_SUCCESS; -} - -static grn_str * -grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding encoding, int flags) -{ - /* TODO: support GRN_STR_REMOVEBLANK flag and ctypes */ - grn_str *nstr; - if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { - GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_fakenstr_open failed !"); - return NULL; - } - if (!(nstr->norm = GRN_MALLOC(str_len + 1))) { - GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation for keyword on grn_snip_add_cond failed !"); - GRN_FREE(nstr); - return NULL; - } - nstr->orig = str; - nstr->orig_blen = str_len; - memcpy(nstr->norm, str, str_len); - nstr->norm[str_len] = '\0'; - nstr->norm_blen = str_len; - nstr->ctypes = NULL; - nstr->flags = flags; - - if (flags & GRN_STR_WITH_CHECKS) { - int16_t f = 0; - unsigned char c; - size_t i; - if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) { - 
GRN_FREE(nstr->norm); - GRN_FREE(nstr); - return NULL; - } - switch (encoding) { - case GRN_ENC_EUC_JP: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1) - ); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - case GRN_ENC_SJIS: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - case GRN_ENC_UTF8: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3) - : 2) - : 1); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - default: - for (i = 0; i < str_len; i++) { - nstr->checks[i] = 1; - } - break; - } - } else { - nstr->checks = NULL; - } - return nstr; -} - -grn_str * -grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding) -{ - grn_rc rc; - grn_str *nstr; - if (!str || !str_len) { return NULL; } - - if (!(flags & GRN_STR_NORMALIZE)) { - return grn_fakenstr_open(ctx, str, str_len, encoding, flags); - } - - if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { - GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !"); - return NULL; - } - nstr->orig = str; - nstr->orig_blen = str_len; - nstr->norm = NULL; - nstr->norm_blen = 0; - nstr->checks = NULL; - nstr->ctypes = NULL; - nstr->encoding = encoding; - nstr->flags = flags; - switch (encoding) { - case GRN_ENC_EUC_JP : - rc = normalize_euc(ctx, nstr); - break; - case GRN_ENC_UTF8 : -#ifdef WITH_NFKC - rc = normalize_utf8(ctx, nstr); -#else /* WITH_NFKC */ - rc = normalize_none(ctx, nstr); -#endif /* WITH_NFKC */ - break; - case GRN_ENC_SJIS : - rc = normalize_sjis(ctx, nstr); - break; - case GRN_ENC_LATIN1 : - rc = normalize_latin1(ctx, nstr); - break; 
- case GRN_ENC_KOI8R : - rc = normalize_koi8r(ctx, nstr); - break; - default : - rc = normalize_none(ctx, nstr); - break; - } - if (rc) { - grn_str_close(ctx, nstr); - return NULL; - } - return nstr; -} - -grn_str * -grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_len, int flags) -{ - return grn_str_open_(ctx, str, str_len, flags, ctx->encoding); -} - -grn_rc -grn_str_close(grn_ctx *ctx, grn_str *nstr) -{ - if (nstr) { - if (nstr->norm) { GRN_FREE(nstr->norm); } - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); } - if (nstr->checks) { GRN_FREE(nstr->checks); } - GRN_FREE(nstr); - return GRN_SUCCESS; - } else { - return GRN_INVALID_ARGUMENT; - } -} - static const char *grn_enc_string[] = { "default", "none", Modified: plugins/suggest/suggest.c (+17 -6) =================================================================== --- plugins/suggest/suggest.c 2013-01-21 12:06:04 +0900 (0d84cfc) +++ plugins/suggest/suggest.c 2013-01-21 17:53:14 +0900 (b47174c) @@ -304,16 +304,25 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, if ((res = grn_table_create(ctx, NULL, 0, NULL, GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); - grn_str *norm; + grn_obj *string; if (GRN_TEXT_LEN(query) && - (norm = grn_str_open(ctx, TEXT_VALUE_LEN(query), GRN_STR_NORMALIZE))) { + (string = grn_string_open(ctx, TEXT_VALUE_LEN(query), + GRN_NORMALIZER_AUTO, 0))) { grn_table_cursor *cur; /* RK search + prefix search */ grn_obj *index; - /* FIXME: support index selection */ + const char *normalized; + unsigned int normalized_length_in_bytes; + grn_string_get_normalized(ctx, string, + &normalized, + &normalized_length_in_bytes, + NULL); + /* FIXME: support index selection */ if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) { if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain), - norm->norm, norm->norm_blen, NULL, 0, 0, -1, + normalized, + 
normalized_length_in_bytes, + NULL, 0, 0, -1, GRN_CURSOR_PREFIX|GRN_CURSOR_RK))) { grn_id id; while ((id = grn_table_cursor_next(ctx, cur))) { @@ -342,7 +351,9 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) || (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO && !grn_table_size(ctx, res))) && - (cur = grn_table_cursor_open(ctx, items, norm->norm, norm->norm_blen, + (cur = grn_table_cursor_open(ctx, items, + normalized, + normalized_length_in_bytes, NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) { grn_id id; while ((id = grn_table_cursor_next(ctx, cur))) { @@ -351,7 +362,7 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, } grn_table_cursor_close(ctx, cur); } - grn_str_close(ctx, norm); + grn_obj_close(ctx, string); } output(ctx, items, res, tid, sortby, output_columns, offset, limit); grn_obj_close(ctx, res); Modified: test/unit/util/test-string.c (+34 -14) =================================================================== --- test/unit/util/test-string.c 2013-01-21 12:06:04 +0900 (b1b617b) +++ test/unit/util/test-string.c 2013-01-21 17:53:14 +0900 (05fcb67) @@ -190,26 +190,36 @@ test_normalize(gconstpointer data) { const gchar *utf8_expected, *encoded_expected; const gchar *utf8_input, *encoded_input; - grn_str *string; + grn_obj *string; const gchar *normalized_text; - guint normalized_text_len; + guint normalized_text_length; + guint normalized_text_n_characters; int flags; grn_encoding encoding; encoding = gcut_data_get_int(data, "encoding"); GRN_CTX_SET_ENCODING(&context, encoding); - flags = GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES; + flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES; utf8_input = gcut_data_get_string(data, "input"); encoded_input = convert_encoding(utf8_input, encoding); - string = grn_str_open(&context, encoded_input, strlen(encoded_input), flags); - normalized_text = cut_take_strndup(string->norm, string->norm_blen); 
- normalized_text_len = string->norm_blen; - grn_test_assert(grn_str_close(&context, string)); + string = grn_string_open(&context, + encoded_input, + strlen(encoded_input), + GRN_NORMALIZER_AUTO, + flags); + grn_string_get_normalized(&context, string, + &normalized_text, + &normalized_text_length, + &normalized_text_n_characters); + normalized_text = cut_take_strndup(normalized_text, normalized_text_length); + grn_obj_unlink(&context, string); utf8_expected = gcut_data_get_string(data, "expected"); encoded_expected = convert_encoding(utf8_expected, encoding); cut_assert_equal_string(encoded_expected, normalized_text); - cut_assert_equal_int(strlen(encoded_expected), normalized_text_len); + cut_assert_equal_uint(strlen(encoded_expected), normalized_text_length); + cut_assert_equal_uint(g_utf8_strlen(utf8_expected, -1), + normalized_text_n_characters); } void @@ -243,11 +253,13 @@ data_normalize_broken(void) void test_normalize_broken(gconstpointer data) { - grn_str *string; + grn_obj *string; const gchar *input, *encoded_input; + const gchar *normalized_text; grn_encoding input_encoding, context_encoding; gint input_length; - int flags = GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS | GRN_STR_WITH_CTYPES; + guint normalized_text_length, normalized_text_n_characters; + int flags = GRN_STRING_WITH_CHECKS | GRN_STRING_WITH_TYPES; context_encoding = gcut_data_get_int(data, "context-encoding"); GRN_CTX_SET_ENCODING(&context, context_encoding); @@ -259,10 +271,18 @@ test_normalize_broken(gconstpointer data) if (input_length < 0) { input_length = strlen(encoded_input); } - string = grn_str_open(&context, encoded_input, input_length, flags); - cut_assert_equal_string("", string->norm); - cut_assert_equal_int(0, string->norm_blen); - grn_test_assert(grn_str_close(&context, string)); + string = grn_string_open(&context, encoded_input, input_length, + GRN_NORMALIZER_AUTO, flags); + grn_string_get_normalized(&context, string, + &normalized_text, + &normalized_text_length, + 
&normalized_text_n_characters); + normalized_text = cut_take_strndup(normalized_text, normalized_text_length); + grn_obj_unlink(&context, string); + + cut_assert_equal_string("", normalized_text); + cut_assert_equal_int(0, normalized_text_length); + cut_assert_equal_int(0, normalized_text_n_characters); } void -------------- next part -------------- HTMLの添付ファイルを保管しました...Download