Kouhei Sutou
null+****@clear*****
Thu Oct 11 18:48:12 JST 2012
Kouhei Sutou 2012-10-11 18:48:12 +0900 (Thu, 11 Oct 2012) New Revision: 13d9bb7891d6a68a2247a462d6e2625e072983a6 https://github.com/groonga/groonga/commit/13d9bb7891d6a68a2247a462d6e2625e072983a6 Log: QueryExpanderTSV: support encoding comment It will work but it is not tested yet. We need to improve grntest for testing it. Modified files: plugins/query_expanders/tsv.c Modified: plugins/query_expanders/tsv.c (+68 -3) =================================================================== --- plugins/query_expanders/tsv.c 2012-10-11 18:15:54 +0900 (30d482f) +++ plugins/query_expanders/tsv.c 2012-10-11 18:48:12 +0900 (7adfef8) @@ -67,6 +67,63 @@ is_comment_mark(char character) return character == '#'; } +static inline grn_encoding +detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length) +{ + grn_encoding encoding; + grn_obj null_terminated_line_buffer; + const char *c_line; + const char *encoding_name; + + GRN_TEXT_INIT(&null_terminated_line_buffer, 0); + GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length); + GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0'); + + c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer); + encoding_name = strstr(c_line, "coding: "); + if (encoding_name) { + if (strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 || + strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) { + encoding = GRN_ENC_UTF8; + } else if (strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 || + strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) { + encoding = GRN_ENC_SJIS; + } else if (strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 || + strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) { + encoding = GRN_ENC_EUC_JP; + } else if (strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) { + encoding = GRN_ENC_LATIN1; + } else if (strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 || + strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) { + encoding = GRN_ENC_KOI8R; + } + } else { + encoding = ctx->encoding; + } + GRN_OBJ_FIN(ctx, &null_terminated_line_buffer); + + return encoding; +} + +static inline grn_encoding +guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length) +{ + const char bom[] = {0xef, 0xbb, 0xbf}; + size_t bom_length = sizeof(bom); + + if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) { + *line += bom_length; + *line_length -= bom_length; + return GRN_ENC_UTF8; + } + + if (!is_comment_mark((*line)[0])) { + return ctx->encoding; + } + + return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1); +} + static void parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length, grn_obj *key, grn_obj *value) @@ -127,6 +184,8 @@ load_synonyms(grn_ctx *ctx) { const char *path; FILE *file; + int number_of_lines; + grn_encoding encoding; grn_obj line, key, value; path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"); @@ -146,12 +205,18 @@ load_synonyms(grn_ctx *ctx) GRN_TEXT_INIT(&key, 0); GRN_TEXT_INIT(&value, 0); grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); + number_of_lines = 0; while (grn_text_fgets(ctx, &line, file) == GRN_SUCCESS) { + const char *line_value = GRN_TEXT_VALUE(&line); + size_t line_length = GRN_TEXT_LEN(&line); + + number_of_lines++; + if (number_of_lines == 1) { + encoding = guess_encoding(ctx, &line_value, &line_length); + } GRN_BULK_REWIND(&key); GRN_BULK_REWIND(&value); - parse_synonyms_file_line(ctx, - GRN_TEXT_VALUE(&line), GRN_TEXT_LEN(&line), - &key, &value); + parse_synonyms_file_line(ctx, line_value, line_length, &key, &value); GRN_BULK_REWIND(&line); } GRN_OBJ_FIN(ctx, &line); -------------- next part -------------- HTML����������������������������...Download