Kouhei Sutou
null+****@clear*****
Wed Mar 11 15:58:38 JST 2015
Kouhei Sutou 2015-03-11 15:58:38 +0900 (Wed, 11 Mar 2015) New Revision: 94a7b9e9f226dd70a0be7e8bc4c89b9500d33646 https://github.com/groonga/groonga/commit/94a7b9e9f226dd70a0be7e8bc4c89b9500d33646 Message: Support GRN_OP_REGEXP with inverted index Added files: test/command/suite/select/filter/regexp/asterisk.expected test/command/suite/select/filter/regexp/asterisk.test test/command/suite/select/filter/regexp/dot.expected test/command/suite/select/filter/regexp/dot.test test/command/suite/select/filter/regexp/literal.expected test/command/suite/select/filter/regexp/literal.test test/command/suite/select/filter/regexp/plus.expected test/command/suite/select/filter/regexp/plus.test test/command/suite/select/filter/regexp/question.expected test/command/suite/select/filter/regexp/question.test Modified files: lib/db.c lib/expr.c lib/ii.c lib/mrb/mrb_operator.c lib/mrb/scripts/scan_info_builder.rb lib/mrb/scripts/scan_info_data.rb Modified: lib/db.c (+30 -0) =================================================================== --- lib/db.c 2015-03-11 15:18:43 +0900 (69781cc) +++ lib/db.c 2015-03-11 15:58:38 +0900 (960fdad) @@ -10491,6 +10491,27 @@ grn_column_find_index_data_column_equal(grn_ctx *ctx, grn_obj *obj, return n; } +static inline grn_bool +is_valid_regexp_index(grn_ctx *ctx, grn_obj *index_column) +{ + grn_obj *tokenizer; + grn_obj *lexicon; + + lexicon = grn_ctx_at(ctx, index_column->header.domain); + if (!lexicon) { + return GRN_FALSE; + } + + grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL, NULL); + grn_obj_unlink(ctx, lexicon); + if (!tokenizer) { + return GRN_FALSE; + } + + /* TODO: Restrict to TokenRegexp? 
*/ + return GRN_TRUE; +} + static inline int grn_column_find_index_data_column_match(grn_ctx *ctx, grn_obj *obj, grn_operator op, @@ -10521,6 +10542,9 @@ grn_column_find_index_data_column_match(grn_ctx *ctx, grn_obj *obj, grn_obj *target = grn_ctx_at(ctx, data->target); int section; if (target->header.type != GRN_COLUMN_INDEX) { continue; } + if (op == GRN_OP_REGEXP && !is_valid_regexp_index(ctx, target)) { + continue; + } section = (MULTI_COLUMN_INDEXP(target)) ? data->section : 0; if (section_buf) { *section_buf = section; } if (n < buf_size) { @@ -10633,6 +10657,9 @@ is_valid_index(grn_ctx *ctx, grn_obj *index_column, grn_operator op) case GRN_OP_CALL : return is_valid_range_index(ctx, index_column); break; + case GRN_OP_REGEXP : + return is_valid_regexp_index(ctx, index_column); + break; default : return GRN_FALSE; break; @@ -10855,6 +10882,7 @@ grn_column_find_index_data_accessor(grn_ctx *ctx, grn_obj *obj, case GRN_OP_LESS_EQUAL : case GRN_OP_GREATER_EQUAL : case GRN_OP_CALL : + case GRN_OP_REGEXP : n = grn_column_find_index_data_accessor_match(ctx, obj, op, index_data, n_index_data, index_buf, buf_size, @@ -10887,6 +10915,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op, case GRN_OP_NEAR : case GRN_OP_NEAR2 : case GRN_OP_SIMILAR : + case GRN_OP_REGEXP : n = grn_column_find_index_data_column_match(ctx, obj, op, NULL, 0, index_buf, buf_size, @@ -10934,6 +10963,7 @@ grn_column_find_index_data(grn_ctx *ctx, grn_obj *obj, grn_operator op, case GRN_OP_NEAR : case GRN_OP_NEAR2 : case GRN_OP_SIMILAR : + case GRN_OP_REGEXP : n = grn_column_find_index_data_column_match(ctx, obj, op, index_data, n_index_data, NULL, 0, NULL); Modified: lib/expr.c (+109 -1) =================================================================== --- lib/expr.c 2015-03-11 15:18:43 +0900 (0f76dad) +++ lib/expr.c 2015-03-11 15:58:38 +0900 (2e03b83) @@ -4149,10 +4149,115 @@ scan_info_build_match_expr(grn_ctx *ctx, scan_info *si, grn_expr *expr) } } +static grn_bool 
+is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) +{ + const char *regexp_raw; + const char *regexp_raw_end; + grn_bool escaping = GRN_FALSE; + + if (!(regexp->header.domain == GRN_DB_SHORT_TEXT || + regexp->header.domain == GRN_DB_TEXT || + regexp->header.domain == GRN_DB_LONG_TEXT)) { + return GRN_FALSE; + } + + regexp_raw = GRN_TEXT_VALUE(regexp); + regexp_raw_end = regexp_raw + GRN_TEXT_LEN(regexp); + + while (regexp_raw < regexp_raw_end) { + unsigned int char_len; + + char_len = grn_charlen(ctx, regexp_raw, regexp_raw_end); + if (char_len == 0) { + return GRN_FALSE; + } + + if (char_len == 1) { + if (escaping) { + escaping = GRN_FALSE; + switch (regexp_raw[0]) { + case 'Z' : + case 'b' : + case 'B' : + case 'd' : + case 'D' : + case 'h' : + case 'H' : + case 'p' : + case 's' : + case 'S' : + case 'w' : + case 'W' : + case 'X' : + case 'k' : + case 'g' : + case '1' : + case '2' : + case '3' : + case '4' : + case '5' : + case '6' : + case '7' : + case '8' : + case '9' : + return GRN_FALSE; + default : + break; + } + } else { + switch (regexp_raw[0]) { + case '.' : + case '[' : + case ']' : + case '|' : + case '?' 
: + case '+' : + case '*' : + case '{' : + case '}' : + case '^' : + case '$' : + case '(' : + case ')' : + escaping = GRN_FALSE; + return GRN_FALSE; + case '\\' : + escaping = GRN_TRUE; + break; + default : + escaping = GRN_FALSE; + break; + } + } + } else { + escaping = GRN_FALSE; + } + + regexp_raw += char_len; + } + + return GRN_TRUE; +} + static void scan_info_build_match(grn_ctx *ctx, scan_info *si) { - grn_obj **p = si->args, **pe = si->args + si->nargs; + grn_obj **p, **pe; + + if (si->op == GRN_OP_REGEXP) { + p = si->args; + pe = si->args + si->nargs; + for (; p < pe; p++) { + if ((*p)->header.type == GRN_BULK && + !is_index_searchable_regexp(ctx, *p)) { + return; + } + } + } + + p = si->args; + pe = si->args + si->nargs; for (; p < pe; p++) { if ((*p)->header.type == GRN_EXPR) { scan_info_build_match_expr(ctx, si, (grn_expr *)(*p)); @@ -4246,6 +4351,7 @@ scan_info_build(grn_ctx *ctx, grn_obj *expr, int *n, case GRN_OP_GEO_WITHINP6 : case GRN_OP_GEO_WITHINP8 : case GRN_OP_TERM_EXTRACT : + case GRN_OP_REGEXP : if (stat < SCAN_COL1 || SCAN_CONST < stat) { return NULL; } stat = SCAN_START; m++; @@ -4326,6 +4432,7 @@ scan_info_build(grn_ctx *ctx, grn_obj *expr, int *n, case GRN_OP_GEO_WITHINP6 : case GRN_OP_GEO_WITHINP8 : case GRN_OP_TERM_EXTRACT : + case GRN_OP_REGEXP : stat = SCAN_START; si->op = c->op; si->end = c - e->codes; @@ -5041,6 +5148,7 @@ grn_table_select_index(grn_ctx *ctx, grn_obj *table, scan_info *si, case GRN_OP_NEAR : case GRN_OP_NEAR2 : case GRN_OP_SIMILAR : + case GRN_OP_REGEXP : { grn_obj wv, **ip = &GRN_PTR_VALUE(&si->index); int j = GRN_BULK_VSIZE(&si->index)/sizeof(grn_obj *); Modified: lib/ii.c (+6 -0) =================================================================== --- lib/ii.c 2015-03-11 15:18:43 +0900 (b9ab12f) +++ lib/ii.c 2015-03-11 15:58:38 +0900 (2808b6b) @@ -6475,6 +6475,9 @@ grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii, case GRN_OP_SIMILAR : mode = optarg->mode; break; + case GRN_OP_REGEXP : + mode = 
optarg->mode; + break; default : break; } @@ -6543,6 +6546,9 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len arg.mode = optarg->mode; arg.similarity_threshold = optarg->similarity_threshold; break; + case GRN_OP_REGEXP : + arg.mode = optarg->mode; + break; default : break; } Modified: lib/mrb/mrb_operator.c (+1 -0) =================================================================== --- lib/mrb/mrb_operator.c 2015-03-11 15:18:43 +0900 (59f68ee) +++ lib/mrb/mrb_operator.c 2015-03-11 15:58:38 +0900 (2e0cb48) @@ -147,6 +147,7 @@ grn_mrb_operator_init(grn_ctx *ctx) DEFINE_OPERATOR(TABLE_GROUP); DEFINE_OPERATOR(JSON_PUT); DEFINE_OPERATOR(GET_MEMBER); + DEFINE_OPERATOR(REGEXP); #undef DEFINE_OPERATOR } Modified: lib/mrb/scripts/scan_info_builder.rb (+1 -0) =================================================================== --- lib/mrb/scripts/scan_info_builder.rb 2015-03-11 15:18:43 +0900 (4b441ce) +++ lib/mrb/scripts/scan_info_builder.rb 2015-03-11 15:58:38 +0900 (dc003f8) @@ -34,6 +34,7 @@ module Groonga Operator::GEO_WITHINP6, Operator::GEO_WITHINP8, Operator::TERM_EXTRACT, + Operator::REGEXP, ] ARITHMETIC_OPERATORS = [ Modified: lib/mrb/scripts/scan_info_data.rb (+32 -0) =================================================================== --- lib/mrb/scripts/scan_info_data.rb 2015-03-11 15:18:43 +0900 (0555d50) +++ lib/mrb/scripts/scan_info_data.rb 2015-03-11 15:58:38 +0900 (d65536d) @@ -102,6 +102,38 @@ module Groonga self.query = arg end end + if @op == Operator::REGEXP and not index_searchable_regexp?(@query) + @search_indexes.clear + end + end + + def index_searchable_regexp?(pattern) + return false if pattern.nil? 
+ + previous_char = nil + pattern.value.each_char do |char| + if previous_char == "\\" + case char + when "Z" + return false + when "b", "B" + return false + when "d", "D", "h", "H", "p", "s", "S", "w", "W" + return false + when "X" + return false + when "k", "g", "1", "2", "3", "4", "5", "6", "7", "8", "9" + return false + end + else + case char + when ".", "[", "]", "|", "?", "+", "*", "{", "}", "^", "$", "(", ")" + return false + end + end + previous_char = char + end + true end def match_resolve_index_expression(expression) Added: test/command/suite/select/filter/regexp/asterisk.expected (+58 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/asterisk.expected 2015-03-11 15:58:38 +0900 (a492b76) @@ -0,0 +1,58 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenBigramSplitSymbolAlphaDigit +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] +[[0,0.0,0.0],5] +select Memos --filter 'content @~ "ro*nga"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 1, + "Grnga" + ], + [ + 2, + "Gronga" + ], + [ + 3, + "Groonga" + ], + [ + 4, + "Mroonga" + ] + ] + ] +] Added: test/command/suite/select/filter/regexp/asterisk.test (+18 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/asterisk.test 2015-03-11 15:58:38 +0900 (8011fa5) @@ -0,0 +1,18 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY 
ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlphaDigit +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] + +select Memos --filter 'content @~ "ro*nga"' Added: test/command/suite/select/filter/regexp/dot.expected (+48 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/dot.expected 2015-03-11 15:58:38 +0900 (a545e66) @@ -0,0 +1,48 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenBigramSplitSymbolAlphaDigit +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] +[[0,0.0,0.0],3] +select Memos --filter 'content @~ ".ro"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 1, + "Groonga" + ], + [ + 2, + "Mroonga" + ] + ] + ] +] Added: test/command/suite/select/filter/regexp/dot.test (+16 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/dot.test 2015-03-11 15:58:38 +0900 (964a3af) @@ -0,0 +1,16 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlphaDigit +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] + +select Memos --filter 'content @~ ".ro"' Added: 
test/command/suite/select/filter/regexp/literal.expected (+48 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/literal.expected 2015-03-11 15:58:38 +0900 (adeb75c) @@ -0,0 +1,48 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenBigramSplitSymbolAlphaDigit +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] +[[0,0.0,0.0],3] +select Memos --filter 'content @~ "oonga"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 1, + "Groonga" + ], + [ + 2, + "Mroonga" + ] + ] + ] +] Added: test/command/suite/select/filter/regexp/literal.test (+16 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/literal.test 2015-03-11 15:58:38 +0900 (dc35a23) @@ -0,0 +1,16 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlphaDigit +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] + +select Memos --filter 'content @~ "oonga"' Added: test/command/suite/select/filter/regexp/plus.expected (+54 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/plus.expected 2015-03-11 15:58:38 +0900 (e119a6f) @@ -0,0 +1,54 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos 
content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenBigramSplitSymbolAlphaDigit +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] +[[0,0.0,0.0],5] +select Memos --filter 'content @~ "ro+nga"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 3 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 2, + "Gronga" + ], + [ + 3, + "Groonga" + ], + [ + 4, + "Mroonga" + ] + ] + ] +] Added: test/command/suite/select/filter/regexp/plus.test (+18 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/plus.test 2015-03-11 15:58:38 +0900 (10e6ac1) @@ -0,0 +1,18 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlphaDigit +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] + +select Memos --filter 'content @~ "ro+nga"' Added: test/command/suite/select/filter/regexp/question.expected (+54 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/question.expected 2015-03-11 15:58:38 +0900 (51212b4) @@ -0,0 +1,54 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenBigramSplitSymbolAlphaDigit +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content 
COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] +[[0,0.0,0.0],5] +select Memos --filter 'content @~ "roo?nga"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 3 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 2, + "Gronga" + ], + [ + 3, + "Groonga" + ], + [ + 4, + "Mroonga" + ] + ] + ] +] Added: test/command/suite/select/filter/regexp/question.test (+18 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/regexp/question.test 2015-03-11 15:58:38 +0900 (383ec30) @@ -0,0 +1,18 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlphaDigit +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Grnga"}, +{"content": "Gronga"}, +{"content": "Groonga"}, +{"content": "Mroonga"}, +{"content": "mruby"} +] + +select Memos --filter 'content @~ "roo?nga"' -------------- next part -------------- HTMLの添付ファイルを保管しました... Download