[Groonga-commit] groonga/groonga at 77c72f1 [master] Add TokenRegexp


Kouhei Sutou null+****@clear*****
Thu Mar 12 20:36:16 JST 2015


Kouhei Sutou	2015-03-12 20:36:16 +0900 (Thu, 12 Mar 2015)

  New Revision: 77c72f1679761f6909061d38acc0ace5f22448aa
  https://github.com/groonga/groonga/commit/77c72f1679761f6909061d38acc0ace5f22448aa

  Message:
    Add TokenRegexp

  Added files:
    test/command/suite/tokenizers/regexp/add/four.expected
    test/command/suite/tokenizers/regexp/add/four.test
    test/command/suite/tokenizers/regexp/add/one.expected
    test/command/suite/tokenizers/regexp/add/one.test
    test/command/suite/tokenizers/regexp/add/three.expected
    test/command/suite/tokenizers/regexp/add/three.test
    test/command/suite/tokenizers/regexp/add/two.expected
    test/command/suite/tokenizers/regexp/add/two.test
    test/command/suite/tokenizers/regexp/get/begin/one.expected
    test/command/suite/tokenizers/regexp/get/begin/one.test
    test/command/suite/tokenizers/regexp/get/begin/three.expected
    test/command/suite/tokenizers/regexp/get/begin/three.test
    test/command/suite/tokenizers/regexp/get/begin/two.expected
    test/command/suite/tokenizers/regexp/get/begin/two.test
    test/command/suite/tokenizers/regexp/get/end/four.expected
    test/command/suite/tokenizers/regexp/get/end/four.test
    test/command/suite/tokenizers/regexp/get/end/one.expected
    test/command/suite/tokenizers/regexp/get/end/one.test
    test/command/suite/tokenizers/regexp/get/end/three.expected
    test/command/suite/tokenizers/regexp/get/end/three.test
    test/command/suite/tokenizers/regexp/get/end/two.expected
    test/command/suite/tokenizers/regexp/get/end/two.test
  Modified files:
    lib/grn_token_cursor.h
    lib/tokenizers.c
    test/command/suite/tokenizer_list/default.expected

  Modified: lib/grn_token_cursor.h (+5 -0)
===================================================================
--- lib/grn_token_cursor.h    2015-03-12 17:59:16 +0900 (81175bc)
+++ lib/grn_token_cursor.h    2015-03-12 20:36:16 +0900 (fec6222)
@@ -26,6 +26,11 @@
 extern "C" {
 #endif
 
+#define GRN_TOKENIZER_BEGIN_MARK_UTF8     "\xEF\xBF\xAF"
+#define GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN 3
+#define GRN_TOKENIZER_END_MARK_UTF8       "\xEF\xBF\xB0"
+#define GRN_TOKENIZER_END_MARK_UTF8_LEN   3
+
 typedef enum {
   GRN_TOKEN_CURSOR_DOING = 0,
   GRN_TOKEN_CURSOR_DONE,
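
The two constants added above are plain 3-byte UTF-8 sequences. A minimal
standalone sketch (not part of this commit; illustrative only) that decodes
them by hand shows they are U+FFEF and U+FFF0, code points that should not
normally occur in indexed text:

  /* Standalone sketch, not part of this commit: decode the two marker
     constants by hand to show which Unicode code points they encode. */
  #include <stdio.h>

  int
  main(void)
  {
    const unsigned char begin[] = "\xEF\xBF\xAF"; /* GRN_TOKENIZER_BEGIN_MARK_UTF8 */
    const unsigned char end[]   = "\xEF\xBF\xB0"; /* GRN_TOKENIZER_END_MARK_UTF8 */
    /* a 3-byte UTF-8 sequence is 1110xxxx 10xxxxxx 10xxxxxx */
    unsigned int begin_cp =
      ((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) | (begin[2] & 0x3F);
    unsigned int end_cp =
      ((end[0] & 0x0F) << 12) | ((end[1] & 0x3F) << 6) | (end[2] & 0x3F);
    printf("begin mark: U+%04X\n", begin_cp); /* U+FFEF */
    printf("end mark:   U+%04X\n", end_cp);   /* U+FFF0 */
    return 0;
  }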

  Modified: lib/tokenizers.c (+209 -1)
===================================================================
--- lib/tokenizers.c    2015-03-12 17:59:16 +0900 (3f6df15)
+++ lib/tokenizers.c    2015-03-12 20:36:16 +0900 (796c02d)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2014 Brazil
+  Copyright(C) 2009-2015 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -467,6 +467,212 @@ ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+/* regexp tokenizer */
+
+typedef struct {
+  grn_tokenizer_token token;
+  grn_tokenizer_query *query;
+  struct {
+    grn_bool have_begin;
+    grn_bool have_end;
+  } get;
+  grn_bool is_begin;
+  grn_bool is_end;
+  grn_bool is_overlapping;
+  const char *next;
+  const char *end;
+} grn_regexp_tokenizer;
+
+static grn_obj *
+regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  unsigned int normalize_flags = 0;
+  grn_tokenizer_query *query;
+  const char *normalized;
+  unsigned int normalized_length_in_bytes;
+  grn_regexp_tokenizer *tokenizer;
+
+  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
+  if (!query) {
+    return NULL;
+  }
+
+  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
+  if (!tokenizer) {
+    grn_tokenizer_query_close(ctx, query);
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][regexp] failed to allocate memory");
+    return NULL;
+  }
+  user_data->ptr = tokenizer;
+
+  grn_tokenizer_token_init(ctx, &(tokenizer->token));
+  tokenizer->query = query;
+
+  tokenizer->get.have_begin = GRN_FALSE;
+  tokenizer->get.have_end   = GRN_FALSE;
+
+  tokenizer->is_begin = GRN_TRUE;
+  tokenizer->is_end   = GRN_FALSE;
+  tokenizer->is_overlapping = GRN_FALSE;
+
+  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
+                            &normalized, &normalized_length_in_bytes,
+                            NULL);
+  tokenizer->next = normalized;
+  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+
+  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
+    unsigned int query_length = tokenizer->query->length;
+    if (query_length >= 2) {
+      const char *query_string = tokenizer->query->ptr;
+      grn_encoding encoding = tokenizer->query->encoding;
+      if (query_string[0] == '\\' && query_string[1] == 'A') {
+        tokenizer->get.have_begin = GRN_TRUE;
+        /* TODO: It assumes that both "\\" and "A" are normalized to 1
+           byte characters. Normalizer may omit character or expand to
+           multiple characters. */
+        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
+                                        encoding);
+        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
+                                        encoding);
+      }
+      if (query_string[query_length - 2] == '\\' &&
+          query_string[query_length - 1] == 'z') {
+        tokenizer->get.have_end = GRN_TRUE;
+        /* TODO: It assumes that both "\\" and "z" are normalized to 1
+           byte characters. Normalizer may omit character or expand to
+           multiple characters. */
+        tokenizer->end -= grn_charlen_(ctx,
+                                       tokenizer->end - 1,
+                                       tokenizer->end,
+                                       encoding);
+        tokenizer->end -= grn_charlen_(ctx,
+                                       tokenizer->end - 1,
+                                       tokenizer->end,
+                                       encoding);
+      }
+    }
+  }
+
+  return NULL;
+}
+
+static grn_obj *
+regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  int char_len;
+  grn_token_status status = 0;
+  grn_regexp_tokenizer *tokenizer = user_data->ptr;
+  unsigned int n_characters = 0;
+  int ngram_unit = 2;
+  const char *start = tokenizer->next;
+  const char *current = start;
+  const char *end = tokenizer->end;
+  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+
+  if (mode == GRN_TOKEN_GET) {
+    if (tokenizer->get.have_begin) {
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      tokenizer->get.have_begin = GRN_FALSE;
+      return NULL;
+    }
+
+    if (tokenizer->is_end && tokenizer->get.have_end) {
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+  } else {
+    if (tokenizer->is_begin) {
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      tokenizer->is_begin = GRN_FALSE;
+      return NULL;
+    }
+
+    if (tokenizer->is_end) {
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+  }
+
+  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
+  if (char_len == 0) {
+    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
+    return NULL;
+  }
+
+  n_characters++;
+  current += char_len;
+  tokenizer->next = current;
+  while (n_characters < ngram_unit) {
+    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
+                            tokenizer->query->encoding);
+    if (char_len == 0) {
+      break;
+    }
+    n_characters++;
+    current += char_len;
+  }
+
+  if (tokenizer->is_overlapping) {
+    status |= GRN_TOKEN_OVERLAP;
+  }
+  if (n_characters < ngram_unit) {
+    status |= GRN_TOKEN_UNMATURED;
+  }
+  tokenizer->is_overlapping = (n_characters > 1);
+
+  if (tokenizer->next == end) {
+    tokenizer->is_end = GRN_TRUE;
+    if (mode == GRN_TOKEN_GET) {
+      if (status & GRN_TOKEN_UNMATURED) {
+        status |= GRN_TOKEN_FORCE_PREFIX;
+      }
+      if (!tokenizer->get.have_end) {
+        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      }
+    }
+  }
+  grn_tokenizer_token_push(ctx,
+                           &(tokenizer->token),
+                           (const char *)start,
+                           current - start,
+                           status);
+  return NULL;
+}
+
+static grn_obj *
+regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_regexp_tokenizer *tokenizer = user_data->ptr;
+  if (!tokenizer) {
+    return NULL;
+  }
+  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
+  grn_tokenizer_query_close(ctx, tokenizer->query);
+  GRN_FREE(tokenizer);
+  return NULL;
+}
+
 /* external */
 
 grn_rc
@@ -560,5 +766,7 @@ grn_db_init_builtin_tokenizers(grn_ctx *ctx)
                 bigramisad_init, ngram_next, ngram_fin, vars);
   DEF_TOKENIZER("TokenDelimitNull",
                 delimit_null_init, delimited_next, delimited_fin, vars);
+  DEF_TOKENIZER("TokenRegexp",
+                regexp_init, regexp_next, regexp_fin, vars);
   return GRN_SUCCESS;
 }
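
For readers skimming regexp_next above: between the marker tokens it emits
overlapping two-character tokens and finishes with a shorter, unmatured token,
which is exactly what the add/*.expected files below record. A minimal
standalone sketch of that windowing, not part of this commit and assuming
single-byte characters (the tokenizer itself steps with grn_charlen_() and
also sets GRN_TOKEN_OVERLAP / GRN_TOKEN_UNMATURED):

  /* emit_bigrams() is a hypothetical helper, not Groonga API: it prints the
     same overlapping bigrams that regexp_next produces for ASCII input. */
  #include <stdio.h>
  #include <string.h>

  static void
  emit_bigrams(const char *text)
  {
    size_t length = strlen(text);
    size_t i;
    for (i = 0; i < length; i++) {
      /* take two characters when available; the last token may be shorter */
      size_t n = (length - i >= 2) ? 2 : 1;
      printf("%.*s\n", (int)n, text + i);
    }
  }

  int
  main(void)
  {
    emit_bigrams("abcd"); /* prints ab, bc, cd, d: the tokens in add/four.expected */
    return 0;
  }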

  Modified: test/command/suite/tokenizer_list/default.expected (+3 -0)
===================================================================
--- test/command/suite/tokenizer_list/default.expected    2015-03-12 17:59:16 +0900 (44118f4)
+++ test/command/suite/tokenizer_list/default.expected    2015-03-12 20:36:16 +0900 (a26eb0b)
@@ -44,6 +44,9 @@ tokenizer_list
     },
     {
       "name": "TokenDelimitNull"
+    },
+    {
+      "name": "TokenRegexp"
     }
   ]
 ]

  Added: test/command/suite/tokenizers/regexp/add/four.expected (+34 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/four.expected    2015-03-12 20:36:16 +0900 (ae80e56)
@@ -0,0 +1,34 @@
+tokenize TokenRegexp "abcd"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "cd",
+      "position": 3
+    },
+    {
+      "value": "d",
+      "position": 4
+    },
+    {
+      "value": "￯",
+      "position": 5
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/add/four.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/four.test    2015-03-12 20:36:16 +0900 (666a44d)
@@ -0,0 +1 @@
+tokenize TokenRegexp "abcd"

  Added: test/command/suite/tokenizers/regexp/add/one.expected (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/one.expected    2015-03-12 20:36:16 +0900 (6c2e513)
@@ -0,0 +1,22 @@
+tokenize TokenRegexp "x"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "x",
+      "position": 1
+    },
+    {
+      "value": "￯",
+      "position": 2
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/add/one.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/one.test    2015-03-12 20:36:16 +0900 (a723923)
@@ -0,0 +1 @@
+tokenize TokenRegexp "x"

  Added: test/command/suite/tokenizers/regexp/add/three.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/three.expected    2015-03-12 20:36:16 +0900 (e865c8c)
@@ -0,0 +1,30 @@
+tokenize TokenRegexp "xyz"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "yz",
+      "position": 2
+    },
+    {
+      "value": "z",
+      "position": 3
+    },
+    {
+      "value": "￯",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/add/three.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/three.test    2015-03-12 20:36:16 +0900 (4671e5b)
@@ -0,0 +1 @@
+tokenize TokenRegexp "xyz"

  Added: test/command/suite/tokenizers/regexp/add/two.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/two.expected    2015-03-12 20:36:16 +0900 (27eef72)
@@ -0,0 +1,26 @@
+tokenize TokenRegexp "xy"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "y",
+      "position": 2
+    },
+    {
+      "value": "￯",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/add/two.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/two.test    2015-03-12 20:36:16 +0900 (4bfc597)
@@ -0,0 +1 @@
+tokenize TokenRegexp "xy"
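
The GET-mode tests that follow exercise the "\A"/"\z" handling added to
regexp_init: a query starting with "\A" makes the tokenizer emit the marker
token first, and a query ending with "\z" makes it emit the marker after the
last bigram, as the get/begin/* and get/end/* expectations show. A minimal
standalone sketch of just the prefix/suffix detection, not part of this commit
and assuming a plain ASCII query string:

  /* Detects the "\A" prefix and "\z" suffix the same way regexp_init does;
     the real code additionally moves the normalized-text cursor with
     grn_charlen_(), which is omitted here. */
  #include <stdio.h>
  #include <string.h>

  int
  main(void)
  {
    const char *query = "abcd\\z"; /* e.g. table_tokenize Lexicon "abcd\\z" --mode GET */
    size_t length = strlen(query);
    int have_begin = (length >= 2 && query[0] == '\\' && query[1] == 'A');
    int have_end   = (length >= 2 &&
                      query[length - 2] == '\\' && query[length - 1] == 'z');
    printf("have_begin=%d have_end=%d\n", have_begin, have_end); /* 0 1 */
    return 0;
  }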

  Added: test/command/suite/tokenizers/regexp/get/begin/one.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/one.expected    2015-03-12 20:36:16 +0900 (254c754)
@@ -0,0 +1,26 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "x" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "x",
+      "position": 1
+    },
+    {
+      "value": "￯",
+      "position": 2
+    }
+  ]
+]
+table_tokenize Lexicon "\\Ax" --mode GET
+[[0,0.0,0.0],[{"value":"￯","position":0},{"value":"x","position":1}]]

  Added: test/command/suite/tokenizers/regexp/get/begin/one.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/one.test    2015-03-12 20:36:16 +0900 (0a4f35b)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "x" --mode ADD
+
+table_tokenize Lexicon "\\Ax" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/begin/three.expected (+54 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/three.expected    2015-03-12 20:36:16 +0900 (49bc507)
@@ -0,0 +1,54 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "xyz" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "yz",
+      "position": 2
+    },
+    {
+      "value": "z",
+      "position": 3
+    },
+    {
+      "value": "￯",
+      "position": 4
+    }
+  ]
+]
+table_tokenize Lexicon "\\Axyz" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "yz",
+      "position": 2
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/begin/three.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/three.test    2015-03-12 20:36:16 +0900 (82d674f)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "xyz" --mode ADD
+
+table_tokenize Lexicon "\\Axyz" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/begin/two.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/two.expected    2015-03-12 20:36:16 +0900 (2808762)
@@ -0,0 +1,30 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "xy" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "y",
+      "position": 2
+    },
+    {
+      "value": "￯",
+      "position": 3
+    }
+  ]
+]
+table_tokenize Lexicon "\\Axy" --mode GET
+[[0,0.0,0.0],[{"value":"￯","position":0},{"value":"xy","position":1}]]

  Added: test/command/suite/tokenizers/regexp/get/begin/two.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin/two.test    2015-03-12 20:36:16 +0900 (9be9343)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "xy" --mode ADD
+
+table_tokenize Lexicon "\\Axy" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/end/four.expected (+62 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/four.expected    2015-03-12 20:36:16 +0900 (af115be)
@@ -0,0 +1,62 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abcd" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "cd",
+      "position": 3
+    },
+    {
+      "value": "d",
+      "position": 4
+    },
+    {
+      "value": "￯",
+      "position": 5
+    }
+  ]
+]
+table_tokenize Lexicon "abcd\\z" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "bc",
+      "position": 1
+    },
+    {
+      "value": "cd",
+      "position": 2
+    },
+    {
+      "value": "￯",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/end/four.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/four.test    2015-03-12 20:36:16 +0900 (a4b1c2d)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "abcd" --mode ADD
+
+table_tokenize Lexicon "abcd\\z" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/end/one.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/one.expected    2015-03-12 20:36:16 +0900 (d692b4a)
@@ -0,0 +1,26 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "x" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "x",
+      "position": 1
+    },
+    {
+      "value": "￯",
+      "position": 2
+    }
+  ]
+]
+table_tokenize Lexicon "x\\z" --mode GET
+[[0,0.0,0.0],[{"value":"x","position":0},{"value":"￯","position":1}]]

  Added: test/command/suite/tokenizers/regexp/get/end/one.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/one.test    2015-03-12 20:36:16 +0900 (3314d6f)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "x" --mode ADD
+
+table_tokenize Lexicon "x\\z" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/end/three.expected (+54 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/three.expected    2015-03-12 20:36:16 +0900 (7759db6)
@@ -0,0 +1,54 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "xyz" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "yz",
+      "position": 2
+    },
+    {
+      "value": "z",
+      "position": 3
+    },
+    {
+      "value": "￯",
+      "position": 4
+    }
+  ]
+]
+table_tokenize Lexicon "xyz\\z" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "xy",
+      "position": 0
+    },
+    {
+      "value": "yz",
+      "position": 1
+    },
+    {
+      "value": "￯",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/end/three.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/three.test    2015-03-12 20:36:16 +0900 (510d69c)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "xyz" --mode ADD
+
+table_tokenize Lexicon "xyz\\z" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/end/two.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/two.expected    2015-03-12 20:36:16 +0900 (ccd7ce2)
@@ -0,0 +1,30 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "xy" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "xy",
+      "position": 1
+    },
+    {
+      "value": "y",
+      "position": 2
+    },
+    {
+      "value": "￯",
+      "position": 3
+    }
+  ]
+]
+table_tokenize Lexicon "xy\\z" --mode GET
+[[0,0.0,0.0],[{"value":"xy","position":0},{"value":"￯","position":2}]]

  Added: test/command/suite/tokenizers/regexp/get/end/two.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/end/two.test    2015-03-12 20:36:16 +0900 (58b3e77)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "xy" --mode ADD
+
+table_tokenize Lexicon "xy\\z" --mode GET