[Groonga-commit] groonga/groonga at f0e5b1e [master] mecab: support chunked tokenization


Kouhei Sutou null+****@clear*****
Sun Mar 1 00:41:03 JST 2015


Kouhei Sutou	2015-03-01 00:41:03 +0900 (Sun, 01 Mar 2015)

  New Revision: f0e5b1e438f92e163ac8683dcc894c0aeabd1739
  https://github.com/groonga/groonga/commit/f0e5b1e438f92e163ac8683dcc894c0aeabd1739

  Message:
    mecab: support chunked tokenization
    
    MeCab reports a "too long sentence" error for long text that contains no newlines.
    
    This feature splits long text (more than 8192 bytes by default) into
    small chunks and passes each chunk to MeCab, so MeCab no longer returns
    the "too long sentence" error.
    
    This feature uses the following rules to split text into chunks:
    
      * Split by one or more spaces (U+0020 SPACE or U+3000 IDEOGRAPHIC SPACE)
      * Split by about 8192 bytes
      * Split by delimiter character
    
    This feature treats the following characters as chunk delimiters; a
    worked example follows the list:
    
      * ,
      * .
      * !
      * ?
      * U+3001 IDEOGRAPHIC COMMA
      * U+3002 IDEOGRAPHIC FULL STOP
      * U+FF01 FULLWIDTH EXCLAMATION MARK
      * U+FF1F FULLWIDTH QUESTION MARK
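    
    For example, with the 30 byte threshold used by the test suite, the
    36 byte UTF-8 text `日本のエンジン。エンジン` is split at the ideographic
    full stop into the chunks `日本のエンジン。` and `エンジン`, and each
    chunk is passed to MeCab separately.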
    
    This is an experimental feature. You need to set the
    `GRN_MECAB_CHUNKED_TOKENIZE_ENABLED=yes` environment variable to enable
    it.
    
    You can change the chunk size threshold in bytes (8192 by default) with
    the `GRN_MECAB_CHUNK_SIZE_THRESHOLD` environment variable.
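    
    A minimal shell sketch of enabling the feature (the database path and
    threshold value below are placeholders, not part of this commit):
    
      # Enable experimental chunked tokenization for the MeCab tokenizer.
      export GRN_MECAB_CHUNKED_TOKENIZE_ENABLED=yes
      # Optionally lower the chunk size threshold from the default 8192 bytes.
      export GRN_MECAB_CHUNK_SIZE_THRESHOLD=4096
      # Long text is now split into chunks before being passed to MeCab.
      groonga /path/to/db tokenize TokenMecab '日本のエンジン。エンジン'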

  Added files:
    test/command/suite/tokenizers/mecab/chunk/comma.expected
    test/command/suite/tokenizers/mecab/chunk/comma.test
    test/command/suite/tokenizers/mecab/chunk/exclamation_mark.expected
    test/command/suite/tokenizers/mecab/chunk/exclamation_mark.test
    test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.expected
    test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.test
    test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.expected
    test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.test
    test/command/suite/tokenizers/mecab/chunk/ideographic_comma.expected
    test/command/suite/tokenizers/mecab/chunk/ideographic_comma.test
    test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.expected
    test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.test
    test/command/suite/tokenizers/mecab/chunk/ideographic_space.expected
    test/command/suite/tokenizers/mecab/chunk/ideographic_space.test
    test/command/suite/tokenizers/mecab/chunk/period.expected
    test/command/suite/tokenizers/mecab/chunk/period.test
    test/command/suite/tokenizers/mecab/chunk/question_mark.expected
    test/command/suite/tokenizers/mecab/chunk/question_mark.test
    test/command/suite/tokenizers/mecab/chunk/space.expected
    test/command/suite/tokenizers/mecab/chunk/space.test
    test/command/suite/tokenizers/mecab/chunk/threshold.expected
    test/command/suite/tokenizers/mecab/chunk/threshold.test
  Modified files:
    plugins/tokenizers/mecab.c
    test/command/run-test.sh

  Modified: plugins/tokenizers/mecab.c (+207 -12)
===================================================================
--- plugins/tokenizers/mecab.c    2015-02-28 20:24:46 +0900 (7b3d59c)
+++ plugins/tokenizers/mecab.c    2015-03-01 00:41:03 +0900 (5f7c1a2)
@@ -30,6 +30,9 @@ static mecab_t *sole_mecab = NULL;
 static grn_plugin_mutex *sole_mecab_mutex = NULL;
 static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
 
+static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
+static int grn_mecab_chunk_size_threshold = 8192;
+
 typedef struct {
   mecab_t *mecab;
   grn_obj buf;
@@ -83,6 +86,163 @@ get_mecab_encoding(mecab_t *mecab)
   return encoding;
 }
 
+static inline grn_bool
+is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
+{
+  switch (character_bytes) {
+  case 1 :
+    switch (character[0]) {
+    case ',' :
+    case '.' :
+    case '!' :
+    case '?' :
+      return GRN_TRUE;
+    default :
+      return GRN_FALSE;
+    }
+  case 3 :
+    switch ((unsigned char)(character[0])) {
+    case 0xE3 :
+      switch ((unsigned char)(character[1])) {
+      case 0x80 :
+        switch ((unsigned char)(character[2])) {
+        case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */
+        case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */
+          return GRN_TRUE;
+        default :
+          return GRN_FALSE;
+        }
+      default :
+        return GRN_FALSE;
+      }
+      return GRN_FALSE;
+    case 0xEF :
+      switch ((unsigned char)(character[1])) {
+      case 0xBC :
+        switch ((unsigned char)(character[2])) {
+        case 0x81 :
+          /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */
+        case 0x9F :
+          /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */
+          return GRN_TRUE;
+        default :
+          return GRN_FALSE;
+        }
+      default :
+        return GRN_FALSE;
+      }
+      return GRN_FALSE;
+    default :
+      return GRN_FALSE;
+    }
+  default :
+    return GRN_FALSE;
+  }
+}
+
+static grn_bool
+chunked_tokenize_utf8_chunk(grn_ctx *ctx,
+                            grn_mecab_tokenizer *tokenizer,
+                            const char *chunk,
+                            unsigned int chunk_bytes)
+{
+  const char *tokenized_chunk;
+
+  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
+  if (!tokenized_chunk) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][mecab][chunk] "
+                     "mecab_sparse_tostr2() failed len=%d err=%s",
+                     chunk_bytes,
+                     mecab_strerror(tokenizer->mecab));
+    return GRN_FALSE;
+  }
+
+  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
+    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
+  }
+  GRN_TEXT_PUTS(ctx, &(tokenizer->buf), tokenized_chunk);
+
+  return GRN_TRUE;
+}
+
+static grn_bool
+chunked_tokenize_utf8(grn_ctx *ctx,
+                      grn_mecab_tokenizer *tokenizer,
+                      const char *string,
+                      unsigned int string_bytes)
+{
+  const char *chunk_start;
+  const char *current;
+  const char *string_end = string + string_bytes;
+  grn_encoding encoding = tokenizer->query->encoding;
+
+  if (string_bytes < grn_mecab_chunk_size_threshold) {
+    return chunked_tokenize_utf8_chunk(ctx,
+                                       tokenizer,
+                                       string,
+                                       string_bytes);
+  }
+
+  chunk_start = current = string;
+  while (current < string_end) {
+    int space_bytes;
+    int character_bytes;
+    const char *current_character;
+
+    space_bytes = grn_isspace(current, encoding);
+    if (space_bytes > 0) {
+      if (chunk_start != current) {
+        grn_bool succeeded;
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                current - chunk_start);
+        if (!succeeded) {
+          return succeeded;
+        }
+      }
+      current += space_bytes;
+      chunk_start = current;
+      continue;
+    }
+
+    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
+    if (character_bytes == 0) {
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][mecab][chunk] "
+                       "invalid byte sequence: position=%d",
+                       (int)(current - string));
+      return GRN_FALSE;
+    }
+
+    current_character = current;
+    current += character_bytes;
+
+    if (is_delimiter_character(ctx, current_character, character_bytes) ||
+        (current - chunk_start) >= grn_mecab_chunk_size_threshold) {
+      grn_bool succeeded;
+      succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                              tokenizer,
+                                              chunk_start,
+                                              current - chunk_start);
+      if (!succeeded) {
+        return succeeded;
+      }
+      chunk_start = current;
+    }
+  }
+
+  if (current == chunk_start) {
+    return GRN_TRUE;
+  } else {
+    return chunked_tokenize_utf8_chunk(ctx,
+                                       tokenizer,
+                                       chunk_start,
+                                       current - chunk_start);
+  }
+}
+
 /*
   This function is called for a full text search query or a document to be
   indexed. This means that both short/long strings are given.
@@ -92,7 +252,6 @@ get_mecab_encoding(mecab_t *mecab)
 static grn_obj *
 mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  const char *s;
   grn_mecab_tokenizer *tokenizer;
   unsigned int normalizer_flags = 0;
   grn_tokenizer_query *query;
@@ -159,21 +318,33 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     tokenizer->next = "";
     tokenizer->end = tokenizer->next;
   } else {
+    grn_bool succeeded;
     grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
-    s = mecab_sparse_tostr2(tokenizer->mecab,
-                            normalized_string,
-                            normalized_string_length);
-    if (!s) {
-      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                       "[tokenizer][mecab] "
-                       "mecab_sparse_tostr() failed len=%d err=%s",
-                       normalized_string_length,
-                       mecab_strerror(tokenizer->mecab));
+    if (grn_mecab_chunked_tokenize_enabled &&
+        ctx->encoding == GRN_ENC_UTF8) {
+      succeeded = chunked_tokenize_utf8(ctx,
+                                        tokenizer,
+                                        normalized_string,
+                                        normalized_string_length);
     } else {
-      GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+      const char *s;
+      s = mecab_sparse_tostr2(tokenizer->mecab,
+                              normalized_string,
+                              normalized_string_length);
+      if (!s) {
+        succeeded = GRN_FALSE;
+        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                         "[tokenizer][mecab] "
+                         "mecab_sparse_tostr() failed len=%d err=%s",
+                         normalized_string_length,
+                         mecab_strerror(tokenizer->mecab));
+      } else {
+        succeeded = GRN_TRUE;
+        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+      }
     }
     grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
-    if (!s) {
+    if (!succeeded) {
       grn_tokenizer_query_close(ctx, tokenizer->query);
       GRN_PLUGIN_FREE(ctx, tokenizer);
       return NULL;
@@ -313,6 +484,30 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_INIT(grn_ctx *ctx)
 {
+  {
+    const char *env;
+
+    env = getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED");
+    grn_mecab_chunked_tokenize_enabled = (env && strcmp(env, "yes") == 0);
+  }
+
+  {
+    const char *env;
+
+    env = getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD");
+    if (env) {
+      int threshold = -1;
+      const char *end;
+      const char *rest;
+
+      end = env + strlen(env);
+      threshold = grn_atoi(env, end, &rest);
+      if (end > env && end == rest) {
+        grn_mecab_chunk_size_threshold = threshold;
+      }
+    }
+  }
+
   sole_mecab = NULL;
   sole_mecab_mutex = grn_plugin_mutex_open(ctx);
   if (!sole_mecab_mutex) {

  Modified: test/command/run-test.sh (+6 -0)
===================================================================
--- test/command/run-test.sh    2015-02-28 20:24:46 +0900 (37ad81b)
+++ test/command/run-test.sh    2015-03-01 00:41:03 +0900 (f1b65bf)
@@ -41,6 +41,12 @@ export GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE
 GRN_RUBY_SCRIPTS_DIR="$top_dir/lib/mrb/scripts"
 export GRN_RUBY_SCRIPTS_DIR
 
+: ${GRN_MECAB_CHUNKED_TOKENIZE_ENABLED:="yes"}
+export GRN_MECAB_CHUNKED_TOKENIZE_ENABLED
+
+: ${GRN_MECAB_CHUNK_SIZE_THRESHOLD:="30"}
+export GRN_MECAB_CHUNK_SIZE_THRESHOLD
+
 case `uname` in
   Linux|*BSD)
     LD_LIBRARY_PATH="$top_dir/lib/.libs:$LD_LIBRARY_PATH"

  Added: test/command/suite/tokenizers/mecab/chunk/comma.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/comma.expected    2015-03-01 00:41:03 +0900 (0283192)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン,エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": ",",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/comma.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/comma.test    2015-03-01 00:41:03 +0900 (56e7b3c)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン,エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/exclamation_mark.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/exclamation_mark.expected    2015-03-01 00:41:03 +0900 (6139cd3)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン!エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "!",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/exclamation_mark.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/exclamation_mark.test    2015-03-01 00:41:03 +0900 (6a85429)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン!エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.expected    2015-03-01 00:41:03 +0900 (b9ec3bb)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン!エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "!",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/fullwidth_exclamation_mark.test    2015-03-01 00:41:03 +0900 (6ad0bc6)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン!エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.expected    2015-03-01 00:41:03 +0900 (e0e0ea1)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン?エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "?",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/fullwidth_question_mark.test    2015-03-01 00:41:03 +0900 (6816f60)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン?エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_comma.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_comma.expected    2015-03-01 00:41:03 +0900 (96a23ac)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン、エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "、",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_comma.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_comma.test    2015-03-01 00:41:03 +0900 (5361b3c)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン、エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.expected    2015-03-01 00:41:03 +0900 (4c3d597)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン。エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "。",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_full_stop.test    2015-03-01 00:41:03 +0900 (5fc434a)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン。エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_space.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_space.expected    2015-03-01 00:41:03 +0900 (d58121c)
@@ -0,0 +1,26 @@
+tokenize TokenMecab '日本のエンジン エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "エンジン",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/ideographic_space.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/ideographic_space.test    2015-03-01 00:41:03 +0900 (bc26402)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/period.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/period.expected    2015-03-01 00:41:03 +0900 (5e48aed)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン.エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": ".",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/period.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/period.test    2015-03-01 00:41:03 +0900 (35e5f7f)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン.エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/question_mark.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/question_mark.expected    2015-03-01 00:41:03 +0900 (16ae026)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本のエンジン?エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "?",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/question_mark.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/question_mark.test    2015-03-01 00:41:03 +0900 (832828f)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン?エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/space.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/space.expected    2015-03-01 00:41:03 +0900 (d5dda2b)
@@ -0,0 +1,26 @@
+tokenize TokenMecab '日本のエンジン エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "エンジン",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/space.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/space.test    2015-03-01 00:41:03 +0900 (442e83e)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジン エンジン'
+#@on-error default

  Added: test/command/suite/tokenizers/mecab/chunk/threshold.expected (+34 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/threshold.expected    2015-03-01 00:41:03 +0900 (0ffe971)
@@ -0,0 +1,34 @@
+tokenize TokenMecab '日本のエンジンとエンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "の",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "と",
+      "position": 3
+    },
+    {
+      "value": "エン",
+      "position": 4
+    },
+    {
+      "value": "ジン",
+      "position": 5
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/chunk/threshold.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/threshold.test    2015-03-01 00:41:03 +0900 (5f4d0f3)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本のエンジンとエンジン'
+#@on-error default