[Groonga-commit] groonga/groonga at a556a40 [master] mecab: use the last found delimiter instead of first found delimiter

Kouhei Sutou <null+****@clear*****>
Sun Mar 1 18:52:20 JST 2015


Kouhei Sutou	2015-03-01 18:52:20 +0900 (Sun, 01 Mar 2015)

  New Revision: a556a40f295a512a565de35c5046222ac68dc718
  https://github.com/groonga/groonga/commit/a556a40f295a512a565de35c5046222ac68dc718

  Message:
    mecab: use the last found delimiter instead of first found delimiter

  Added files:
    test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected
    test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test
  Modified files:
    plugins/tokenizers/mecab.c
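
  The gist of the change: previously, chunked_tokenize_utf8() flushed a
  chunk as soon as it hit any delimiter character, so every sentence
  boundary produced a separate (often tiny) MeCab invocation. Now the loop
  only remembers the position just past the most recent delimiter and
  splits once the chunk reaches grn_mecab_chunk_size_threshold, preferring
  that last delimiter as the split point and falling back to the current
  position when no delimiter was seen. A minimal, self-contained sketch of
  this strategy follows; is_delimiter(), emit_chunk() and
  CHUNK_SIZE_THRESHOLD are hypothetical stand-ins for
  is_delimiter_character(), chunked_tokenize_utf8_chunk() and
  grn_mecab_chunk_size_threshold, and the plugin's whitespace handling and
  UTF-8 character walking are omitted for brevity.

#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE_THRESHOLD 8 /* hypothetical; the real threshold is tunable */

/* Stand-in for is_delimiter_character(): the plugin detects UTF-8
 * punctuation such as U+3002 IDEOGRAPHIC FULL STOP; ASCII is used here
 * to keep the sketch short. */
static int
is_delimiter(char c)
{
  return c == '.' || c == '!' || c == '?';
}

/* Stand-in for chunked_tokenize_utf8_chunk(): just print the chunk. */
static void
emit_chunk(const char *start, size_t length)
{
  printf("chunk: <%.*s>\n", (int)length, start);
}

static void
chunk_string(const char *string, size_t length)
{
  const char *chunk_start = string;
  const char *current = string;
  const char *last_delimiter = NULL; /* just past the newest delimiter */
  const char *end = string + length;

  while (current < end) {
    if (is_delimiter(*current)) {
      last_delimiter = current + 1; /* remember it, but do not split yet */
    }
    current++;
    if ((size_t)(current - chunk_start) >= CHUNK_SIZE_THRESHOLD) {
      if (last_delimiter) {
        /* Split after the last delimiter so the chunk still ends on a
         * sentence boundary. */
        emit_chunk(chunk_start, last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        /* No delimiter seen: split at an arbitrary character boundary. */
        emit_chunk(chunk_start, current - chunk_start);
        chunk_start = current;
      }
      last_delimiter = NULL; /* the new chunk has no delimiter yet */
    }
  }
  if (chunk_start < end) {
    emit_chunk(chunk_start, end - chunk_start); /* trailing chunk */
  }
}

int
main(void)
{
  const char *text = "ab.cd.efghij.kl";
  chunk_string(text, strlen(text));
  /* Prints <ab.cd.>, <efghij.>, <kl> with the 8-byte threshold. */
  return 0;
}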

  Modified: plugins/tokenizers/mecab.c (+21 -7)
===================================================================
--- plugins/tokenizers/mecab.c    2015-03-01 00:41:03 +0900 (5f7c1a2)
+++ plugins/tokenizers/mecab.c    2015-03-01 18:52:20 +0900 (a9e6b65)
@@ -174,6 +174,7 @@ chunked_tokenize_utf8(grn_ctx *ctx,
 {
   const char *chunk_start;
   const char *current;
+  const char *last_delimiter;
   const char *string_end = string + string_bytes;
   grn_encoding encoding = tokenizer->query->encoding;
 
@@ -185,6 +186,7 @@ chunked_tokenize_utf8(grn_ctx *ctx,
   }
 
   chunk_start = current = string;
+  last_delimiter = NULL;
   while (current < string_end) {
     int space_bytes;
     int character_bytes;
@@ -204,6 +206,7 @@ chunked_tokenize_utf8(grn_ctx *ctx,
       }
       current += space_bytes;
       chunk_start = current;
+      last_delimiter = NULL;
       continue;
     }
 
@@ -218,18 +221,29 @@ chunked_tokenize_utf8(grn_ctx *ctx,
 
     current_character = current;
     current += character_bytes;
+    if (is_delimiter_character(ctx, current_character, character_bytes)) {
+      last_delimiter = current;
+    }
 
-    if (is_delimiter_character(ctx, current_character, character_bytes) ||
-        (current - chunk_start) >= grn_mecab_chunk_size_threshold) {
+    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
       grn_bool succeeded;
-      succeeded = chunked_tokenize_utf8_chunk(ctx,
-                                              tokenizer,
-                                              chunk_start,
-                                              current - chunk_start);
+      if (last_delimiter) {
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                last_delimiter - chunk_start);
+        chunk_start = last_delimiter;
+      } else {
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                current - chunk_start);
+        chunk_start = current;
+      }
       if (!succeeded) {
         return succeeded;
       }
-      chunk_start = current;
+      last_delimiter = NULL;
     }
   }
 

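  With this change a delimiter no longer forces a chunk boundary by
  itself: the chunk keeps growing until it reaches
  grn_mecab_chunk_size_threshold, and only then is it flushed, split
  after the last delimiter when one was seen inside the chunk. Note the
  two new last_delimiter = NULL resets above: one when whitespace
  restarts the chunk, one after each flush. Assuming the default
  threshold is larger than the test string below, all of
  '日本。エンジン。エンジン' now reaches MeCab as a single chunk, whereas
  the old code flushed a separate chunk at every '。'.
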
  Added: test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected    2015-03-01 18:52:20 +0900 (945dd0b)
@@ -0,0 +1,30 @@
+tokenize TokenMecab '日本。エンジン。エンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0
+    },
+    {
+      "value": "。",
+      "position": 1
+    },
+    {
+      "value": "エンジン",
+      "position": 2
+    },
+    {
+      "value": "。",
+      "position": 3
+    },
+    {
+      "value": "エンジン",
+      "position": 4
+    }
+  ]
+]
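
  In the expected output, the leading [0, 0.0, 0.0] triple is Groonga's
  standard response header (return code, start time, elapsed time; both
  times are normalized to 0.0 in the test fixtures), and the token list
  shows that TokenMecab reports each '。' delimiter as a token of its
  own, advancing the position counter.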

  Added: test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test    2015-03-01 18:52:20 +0900 (34f5fe0)
@@ -0,0 +1,3 @@
+#@on-error omit
+tokenize TokenMecab '日本。エンジン。エンジン'
+#@on-error default
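
  The #@on-error omit / #@on-error default pair wraps the tokenize call
  so that the test is skipped rather than failed when the command errors,
  presumably to cover builds where the MeCab tokenizer plugin is not
  available.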