[Groonga-commit] groonga/groonga [master] TokenNgram family: ignore tokenizer delimiter (U+FFFE)

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Nov 9 14:13:49 JST 2012


Kouhei Sutou	2012-11-09 14:13:49 +0900 (Fri, 09 Nov 2012)

  New Revision: 812749828a970e9a8fa4168f638a9e2341015260
  https://github.com/groonga/groonga/commit/812749828a970e9a8fa4168f638a9e2341015260

  Log:
    TokenNgram family: ignore tokenizer delimiter (U+FFFE)

  Added files:
    test/command/suite/table_create/default_tokenizer/bigram/default.expected
    test/command/suite/table_create/default_tokenizer/bigram/default.test
    test/command/suite/table_create/default_tokenizer/bigram/normalize.expected
    test/command/suite/table_create/default_tokenizer/bigram/normalize.test
    test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected
    test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test
    test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected
    test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test
  Modified files:
    lib/token.c

  Modified: lib/token.c (+4 -1)
===================================================================
--- lib/token.c    2012-11-09 13:56:28 +0900 (9ec0769)
+++ lib/token.c    2012-11-09 14:13:49 +0900 (3579c95)
@@ -212,7 +212,10 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
 {
   grn_obj *str;
   grn_obj *normalizer = NULL;
-  int nflags = GRN_STRING_REMOVE_BLANK|GRN_STRING_WITH_TYPES;
+  int nflags =
+    GRN_STRING_REMOVE_BLANK |
+    GRN_STRING_WITH_TYPES |
+    GRN_STRING_REMOVE_TOKENIZER_DELIMITER;
   const char *normalized;
   unsigned int normalized_length_in_bytes;
   grn_ngram_tokenizer *token;

  Added: test/command/suite/table_create/default_tokenizer/bigram/default.expected (+95 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/default.expected    2012-11-09 14:13:49 +0900 (e5ce2c0)
@@ -0,0 +1,95 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+[[0,0.0,0.0],2]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        20
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        " a"
+      ],
+      [
+        " i"
+      ],
+      [
+        " p"
+      ],
+      [
+        "."
+      ],
+      [
+        "Th"
+      ],
+      [
+        "a "
+      ],
+      [
+        "en"
+      ],
+      [
+        "hi"
+      ],
+      [
+        "is"
+      ],
+      [
+        "n."
+      ],
+      [
+        "pe"
+      ],
+      [
+        "s "
+      ],
+      [
+        "。"
+      ],
+      [
+        "これ"
+      ],
+      [
+        "す。"
+      ],
+      [
+        "です"
+      ],
+      [
+        "はペ"
+      ],
+      [
+        "れは"
+      ],
+      [
+        "ペン"
+      ],
+      [
+        "ンで"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/bigram/default.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/default.test    2012-11-09 14:13:49 +0900 (b04772a)
@@ -0,0 +1,14 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram
+column_create Terms memos_content COLUMN_INDEX Memos content
+
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+
+select Terms --output_columns _key --limit -1

  Added: test/command/suite/table_create/default_tokenizer/bigram/normalize.expected (+74 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/normalize.expected    2012-11-09 14:13:49 +0900 (6cc6df6)
@@ -0,0 +1,74 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText   --default_tokenizer TokenBigram
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+[[0,0.0,0.0],2]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        13
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "."
+      ],
+      [
+        "a"
+      ],
+      [
+        "is"
+      ],
+      [
+        "pen"
+      ],
+      [
+        "this"
+      ],
+      [
+        "。"
+      ],
+      [
+        "これ"
+      ],
+      [
+        "す"
+      ],
+      [
+        "です"
+      ],
+      [
+        "はペ"
+      ],
+      [
+        "れは"
+      ],
+      [
+        "ペン"
+      ],
+      [
+        "ンで"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/bigram/normalize.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/normalize.test    2012-11-09 14:13:49 +0900 (af6adc7)
@@ -0,0 +1,14 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText \
+  --default_tokenizer TokenBigram
+column_create Terms memos_content COLUMN_INDEX Memos content
+
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+
+select Terms --output_columns _key --limit -1

  Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected (+95 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.expected    2012-11-09 14:13:49 +0900 (7f373d3)
@@ -0,0 +1,95 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Th\uFFFEis is a p\uFFFEen."},
+{"content": "これは\uFFFEペン\uFFFEです。"}
+]
+[[0,0.0,0.0],2]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        20
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        " a"
+      ],
+      [
+        " i"
+      ],
+      [
+        " p"
+      ],
+      [
+        "."
+      ],
+      [
+        "Th"
+      ],
+      [
+        "a "
+      ],
+      [
+        "en"
+      ],
+      [
+        "hi"
+      ],
+      [
+        "is"
+      ],
+      [
+        "n."
+      ],
+      [
+        "pe"
+      ],
+      [
+        "s "
+      ],
+      [
+        "。"
+      ],
+      [
+        "これ"
+      ],
+      [
+        "す。"
+      ],
+      [
+        "です"
+      ],
+      [
+        "はペ"
+      ],
+      [
+        "れは"
+      ],
+      [
+        "ペン"
+      ],
+      [
+        "ンで"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/default.test    2012-11-09 14:13:49 +0900 (98bc30b)
@@ -0,0 +1,14 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram
+column_create Terms memos_content COLUMN_INDEX Memos content
+
+load --table Memos
+[
+{"content": "Th\uFFFEis is a p\uFFFEen."},
+{"content": "これは\uFFFEペン\uFFFEです。"}
+]
+
+select Terms --output_columns _key --limit -1

  Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected (+74 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.expected    2012-11-09 14:13:49 +0900 (c321869)
@@ -0,0 +1,74 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText   --default_tokenizer TokenBigram
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Th\uFFFEis is a p\uFFFEen."},
+{"content": "これは\uFFFEペン\uFFFEです。"}
+]
+[[0,0.0,0.0],2]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        13
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "."
+      ],
+      [
+        "a"
+      ],
+      [
+        "is"
+      ],
+      [
+        "pen"
+      ],
+      [
+        "this"
+      ],
+      [
+        "。"
+      ],
+      [
+        "これ"
+      ],
+      [
+        "す"
+      ],
+      [
+        "です"
+      ],
+      [
+        "はペ"
+      ],
+      [
+        "れは"
+      ],
+      [
+        "ペン"
+      ],
+      [
+        "ンで"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/bigram/tokenizer_delimiter/normalize.test    2012-11-09 14:13:49 +0900 (e2ae5b4)
@@ -0,0 +1,14 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY|KEY_NORMALIZE ShortText \
+  --default_tokenizer TokenBigram
+column_create Terms memos_content COLUMN_INDEX Memos content
+
+load --table Memos
+[
+{"content": "Th\uFFFEis is a p\uFFFEen."},
+{"content": "これは\uFFFEペン\uFFFEです。"}
+]
+
+select Terms --output_columns _key --limit -1
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index