Susumu Yata
null+****@clear*****
Thu Jan 11 10:02:43 JST 2018
Susumu Yata 2018-01-11 10:02:43 +0900 (Thu, 11 Jan 2018) New Revision: 764b66ecb36ebbca54ac2ef61d3c8b1f63d67dd9 https://github.com/groonga/groonga/commit/764b66ecb36ebbca54ac2ef61d3c8b1f63d67dd9 Merged 1decf0a: Merge pull request #808 from groonga/support-surrogate-pairs Message: load: support surrogate pairs Added files: test/command/suite/load/surrogate_pair/emoji.expected test/command/suite/load/surrogate_pair/emoji.test test/command/suite/load/surrogate_pair/normalize.expected test/command/suite/load/surrogate_pair/normalize.test test/command/suite/load/surrogate_pair/raw.expected test/command/suite/load/surrogate_pair/raw.test Modified files: lib/grn_ctx_impl.h lib/load.c Modified: lib/grn_ctx_impl.h (+1 -0) =================================================================== --- lib/grn_ctx_impl.h 2018-01-11 09:34:34 +0900 (ebc691e7b) +++ lib/grn_ctx_impl.h 2018-01-11 10:02:43 +0900 (ff2e79d92) @@ -85,6 +85,7 @@ typedef struct { grn_obj *ifexists; grn_obj *each; uint32_t unichar; + uint32_t unichar_hi; uint32_t values_size; uint32_t nrecords; uint32_t n_record_errors; Modified: lib/load.c (+16 -2) =================================================================== --- lib/load.c 2018-01-11 09:34:34 +0900 (4f90dc9c3) +++ lib/load.c 2018-01-11 10:02:43 +0900 (69a85eaf5) @@ -1003,13 +1003,27 @@ json_read(grn_ctx *ctx, grn_loader *loader, const char *str, unsigned int str_le } { uint32_t u = loader->unichar; + if (u >= 0xd800 && u <= 0xdbff) { /* High-surrogate code points */ + loader->unichar_hi = u; + loader->stat = GRN_LOADER_STRING; + str++; + break; + } + if (u >= 0xdc00 && u <= 0xdfff) { /* Low-surrogate code points */ + u = 0x10000 + (loader->unichar_hi - 0xd800) * 0x400 + u - 0xdc00; + } if (u < 0x80) { GRN_TEXT_PUTC(ctx, loader->last, u); } else { if (u < 0x800) { - GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x1f) | 0xc0); + GRN_TEXT_PUTC(ctx, loader->last, (u >> 6) | 0xc0); } else { - GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0); + if (u < 
0x10000) { + GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0); + } else { + GRN_TEXT_PUTC(ctx, loader->last, (u >> 18) | 0xf0); + GRN_TEXT_PUTC(ctx, loader->last, ((u >> 12) & 0x3f) | 0x80); + } GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x3f) | 0x80); } GRN_TEXT_PUTC(ctx, loader->last, (u & 0x3f) | 0x80); Added: test/command/suite/load/surrogate_pair/emoji.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/emoji.expected 2018-01-11 10:02:43 +0900 (88b759e7c) @@ -0,0 +1,38 @@ +table_create Characters TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +column_create Characters unicode COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +load --table Characters +[ +{"_key": "\uD83C\uDF7A", "unicode": "U+1F37A BEER MUG"} +] +[[0,0.0,0.0],1] +select Characters --output_columns _key,unicode +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "_key", + "ShortText" + ], + [ + "unicode", + "ShortText" + ] + ], + [ + "🍺", + "U+1F37A BEER MUG" + ] + ] + ] +] Added: test/command/suite/load/surrogate_pair/emoji.test (+9 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/emoji.test 2018-01-11 10:02:43 +0900 (9389618b3) @@ -0,0 +1,9 @@ +table_create Characters TABLE_HASH_KEY ShortText +column_create Characters unicode COLUMN_SCALAR ShortText + +load --table Characters +[ +{"_key": "\uD83C\uDF7A", "unicode": "U+1F37A BEER MUG"} +] + +select Characters --output_columns _key,unicode Added: test/command/suite/load/surrogate_pair/normalize.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/normalize.expected 2018-01-11 10:02:43 +0900 (6c5601d43) @@ -0,0 +1,38 @@ +table_create Characters TABLE_HASH_KEY|KEY_NORMALIZE ShortText +[[0,0.0,0.0],true] +column_create Characters unicode COLUMN_SCALAR ShortText 
+[[0,0.0,0.0],true] +load --table Characters +[ +{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"} +] +[[0,0.0,0.0],1] +select Characters --filter '_key == "A"' --output_columns _key,unicode +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "_key", + "ShortText" + ], + [ + "unicode", + "ShortText" + ] + ], + [ + "a", + "U+1D400 MATHEMATICAL BOLD CAPITAL A" + ] + ] + ] +] Added: test/command/suite/load/surrogate_pair/normalize.test (+9 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/normalize.test 2018-01-11 10:02:43 +0900 (83291cb47) @@ -0,0 +1,9 @@ +table_create Characters TABLE_HASH_KEY|KEY_NORMALIZE ShortText +column_create Characters unicode COLUMN_SCALAR ShortText + +load --table Characters +[ +{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"} +] + +select Characters --filter '_key == "A"' --output_columns _key,unicode Added: test/command/suite/load/surrogate_pair/raw.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/raw.expected 2018-01-11 10:02:43 +0900 (c107e2423) @@ -0,0 +1,38 @@ +table_create Characters TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +column_create Characters unicode COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +load --table Characters +[ +{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"} +] +[[0,0.0,0.0],1] +select Characters --output_columns _key,unicode +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "_key", + "ShortText" + ], + [ + "unicode", + "ShortText" + ] + ], + [ + "𝐀", + "U+1D400 MATHEMATICAL BOLD CAPITAL A" + ] + ] + ] +] Added: test/command/suite/load/surrogate_pair/raw.test (+9 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/surrogate_pair/raw.test 2018-01-11 10:02:43 +0900 
(06b754055) @@ -0,0 +1,9 @@ +table_create Characters TABLE_HASH_KEY ShortText +column_create Characters unicode COLUMN_SCALAR ShortText + +load --table Characters +[ +{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"} +] + +select Characters --output_columns _key,unicode -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180111/0dc6c4d7/attachment-0003.htm