Kouhei Sutou
null+****@clear*****
Mon Jun 25 15:07:27 JST 2018
Kouhei Sutou	2018-06-25 15:07:27 +0900 (Mon, 25 Jun 2018)

  New Revision: ea48686f12fe1adbed47f4716ca6eb1a4c232f8c
  https://github.com/groonga/groonga/commit/ea48686f12fe1adbed47f4716ca6eb1a4c232f8c

  Message:
    highlighter lexicon: fix a bug where a keyword was not highlighted

    If a keyword is shorter than N (the N-gram length of the lexicon's
    tokenizer), the keyword wasn't highlighted.

  Added files:
    test/command/suite/select/function/highlight_html/lexicon/less_than_n.expected
    test/command/suite/select/function/highlight_html/lexicon/less_than_n.test
  Modified files:
    lib/highlighter.c

  Modified: lib/highlighter.c (+107 -4)
===================================================================
--- lib/highlighter.c    2018-06-25 12:30:22 +0900 (79b7040b0)
+++ lib/highlighter.c    2018-06-25 15:07:27 +0900 (88f359d3e)
@@ -65,6 +65,8 @@ struct _grn_highlighter {
   struct {
     grn_obj *object;
     grn_encoding encoding;
+    grn_obj lazy_keywords;
+    grn_obj lazy_keyword_ids;
     grn_obj *token_id_chunks;
     grn_obj token_id_chunk_ids;
     grn_obj token_id_chunk;
@@ -111,6 +113,10 @@ grn_highlighter_open(grn_ctx *ctx)
 
   highlighter->lexicon.object = NULL;
   highlighter->lexicon.encoding = GRN_ENC_NONE;
+  GRN_TEXT_INIT(&(highlighter->lexicon.lazy_keywords), GRN_OBJ_VECTOR);
+  GRN_RECORD_INIT(&(highlighter->lexicon.lazy_keyword_ids),
+                  GRN_OBJ_VECTOR,
+                  GRN_ID_NIL);
   highlighter->lexicon.token_id_chunks = NULL;
   GRN_RECORD_INIT(&(highlighter->lexicon.token_id_chunk_ids),
                   GRN_OBJ_VECTOR,
@@ -141,6 +147,8 @@ grn_highlighter_close(grn_ctx *ctx,
     grn_obj_close(ctx, highlighter->pat.keywords);
   }
 
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.lazy_keywords));
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.lazy_keyword_ids));
   if (highlighter->lexicon.token_id_chunks) {
     grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
   }
@@ -210,12 +218,17 @@ grn_highlighter_prepare_lexicon(grn_ctx *ctx,
                                 grn_highlighter *highlighter)
 {
   grn_bool have_token_id_chunks = GRN_FALSE;
+  grn_obj *lazy_keywords = &(highlighter->lexicon.lazy_keywords);
+  grn_id lexicon_id;
   grn_obj *token_id_chunk_ids = &(highlighter->lexicon.token_id_chunk_ids);
   size_t i, n_keywords;
   grn_obj *token_id_chunk = &(highlighter->lexicon.token_id_chunk);
 
-  highlighter->lexicon.token_ids.header.domain =
-    grn_obj_id(ctx, highlighter->lexicon.object);
+  GRN_BULK_REWIND(lazy_keywords);
+
+  lexicon_id = grn_obj_id(ctx, highlighter->lexicon.object);
+  highlighter->lexicon.lazy_keyword_ids.header.domain = lexicon_id;
+  highlighter->lexicon.token_ids.header.domain = lexicon_id;
 
   grn_table_get_info(ctx,
                      highlighter->lexicon.object,
@@ -293,6 +306,20 @@ grn_highlighter_prepare_lexicon(grn_ctx *ctx,
     GRN_BULK_REWIND(token_id_chunk);
     while ((token_id = grn_token_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
       GRN_TEXT_PUT(ctx, token_id_chunk, &token_id, sizeof(grn_id));
+      if (cursor->force_prefix) {
+        grn_token *token;
+        const char *data;
+        size_t data_length;
+
+        token = grn_token_cursor_get_token(ctx, cursor);
+        data = grn_token_get_data_raw(ctx, token, &data_length);
+        grn_vector_add_element(ctx,
+                               lazy_keywords,
+                               data,
+                               data_length,
+                               0,
+                               GRN_DB_TEXT);
+      }
     }
     grn_token_cursor_close(ctx, cursor);
 
@@ -418,6 +445,20 @@ grn_highlighter_prepare(grn_ctx *ctx,
 
   highlighter->need_prepared = GRN_FALSE;
 }
 
+static inline grn_bool
+grn_ids_is_included(grn_id *ids, size_t n_ids, grn_id id)
+{
+  size_t i;
+
+  for (i = 0; i < n_ids; i++) {
+    if (ids[i] == id) {
+      return GRN_TRUE;
+    }
+  }
+
+  return GRN_FALSE;
+}
+
 static uint64_t
 grn_highlighter_highlight_lexicon_flush(grn_ctx *ctx,
                                         grn_highlighter *highlighter,
@@ -456,10 +497,12 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
                                  grn_obj *output)
 {
   grn_token_cursor *cursor;
+  grn_obj *lazy_keyword_ids = &(highlighter->lexicon.lazy_keyword_ids);
   grn_obj *token_ids = &(highlighter->lexicon.token_ids);
   grn_obj *token_locations = &(highlighter->lexicon.token_locations);
   grn_obj *candidates = &(highlighter->lexicon.candidates);
 
+  GRN_BULK_REWIND(lazy_keyword_ids);
   GRN_BULK_REWIND(token_ids);
   GRN_BULK_REWIND(token_locations);
   cursor = grn_token_cursor_open(ctx,
@@ -494,8 +537,48 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
   }
   grn_token_cursor_close(ctx, cursor);
 
+  {
+    grn_obj *lexicon = highlighter->lexicon.object;
+    grn_obj *chunks = highlighter->lexicon.token_id_chunks;
+    grn_obj *lazy_keywords = &(highlighter->lexicon.lazy_keywords);
+    size_t i;
+    size_t n_keywords;
+
+    n_keywords = grn_vector_size(ctx, lazy_keywords);
+    for (i = 0; i < n_keywords; i++) {
+      const char *keyword;
+      unsigned int keyword_length;
+
+      keyword_length = grn_vector_get_element(ctx,
+                                              lazy_keywords,
+                                              i,
+                                              &keyword,
+                                              NULL,
+                                              NULL);
+      GRN_TABLE_EACH_BEGIN_MIN(ctx,
+                               lexicon,
+                               cursor,
+                               id,
+                               keyword, keyword_length,
+                               GRN_CURSOR_PREFIX) {
+        void *key;
+        int key_size;
+        int added = 0;
+
+        key_size = grn_table_cursor_get_key(ctx, cursor, &key);
+        grn_table_add(ctx, chunks, &id, sizeof(grn_id), &added);
+        if (added) {
+          GRN_RECORD_PUT(ctx, lazy_keyword_ids, id);
+        }
+      } GRN_TABLE_EACH_END(ctx, cursor);
+    }
+  }
+
   GRN_BULK_REWIND(candidates);
   {
+    grn_obj *lazy_keyword_ids = &(highlighter->lexicon.lazy_keyword_ids);
+    grn_id *raw_lazy_keyword_ids;
+    size_t n_lazy_keyword_ids;
     grn_pat *chunks = (grn_pat *)(highlighter->lexicon.token_id_chunks);
     size_t i;
     size_t n_token_ids = GRN_BULK_VSIZE(token_ids) / sizeof(grn_id);
@@ -503,6 +586,9 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
     grn_highlighter_location *raw_token_locations =
       (grn_highlighter_location *)GRN_BULK_HEAD(token_locations);
 
+    raw_lazy_keyword_ids = (grn_id *)GRN_BULK_HEAD(lazy_keyword_ids);
+    n_lazy_keyword_ids = GRN_BULK_VSIZE(lazy_keyword_ids) / sizeof(grn_id);
+
     for (i = 0; i < n_token_ids; i++) {
       grn_id chunk_id;
 
@@ -521,16 +607,22 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
       }
 
       {
+        grn_id *ids;
         uint32_t key_size;
         size_t j;
         size_t n_ids;
         grn_highlighter_location candidate;
         grn_highlighter_location *first = raw_token_locations + i;
 
-        _grn_pat_key(ctx, chunks, chunk_id, &key_size);
+        ids = (grn_id *)_grn_pat_key(ctx, chunks, chunk_id, &key_size);
         n_ids = key_size / sizeof(grn_id);
         candidate.offset = first->offset;
-        if (first->have_overlap && n_ids > 1) {
+        if (n_ids == 1 &&
+            grn_ids_is_included(raw_lazy_keyword_ids,
+                                n_lazy_keyword_ids,
+                                ids[0])) {
+          candidate.length = first->first_character_length;
+        } else if (first->have_overlap && n_ids > 1) {
           candidate.length = first->first_character_length;
         } else {
           candidate.length = first->length;
@@ -565,6 +657,17 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
   }
 
   {
+    grn_obj *chunks = highlighter->lexicon.token_id_chunks;
+    size_t i, n_ids;
+
+    n_ids = GRN_BULK_VSIZE(lazy_keyword_ids) / sizeof(grn_id);
+    for (i = 0; i < n_ids; i++) {
+      grn_id id = GRN_RECORD_VALUE_AT(lazy_keyword_ids, i);
+      grn_table_delete(ctx, chunks, &id, sizeof(grn_id));
+    }
+  }
+
+  {
     size_t i;
     size_t n_candidates =
       GRN_BULK_VSIZE(candidates) / sizeof(grn_highlighter_location);

  Added: test/command/suite/select/function/highlight_html/lexicon/less_than_n.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/less_than_n.expected    2018-06-25 15:07:27 +0900 (d4ecbf8a1)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("report_source_location", true)' --normalizer 'NormalizerNFKC100'
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "私は私であって私"}
+]
+[[0,0.0,0.0],1]
+select Entries --match_columns body --query '私' --output_columns 'highlight_html(body, Terms)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "<span class=\"keyword\">私</span>は<span class=\"keyword\">私</span>であって<span class=\"keyword\">私</span>"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/lexicon/less_than_n.test (+17 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/less_than_n.test    2018-06-25 15:07:27 +0900 (ba25bd95b)
@@ -0,0 +1,17 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("report_source_location", true)' \
+  --normalizer 'NormalizerNFKC100'
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "私は私であって私"}
+]
+
+select Entries \
+  --match_columns body \
+  --query '私' \
+  --output_columns 'highlight_html(body, Terms)'
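
How the fix works, in short: while preparing the lexicon, any token that comes back from a keyword's token cursor with force_prefix set (the tail of a keyword shorter than the N-gram length) is remembered as a "lazy keyword"; at highlight time each lazy keyword is expanded with a prefix cursor over the lexicon (GRN_CURSOR_PREFIX), every matching term is temporarily registered as a single-token chunk, and those entries are deleted again after highlighting. Below is a rough standalone sketch of that prefix expansion, using a plain string array and strncmp instead of Groonga's PAT table and cursor API; the term list and the function name expand_lazy_keyword() are made up for illustration:

#include <stdio.h>
#include <string.h>

/* Stand-in for the Terms PAT table: a few made-up bigram terms. */
static const char *terms[] = {"って", "であ", "は私", "私で", "私は"};
#define N_TERMS (sizeof(terms) / sizeof(terms[0]))

/* Print every term that starts with `keyword` -- roughly what the
 * GRN_CURSOR_PREFIX scan in the diff does for each lazy keyword. */
static void
expand_lazy_keyword(const char *keyword)
{
  size_t i;
  size_t keyword_length = strlen(keyword);

  for (i = 0; i < N_TERMS; i++) {
    if (strncmp(terms[i], keyword, keyword_length) == 0) {
      printf("register single-token chunk for term: %s\n", terms[i]);
    }
  }
}

int
main(void)
{
  /* "私" is one character, shorter than the bigram length 2, so before
   * this commit it never matched a whole lexicon term on its own. */
  expand_lazy_keyword("私");
  return 0;
}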
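
At highlight time, a chunk that holds exactly one token ID gets only its first character's length as the candidate when that ID is one of the lazily registered ones; the new helper grn_ids_is_included() is the plain linear membership scan backing that check. A minimal standalone version with standard C types instead of grn_id/grn_bool (id_array_contains and the main() driver are illustrative names, not Groonga API):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Linear scan mirroring grn_ids_is_included() from the diff above:
 * true when `id` appears anywhere in ids[0 .. n_ids - 1]. */
static bool
id_array_contains(const uint32_t *ids, size_t n_ids, uint32_t id)
{
  size_t i;

  for (i = 0; i < n_ids; i++) {
    if (ids[i] == id) {
      return true;
    }
  }

  return false;
}

int
main(void)
{
  /* Hypothetical token IDs registered for lazy (shorter-than-N) keywords. */
  const uint32_t lazy_keyword_ids[] = {3, 7, 42};
  const size_t n = sizeof(lazy_keyword_ids) / sizeof(lazy_keyword_ids[0]);

  printf("%d\n", id_array_contains(lazy_keyword_ids, n, 7));  /* prints 1 */
  printf("%d\n", id_array_contains(lazy_keyword_ids, n, 8));  /* prints 0 */
  return 0;
}

A linear scan is presumably acceptable here because only the few query keywords shorter than N end up in the lazy list.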