[Groonga-commit] groonga/groonga [master] grn_ii_buffer: support GRN_OBJ_WITH_SECTION.

Back to archive index

null+****@clear***** null+****@clear*****
2012年 2月 20日 (月) 20:53:10 JST


Daijiro MORI	2012-02-20 20:53:10 +0900 (Mon, 20 Feb 2012)

  New Revision: 2c3ecbcf7eae08a84be48f35e1431ab0b0686e57

  Log:
    grn_ii_buffer: support GRN_OBJ_WITH_SECTION.

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+189 -51)
===================================================================
--- lib/ii.c    2012-02-17 17:05:22 +0900 (bc88833)
+++ lib/ii.c    2012-02-20 20:53:10 +0900 (7ecadd3)
@@ -6338,6 +6338,7 @@ grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf)
 /********************** buffered index builder ***********************/
 
 const grn_id II_BUFFER_RID_FLAG = 0x80000000;
+const grn_id II_BUFFER_WEIGHT_FLAG = 0x40000000;
 #ifdef II_BUFFER_ORDER_BY_ID
 const int II_BUFFER_ORDER = GRN_CURSOR_BY_ID;
 #else /* II_BUFFER_ORDER_BY_ID */
@@ -6356,11 +6357,12 @@ typedef struct {
   grn_id last_rid;
   uint32_t last_sid;
   uint32_t last_tf;
+  uint32_t last_weight;
   uint32_t last_pos;
   uint32_t offset_rid;
   uint32_t offset_sid;
-  uint32_t offset_weight;
   uint32_t offset_tf;
+  uint32_t offset_weight;
   uint32_t offset_pos;
 } ii_buffer_counter;
 
@@ -6412,7 +6414,7 @@ block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
   if (!(ii_buffer->nblocks & 0x3ff)) {
     ii_buffer_block *blocks;
     if (!(blocks = GRN_REALLOC(ii_buffer->blocks,
-                         (ii_buffer->nblocks + 0x400) *
+                               (ii_buffer->nblocks + 0x400) *
                                sizeof(ii_buffer_block)))) {
       return NULL;
     }
@@ -6430,6 +6432,7 @@ static uint8_t *
 allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
 {
   size_t bufsize = 0, bufsize_ = 0;
+  uint32_t flags = ii_buffer->ii->header->flags;
   ii_buffer_counter *counter = ii_buffer->counters;
   grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon);
   for (tid = 1; tid <= tid_max; counter++, tid++) {
@@ -6440,8 +6443,16 @@ allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
     bufsize += GRN_B_ENC_SIZE(counter->nrecs);
     bufsize += GRN_B_ENC_SIZE(counter->nposts);
     bufsize += counter->offset_rid;
+    if ((flags & GRN_OBJ_WITH_SECTION)) {
+      bufsize += counter->offset_sid;
+    }
     bufsize += counter->offset_tf;
-    bufsize += counter->offset_pos;
+    if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+      bufsize += counter->offset_weight;
+    }
+    if ((flags & GRN_OBJ_WITH_POSITION)) {
+      bufsize += counter->offset_pos;
+    }
     if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) {
       bufsize += sizeof(uint32_t);
       bufsize_ = bufsize;
@@ -6461,6 +6472,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
   uint8_t *outbufp_ = outbuf;
   grn_table_cursor  *tc;
   uint8_t *pnext = (uint8_t *)&block->nextsize;
+  uint32_t flags = ii_buffer->ii->header->flags;
   tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon,
                              NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER);
   while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
@@ -6471,17 +6483,29 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
     ii_buffer_counter *counter = &ii_buffer->counters[tid - 1];
     if (counter->nrecs) {
       uint32_t offset_rid = counter->offset_rid;
+      uint32_t offset_sid = counter->offset_sid;
       uint32_t offset_tf = counter->offset_tf;
+      uint32_t offset_weight = counter->offset_weight;
       uint32_t offset_pos = counter->offset_pos;
       GRN_B_ENC(gtid, outbufp);
       GRN_B_ENC(counter->nrecs, outbufp);
       GRN_B_ENC(counter->nposts, outbufp);
       counter->offset_rid = outbufp - outbuf;
       outbufp += offset_rid;
+      if ((flags & GRN_OBJ_WITH_SECTION)) {
+        counter->offset_sid = outbufp - outbuf;
+        outbufp += offset_sid;
+      }
       counter->offset_tf = outbufp - outbuf;
       outbufp += offset_tf;
-      counter->offset_pos = outbufp - outbuf;
-      outbufp += offset_pos;
+      if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+        counter->offset_weight = outbufp - outbuf;
+        outbufp += offset_weight;
+      }
+      if ((flags & GRN_OBJ_WITH_POSITION)) {
+        counter->offset_pos = outbufp - outbuf;
+        outbufp += offset_pos;
+      }
     }
     if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) {
       uint32_t size = outbufp - outbufp_ + sizeof(uint32_t);
@@ -6503,40 +6527,64 @@ static void
 encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
 {
   grn_id rid = 0;
+  unsigned int sid = 1;
+  unsigned int weight = 0;
   uint32_t pos = 0;
   uint32_t rest;
   grn_id *bp = ii_buffer->block_buf;
+  uint32_t flags = ii_buffer->ii->header->flags;
   for (rest = ii_buffer->block_pos; rest; bp++, rest--) {
     grn_id id = *bp;
     if (id & II_BUFFER_RID_FLAG) {
       rid = id - II_BUFFER_RID_FLAG;
+      if ((flags & GRN_OBJ_WITH_SECTION)) {
+        sid = *++bp;
+      }
+      weight = 0;
       pos = 0;
+    } else if (id & II_BUFFER_WEIGHT_FLAG) {
+      weight = id - II_BUFFER_WEIGHT_FLAG;
     } else {
       ii_buffer_counter *counter = &ii_buffer->counters[id - 1];
-      if (counter->last_rid == rid) {
+      if (counter->last_rid == rid && counter->last_sid == sid) {
         counter->last_tf++;
+        counter->last_weight += weight;
       } else {
         if (counter->last_tf) {
           uint8_t *p = outbuf + counter->offset_tf;
           GRN_B_ENC(counter->last_tf - 1, p);
           counter->offset_tf = p - outbuf;
+          if (flags & GRN_OBJ_WITH_WEIGHT) {
+            p = outbuf + counter->offset_weight;
+            GRN_B_ENC(counter->last_weight, p);
+            counter->offset_weight = p - outbuf;
+          }
         }
         {
           uint8_t *p = outbuf + counter->offset_rid;
           GRN_B_ENC(rid - counter->last_rid, p);
           counter->offset_rid = p - outbuf;
         }
+        if (flags & GRN_OBJ_WITH_SECTION) {
+          uint8_t *p = outbuf + counter->offset_sid;
+          if (counter->last_rid != rid) {
+            GRN_B_ENC(sid - 1, p);
+          } else {
+            GRN_B_ENC(sid - counter->last_sid - 1, p);
+          }
+          counter->offset_sid = p - outbuf;
+        }
         counter->last_rid = rid;
-        counter->last_sid = 0;
+        counter->last_sid = sid;
         counter->last_tf = 1;
         counter->last_pos = 0;
       }
-      {
+      if (flags & GRN_OBJ_WITH_POSITION) {
         uint8_t *p = outbuf + counter->offset_pos;
         GRN_B_ENC(pos - counter->last_pos, p);
         counter->offset_pos = p - outbuf;
+        counter->last_pos = pos;
       }
-      counter->last_pos = pos;
       pos++;
     }
   }
@@ -6551,6 +6599,12 @@ encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
     uint8_t *p = outbuf + counter->offset_tf;
     GRN_B_ENC(counter->last_tf - 1, p);
   }
+  if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
+    for (tid = 1; tid <= tid_max; counter++, tid++) {
+      uint8_t *p = outbuf + counter->offset_weight;
+      GRN_B_ENC(counter->last_weight, p);
+    }
+  }
 }
 
 static void
@@ -6633,27 +6687,34 @@ get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
 }
 
 static void
-grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
-                       grn_id rid, unsigned int section, grn_obj *value)
+grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
+                       unsigned int sid, unsigned int weight, grn_obj *value)
 {
   uint32_t value_len = GRN_TEXT_LEN(value);
   if (value_len) {
     grn_obj *tmp_lexicon;
-    if (ii_buffer->block_buf_size < ii_buffer->block_pos + value_len) {
+    uint32_t est_len = value_len + 2;
+    if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) {
       grn_ii_buffer_flush(ctx, ii_buffer);
     }
-    if (ii_buffer->block_buf_size < value_len) {
+    if (ii_buffer->block_buf_size < est_len) {
       grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf,
-                                                value_len * sizeof(grn_id));
+                                                est_len * sizeof(grn_id));
       if (!block_buf) { return; }
       ii_buffer->block_buf = block_buf;
-      ii_buffer->block_buf_size = value_len;
+      ii_buffer->block_buf_size = est_len;
     }
     if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) {
       grn_token *token;
       grn_id *buffer = ii_buffer->block_buf;
       uint32_t block_pos = ii_buffer->block_pos;
       buffer[block_pos++] = rid + II_BUFFER_RID_FLAG;
+      if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_SECTION)) {
+        buffer[block_pos++] = sid;
+      }
+      if (weight) {
+        buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG;
+      }
       if ((token = grn_token_open(ctx, tmp_lexicon, GRN_TEXT_VALUE(value),
                                   value_len, grn_token_add))) {
         uint32_t pos;
@@ -6667,9 +6728,26 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
             if (counter->last_rid != rid) {
               counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid);
               counter->last_rid = rid;
+              counter->offset_sid += GRN_B_ENC_SIZE(sid - 1);
+              counter->last_sid = sid;
               if (counter->last_tf) {
                 counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1);
                 counter->last_tf = 0;
+                counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight);
+                counter->last_weight = 0;
+              }
+              counter->last_pos = 0;
+              counter->nrecs++;
+            } else if (counter->last_sid != sid) {
+              counter->offset_rid += GRN_B_ENC_SIZE(0);
+              counter->offset_sid +=
+                GRN_B_ENC_SIZE(sid - counter->last_sid - 1);
+              counter->last_sid = sid;
+              if (counter->last_tf) {
+                counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1);
+                counter->last_tf = 0;
+                counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight);
+                counter->last_weight = 0;
               }
               counter->last_pos = 0;
               counter->nrecs++;
@@ -6677,6 +6755,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
             counter->offset_pos += GRN_B_ENC_SIZE(pos - counter->last_pos);
             counter->last_pos = pos;
             counter->last_tf++;
+            counter->last_weight += weight;
             counter->nposts++;
           }
         }
@@ -6776,16 +6855,29 @@ merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
     nrecs += block->nrecs;
     nposts += block->nposts;
   }
-  max_size = nrecs * 2 + nposts;
+  max_size = nrecs * (ii_buffer->ii->n_elements - 1) + nposts;
   datavec_reset(ctx, ii_buffer->data_vectors,
                 ii_buffer->ii->n_elements, nrecs, max_size);
   {
-    uint32_t *ridp = ii_buffer->data_vectors[0].data;
-    uint32_t *tfp = ii_buffer->data_vectors[1].data;
-    uint32_t *posp = ii_buffer->data_vectors[2].data;
+    int i;
     uint32_t lr = 0;
     uint64_t spos = 0;
-    int i;
+    uint32_t flags = ii_buffer->ii->header->flags;
+    uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
+    {
+      int j = 0;
+      ridp = ii_buffer->data_vectors[j++].data;
+      if (flags & GRN_OBJ_WITH_SECTION) {
+        sidp = ii_buffer->data_vectors[j++].data;
+      }
+      tfp = ii_buffer->data_vectors[j++].data;
+      if (flags & GRN_OBJ_WITH_WEIGHT) {
+        weightp = ii_buffer->data_vectors[j++].data;
+      }
+      if (flags & GRN_OBJ_WITH_POSITION) {
+        posp = ii_buffer->data_vectors[j++].data;
+      }
+    }
     for (i = 0; i < nhits; i++) {
       ii_buffer_block *block = hits[i];
       uint8_t *p = block->bufcur;
@@ -6799,27 +6891,52 @@ merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
           lr += *ridp++;
         }
       }
+      if ((flags & GRN_OBJ_WITH_SECTION)) {
+        for (n = block->nrecs; n; n--) {
+          GRN_B_DEC(*sidp++, p);
+        }
+      }
       for (n = block->nrecs; n; n--) {
         GRN_B_DEC(*tfp++, p);
       }
-      for (n = block->nposts; n; n--) {
-        GRN_B_DEC(*posp, p);
-        spos += *posp++;
+      if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+        for (n = block->nrecs; n; n--) {
+          GRN_B_DEC(*weightp++, p);
+        }
+      }
+      if ((flags & GRN_OBJ_WITH_POSITION)) {
+        for (n = block->nposts; n; n--) {
+          GRN_B_DEC(*posp, p);
+          spos += *posp++;
+        }
       }
       block->rest -= (p - block->bufcur);
       block->bufcur = p;
       grn_ii_buffer_fetch(ctx, ii_buffer, block);
     }
-    ii_buffer->data_vectors[0].data_size = nrecs;
-    ii_buffer->data_vectors[1].data_size = nrecs;
-    ii_buffer->data_vectors[2].data_size = nposts;
-
-    ii_buffer->data_vectors[0].flags =
-      ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
-    ii_buffer->data_vectors[1].flags =
-      (nrecs < 3) ? 0 : USE_P_ENC;
-    ii_buffer->data_vectors[2].flags =
-      (((nposts < 32) || (nposts <= (spos >> 13))) ? 0 : USE_P_ENC)|ODD;
+    {
+      int j = 0;
+      uint32_t f_s = (nrecs < 3) ? 0 : USE_P_ENC;
+      uint32_t f_d = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
+      ii_buffer->data_vectors[j].data_size = nrecs;
+      ii_buffer->data_vectors[j++].flags = f_d;
+      if ((flags & GRN_OBJ_WITH_SECTION)) {
+        ii_buffer->data_vectors[j].data_size = nrecs;
+        ii_buffer->data_vectors[j++].flags = f_s;
+      }
+      ii_buffer->data_vectors[j].data_size = nrecs;
+      ii_buffer->data_vectors[j++].flags = f_s;
+      if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+        ii_buffer->data_vectors[j].data_size = nrecs;
+        ii_buffer->data_vectors[j++].flags = f_s;
+      }
+      if ((flags & GRN_OBJ_WITH_POSITION)) {
+        uint32_t f_p = (((nposts < 32) ||
+                         (nposts <= (spos >> 13))) ? 0 : USE_P_ENC);
+        ii_buffer->data_vectors[j].data_size = nposts;
+        ii_buffer->data_vectors[j++].flags = f_p|ODD;
+      }
+    }
   }
   return max_size;
 }
@@ -6851,22 +6968,38 @@ grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
 {
   uint32_t *a = array_get(ctx, ii_buffer->ii, tid);
   if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) {
+    grn_id rid;
+    uint32_t sid = 1, tf, pos, weight = 0;
     ii_buffer_block *block = hits[0];
     uint8_t *p = block->bufcur;
-    grn_id rid;
-    uint32_t tf, pos;
+    uint32_t flags = ii_buffer->ii->header->flags;
     GRN_B_DEC(rid, p);
+    if (flags & GRN_OBJ_WITH_SECTION) { GRN_B_DEC(sid, p); }
     GRN_B_DEC(tf, p);
+    if (tf != 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf); }
+    if (flags & GRN_OBJ_WITH_WEIGHT) { GRN_B_DEC(weight, p); }
     GRN_B_DEC(pos, p);
-    block->rest -= (p - block->bufcur);
-    block->bufcur = p;
-    if (tf != 0) {
-      GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf);
+    if (!weight) {
+      if (flags & GRN_OBJ_WITH_SECTION) {
+        if (rid < 0x100000 && sid < 0x800) {
+          a[0] = (rid << 12) + (sid << 1) + 1;
+          a[1] = (flags & GRN_OBJ_WITH_POSITION) ? pos : 0;
+          block->rest -= (p - block->bufcur);
+          block->bufcur = p;
+          grn_ii_buffer_fetch(ctx, ii_buffer, block);
+          return;
+        }
+      } else {
+        a[0] = (rid << 1) + 1;
+        a[1] = (flags & GRN_OBJ_WITH_POSITION) ? pos : 0;
+        block->rest -= (p - block->bufcur);
+        block->bufcur = p;
+        grn_ii_buffer_fetch(ctx, ii_buffer, block);
+        return;
+      }
     }
-    a[0] = (rid << 1) + 1;
-    a[1] = pos;
-    grn_ii_buffer_fetch(ctx, ii_buffer, block);
-  } else {
+  }
+  {
     uint32_t max_size = merge_hit_blocks(ctx, ii_buffer, hits, nhits);
     if (ii_buffer->packed_buf &&
         ii_buffer->packed_buf_size <
@@ -6954,9 +7087,9 @@ grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii)
 
 grn_rc
 grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
-                     grn_id rid, unsigned int section, grn_obj *value)
+                     grn_id rid, unsigned int sid, grn_obj *value)
 {
-  grn_ii_buffer_tokenize(ctx, ii_buffer, rid, section, value);
+  grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, value);
   return ctx->rc;
 }
 
@@ -7047,15 +7180,20 @@ grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
   if ((tc = grn_table_cursor_open(ctx, target,
                                   NULL, 0, NULL, 0, 0, -1,
                                   GRN_CURSOR_BY_ID))) {
-    grn_id id;
+    grn_id rid;
     grn_obj rv;
     GRN_TEXT_INIT(&rv, 0);
-    while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
-      int i;
-      for (i = 0; i < ncols; i++) {
+    while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
+      int sid;
+      grn_obj **col;
+      for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
         GRN_BULK_REWIND(&rv);
-        grn_obj_get_value(ctx, cols[i], id, &rv);
-        grn_ii_buffer_tokenize(ctx, ii_buffer, id, i + 1, &rv);
+        if (GRN_OBJ_TABLEP(*col)) {
+          grn_table_get_key2(ctx, *col, rid, &rv);
+        } else {
+          grn_obj_get_value(ctx, *col, rid, &rv);
+        }
+        grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, &rv);
       }
     }
     GRN_OBJ_FIN(ctx, &rv);




Groonga-commit メーリングリストの案内
Back to archive index