[Groonga-commit] pgroonga/pgroonga at 72a261b [master] Support multibyte column name in UTF-8

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Feb 12 00:13:22 JST 2016


Kouhei Sutou	2016-02-12 00:13:22 +0900 (Fri, 12 Feb 2016)

  New Revision: 72a261b0bb0722a9fe3af57751f496522486e9e8
  https://github.com/pgroonga/pgroonga/commit/72a261b0bb0722a9fe3af57751f496522486e9e8

  Message:
    Support multibyte column name in UTF-8

  Added files:
    expected/column-name/japanese.out
    sql/column-name/japanese.sql
    src/pgrn_column_name.c
    src/pgrn_column_name.h
  Modified files:
    Makefile
    src/pgrn_create.c
    src/pgrn_groonga.c
    src/pgroonga.c

  Modified: Makefile (+1 -0)
===================================================================
--- Makefile    2016-02-09 17:29:55 +0900 (2fea1cf)
+++ Makefile    2016-02-12 00:13:22 +0900 (766b743)
@@ -4,6 +4,7 @@ GROONGA_PKG = "groonga >= $(REQUIRED_GROONGA_VERSION)"
 MODULE_big = pgroonga
 SRCS =						\
 	src/pgroonga.c				\
+	src/pgrn_column_name.c			\
 	src/pgrn_convert.c			\
 	src/pgrn_create.c			\
 	src/pgrn_global.c			\

  Added: expected/column-name/japanese.out (+20 -0) 100644
===================================================================
--- /dev/null
+++ expected/column-name/japanese.out    2016-02-12 00:13:22 +0900 (1493323)
@@ -0,0 +1,20 @@
+CREATE TABLE メモ (
+  id integer,
+  コンテンツ text
+);
+INSERT INTO メモ VALUES (1, 'PostgreSQLはRDBMSです。');
+INSERT INTO メモ VALUES (2, 'Groongaは高速な全文検索エンジンです。');
+INSERT INTO メモ VALUES (3, 'PGroongaはGroongaを使うPostgreSQLの拡張機能です。');
+CREATE INDEX 全文検索索引 ON メモ USING pgroonga (コンテンツ);
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+SELECT id, コンテンツ
+  FROM メモ
+ WHERE コンテンツ %% '全文検索';
+ id |              コンテンツ               
+----+---------------------------------------
+  2 | Groongaは高速な全文検索エンジンです。
+(1 row)
+
+DROP TABLE メモ;

  Added: sql/column-name/japanese.sql (+20 -0) 100644
===================================================================
--- /dev/null
+++ sql/column-name/japanese.sql    2016-02-12 00:13:22 +0900 (f522fb8)
@@ -0,0 +1,20 @@
+CREATE TABLE メモ (
+  id integer,
+  コンテンツ text
+);
+
+INSERT INTO メモ VALUES (1, 'PostgreSQLはRDBMSです。');
+INSERT INTO メモ VALUES (2, 'Groongaは高速な全文検索エンジンです。');
+INSERT INTO メモ VALUES (3, 'PGroongaはGroongaを使うPostgreSQLの拡張機能です。');
+
+CREATE INDEX 全文検索索引 ON メモ USING pgroonga (コンテンツ);
+
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+
+SELECT id, コンテンツ
+  FROM メモ
+ WHERE コンテンツ %% '全文検索';
+
+DROP TABLE メモ;

  Added: src/pgrn_column_name.c (+135 -0) 100644
===================================================================
--- /dev/null
+++ src/pgrn_column_name.c    2016-02-12 00:13:22 +0900 (07c6c70)
@@ -0,0 +1,135 @@
+#include "pgroonga.h"
+
+#include "pgrn_column_name.h"
+
+#include <groonga.h>
+
+#include <postgres.h>
+#include <mb/pg_wchar.h>
+
+static const char *ENCODED_CHARACTER_FORMAT = "@%05x";
+static const int ENCODED_CHARACTER_LENGTH = 5;
+
+static bool
+PGrnColumnNameIsUsableCharacterASCII(char character)
+{
+	return (character == '_' ||
+			('0' <= character && character <= '9') ||
+			('A' <= character && character <= 'Z') ||
+			('a' <= character && character <= 'z'));
+}
+
+static void
+PGrnColumnNameEncodeCharacterUTF8(const char *utf8Character, char *encodedName)
+{
+	pg_wchar codepoint;
+	codepoint = utf8_to_unicode((const unsigned char *)utf8Character);
+	snprintf(encodedName,
+			 ENCODED_CHARACTER_LENGTH + 1,
+			 ENCODED_CHARACTER_FORMAT,
+			 codepoint);
+}
+
+static void
+checkSize(size_t size)
+{
+	if (size >= GRN_TABLE_MAX_KEY_SIZE)
+		ereport(ERROR,
+				(errcode(ERRCODE_NAME_TOO_LONG),
+				 errmsg("pgroonga: encoded column name >= %d",
+						GRN_TABLE_MAX_KEY_SIZE)));
+}
+
+static size_t
+PGrnColumnNameEncodeUTF8(const char *name, char *encodedName)
+{
+	const char *current;
+	char *encodedCurrent;
+	size_t encodedNameSize = 0;
+
+	current = name;
+	encodedCurrent = encodedName;
+	while (*current != '\0')
+	{
+		int length;
+
+		length = pg_mblen(current);
+
+		if (length == 1 &&
+			PGrnColumnNameIsUsableCharacterASCII(*current) &&
+			!(*current == '_' && current == name))
+		{
+			checkSize(encodedNameSize + length + 1);
+			*encodedCurrent++ = *current;
+			encodedNameSize++;
+		}
+		else
+		{
+			checkSize(encodedNameSize + ENCODED_CHARACTER_LENGTH + 1);
+			PGrnColumnNameEncodeCharacterUTF8(current, encodedCurrent);
+			encodedCurrent += ENCODED_CHARACTER_LENGTH;
+			encodedNameSize += ENCODED_CHARACTER_LENGTH;
+		}
+
+		current += length;
+	}
+
+	*encodedCurrent = '\0';
+
+	return encodedNameSize;
+}
+
+size_t
+PGrnColumnNameEncode(const char *name, char *encodedName)
+{
+	const char *current;
+	char *encodedCurrent;
+	size_t encodedNameSize = 0;
+
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return PGrnColumnNameEncodeUTF8(name, encodedName);
+
+	current = name;
+	encodedCurrent = encodedName;
+	while (*current != '\0')
+	{
+		int length;
+
+		length = pg_mblen(current);
+		if (length != 1)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("pgroonga: multibyte character isn't supported "
+							"for column name except UTF-8 encoding: <%s>(%s)",
+							name,
+							GetDatabaseEncodingName())));
+
+		if (PGrnColumnNameIsUsableCharacterASCII(*current) &&
+			!(*current == '_' && current == name))
+		{
+			checkSize(encodedNameSize + length + 1);
+			*encodedCurrent++ = *current;
+			encodedNameSize++;
+		}
+		else
+		{
+			checkSize(encodedNameSize + ENCODED_CHARACTER_LENGTH + 1);
+			PGrnColumnNameEncodeCharacterUTF8(current, encodedCurrent);
+			encodedCurrent += ENCODED_CHARACTER_LENGTH;
+			encodedNameSize += ENCODED_CHARACTER_LENGTH;
+		}
+
+		current++;
+	}
+
+	*encodedCurrent = '\0';
+
+	return encodedNameSize;
+}
+
+size_t
+PGrnColumnNameDecode(const char *encodedName, char *name)
+{
+	/* TODO */
+	return 0;
+}

  Added: src/pgrn_column_name.h (+4 -0) 100644
===================================================================
--- /dev/null
+++ src/pgrn_column_name.h    2016-02-12 00:13:22 +0900 (3001024)
@@ -0,0 +1,4 @@
+#pragma once
+
+size_t PGrnColumnNameEncode(const char *name, char *encodedName);
+size_t PGrnColumnNameDecode(const char *encodedName, char *name);

  Modified: src/pgrn_create.c (+10 -4)
===================================================================
--- src/pgrn_create.c    2016-02-09 17:29:55 +0900 (b84a796)
+++ src/pgrn_create.c    2016-02-12 00:13:22 +0900 (1de8de3)
@@ -1,5 +1,6 @@
 #include "pgroonga.h"
 
+#include "pgrn_column_name.h"
 #include "pgrn_create.h"
 #include "pgrn_global.h"
 #include "pgrn_groonga.h"
@@ -57,10 +58,15 @@ PGrnCreateDataColumn(PGrnCreateData *data)
 		}
 	}
 
-	PGrnCreateColumn(data->sourcesTable,
-					 data->desc->attrs[data->i]->attname.data,
-					 flags,
-					 grn_ctx_at(ctx, data->attributeTypeID));
+	{
+		char columnName[GRN_TABLE_MAX_KEY_SIZE];
+		PGrnColumnNameEncode(data->desc->attrs[data->i]->attname.data,
+							 columnName);
+		PGrnCreateColumn(data->sourcesTable,
+						 columnName,
+						 flags,
+						 grn_ctx_at(ctx, data->attributeTypeID));
+	}
 }
 
 void

  Modified: src/pgrn_groonga.c (+5 -1)
===================================================================
--- src/pgrn_groonga.c    2016-02-09 17:29:55 +0900 (c5d7788)
+++ src/pgrn_groonga.c    2016-02-12 00:13:22 +0900 (cc56e90)
@@ -1,5 +1,6 @@
 #include "pgroonga.h"
 
+#include "pgrn_column_name.h"
 #include "pgrn_global.h"
 #include "pgrn_groonga.h"
 
@@ -80,9 +81,12 @@ PGrnLookup(const char *name, int errorLevel)
 grn_obj *
 PGrnLookupColumn(grn_obj *table, const char *name, int errorLevel)
 {
+	char columnName[GRN_TABLE_MAX_KEY_SIZE];
+	size_t columnNameSize;
 	grn_obj *column;
 
-	column = grn_obj_column(ctx, table, name, strlen(name));
+	columnNameSize = PGrnColumnNameEncode(name, columnName);
+	column = grn_obj_column(ctx, table, columnName, columnNameSize);
 	if (!column)
 	{
 		char tableName[GRN_TABLE_MAX_KEY_SIZE];

  Modified: src/pgroonga.c (+2 -2)
===================================================================
--- src/pgroonga.c    2016-02-09 17:29:55 +0900 (67d19a1)
+++ src/pgroonga.c    2016-02-12 00:13:22 +0900 (e754c77)
@@ -1610,8 +1610,8 @@ PGrnInsert(Relation index,
 		if (isnull[i])
 			continue;
 
-		dataColumn = grn_obj_column(ctx, sourcesTable,
-									name->data, strlen(name->data));
+		dataColumn = PGrnLookupColumn(sourcesTable, name->data, ERROR);
+
 		if (PGrnAttributeIsJSONB(attribute->atttypid))
 		{
 			PGrnJSONBInsert(index, values, i, &(buffers->general));
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index