正規表現中の Unicode 文字を case-fold するパッチ (Gauche-devel-jp) - Gauche

大橋といいます。

http://chaton.practical-scheme.net/gauche/a/2012/02/07#entry-4f314ca0-508ff
の、正規表現に i フラグをつけると Unicode 文字にうまく
マッチしないことがある問題のパッチを書いてみました。

上記の場合でうまくマッチしないのは、
regexp-parse で AST を作る部分は Unicode 文字を case-fold して
いるのに、マッチ部分がそれに追いついていなかったからです。

文字集合の case-folding も Unicode 対応にしてみました。

既存のテスト + パッチで追加したテストについて
手許の環境で make check まで通ることを確認しました。

-- 
-------------- next part --------------

diff --git a/src/char.c b/src/char.c
index d1c44f4..1d2b38c 100644
--- a/src/char.c
+++ b/src/char.c
@@ -480,17 +480,31 @@ ScmObj Scm_CharSetComplement(ScmCharSet *cs)
     return SCM_OBJ(cs);
 }
 
-/* Make charset case-insensitive.  For now, we only deal with
-   ASCII range. */
+/* Make charset case-insensitive. */
 ScmObj Scm_CharSetCaseFold(ScmCharSet *cs)
 {
+    ScmCharSet *copy = SCM_CHAR_SET(Scm_CharSetCopy(cs));
+    ScmChar c, uch, lch;
+    ScmTreeIter iter;
+    ScmDictEntry *e;
     int ch;
+
     for (ch='a'; ch<='z'; ch++) {
         if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) {
             MASK_SET(cs, ch);
             MASK_SET(cs, (ch-('a'-'A')));
         }
     }
+
+    Scm_TreeIterInit(&iter, &cs->large, NULL);
+    while ((e = Scm_TreeIterNext(&iter)) != NULL) {
+        for (c = e->key; c <= e->value; c++) {
+            uch = Scm_CharUpcase(c);
+            lch = Scm_CharDowncase(c);
+            Scm_CharSetAddRange(cs, uch, uch);
+            Scm_CharSetAddRange(cs, lch, lch);
+        }
+    }
     return SCM_OBJ(cs);
 }
 
diff --git a/src/regexp.c b/src/regexp.c
index 6730406..3c89819 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2260,21 +2260,17 @@ struct match_ctx {
 
 static int match_ci(const char **input, const unsigned char **code, int length)
 {
-    unsigned char inch, c;
-    int csize, i;
+    ScmChar inch, c;
+    int csize;
     do {
-        inch = *(*input)++;
-        c = *(*code)++;
-        if ((csize = SCM_CHAR_NFOLLOWS(inch)) == 0) {
-            if (c != SCM_CHAR_DOWNCASE(inch)) return FALSE;
-        } else {
-            if (c != inch) return FALSE;
-            for (i=0; i<csize; i++) {
-                if ((unsigned char)*(*code)++ != (unsigned char)*(*input)++)
-                    return FALSE;
-            }
-        }
-        length -= (csize+1);
+        SCM_CHAR_GET(*input, inch);
+        csize = SCM_CHAR_NBYTES(inch);
+        *input += csize;
+        SCM_CHAR_GET(*code, c);
+        *code += SCM_CHAR_NBYTES(c);
+        if (Scm_CharDowncase(inch) != c)
+            return FALSE;
+        length -= csize;
     } while (length > 0);
     return TRUE;
 }
diff --git a/test/utf-8.scm b/test/utf-8.scm
index 14d2388..d7566ae 100644
--- a/test/utf-8.scm
+++ b/test/utf-8.scm
@@ -619,4 +619,87 @@
          (rxmatch-after m) ;; memoizes start and length
          (list (rxmatch-before m) (rxmatch-substring m))))
 
+;;-------------------------------------------------------------------
+(test* "regexp/unicode-ci (aa)" "λ"
+       (cond ((rxmatch #/λ/i "λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (aA)" "Λ"
+       (cond ((rxmatch #/λ/i "Λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (Aa)" "λ"
+       (cond ((rxmatch #/Λ/i "λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (AA)" "Λ"
+       (cond ((rxmatch #/Λ/i "Λ") => rxmatch-substring)
+             (else #f)))
+
+(test* "regexp/unicode-ci (uncase + backref, aa)" "λλ"
+       (cond ((rxmatch #/(λ)(?i:\1)/ "λλ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (uncase + backref, aA)" "λΛ"
+       (cond ((rxmatch #/(λ)(?i:\1)/ "λΛ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (uncase + backref, Aa)" "Λλ"
+       (cond ((rxmatch #/(Λ)(?i:\1)/ "Λλ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (uncase + backref, AA)" "ΛΛ"
+       (cond ((rxmatch #/(Λ)(?i:\1)/ "ΛΛ") => rxmatch-substring)
+             (else #f)))
+
+(test* "regexp/unicode-ci (charset, aa)" "λ"
+       (cond ((rxmatch #/[λ]/i "λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (charset, aA)" "Λ"
+       (cond ((rxmatch #/[λ]/i "Λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (charset, Aa)" "λ"
+       (cond ((rxmatch #/[Λ]/i "λ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci (charset, AA)" "Λ"
+       (cond ((rxmatch #/[Λ]/i "Λ") => rxmatch-substring)
+             (else #f)))
+
+(test* "regexp/unicode-ci" "ΒΓ"
+       (cond ((rxmatch #/βγ/i "ΑΒΓΔ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" "βΓ"
+       (cond ((rxmatch #/Βγ/i "ΑβΓΔ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" "βγ"
+       (cond ((rxmatch #/ΒΓ/i "ΑβγΔ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch #/Βγ/ "ΑβΓΔ") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch #/ΒΓ/ "ΑΒγΔ") => rxmatch-substring)
+             (else #f)))
+
+(test* "regexp/unicode-ci" "ОНА"
+       (cond ((rxmatch #/о[а-я]а/i "ОНА") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" "она"
+       (cond ((rxmatch #/О[А-Я]А/i "она") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" "они"
+       (cond ((rxmatch #/[а-пР-Я][А-Пр-я]./i "они") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch #/о[а-я]а/ "ОНА") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch #/О[А-Я]А/ "она") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch #/[а-пР-Я][А-Пр-я]./ "они") => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" "она"
+       (cond ((rxmatch (string->regexp "о[А-Я]а" :case-fold #t) "она")
+              => rxmatch-substring)
+             (else #f)))
+(test* "regexp/unicode-ci" #f
+       (cond ((rxmatch (string->regexp "о[А-Я]а") "она")
+              => rxmatch-substring)
+             (else #f)))
+
 (test-end)


Gauche

[Gauche-devel-jp] 正規表現中の Unicode 文字を case-fold するパッチ