OOHASHI Daichi
leque****@katch*****
2012年 3月 31日 (土) 08:21:26 JST
大橋といいます。 http://chaton.practical-scheme.net/gauche/a/2012/02/07#entry-4f314ca0-508ff の、正規表現に i フラグをつけると Unicode 文字にうまく マッチしないことがある問題のパッチを書いてみました。 上記の場合でうまくマッチしないのは、 regexp-parse で AST を作る部分は Unicode 文字を case-fold して いるのに、マッチ部分がそれに追いついていなかったからです。 文字集合の case-folding も Unicode 対応にしてみました。 既存のテスト + パッチで追加したテストについて 手許の環境で make check まで通ることを確認しました。 -- -------------- next part -------------- diff --git a/src/char.c b/src/char.c index d1c44f4..1d2b38c 100644 --- a/src/char.c +++ b/src/char.c @@ -480,17 +480,31 @@ ScmObj Scm_CharSetComplement(ScmCharSet *cs) return SCM_OBJ(cs); } -/* Make charset case-insensitive. For now, we only deal with - ASCII range. */ +/* Make charset case-insensitive. */ ScmObj Scm_CharSetCaseFold(ScmCharSet *cs) { + ScmCharSet *copy = SCM_CHAR_SET(Scm_CharSetCopy(cs)); + ScmChar c, uch, lch; + ScmTreeIter iter; + ScmDictEntry *e; int ch; + for (ch='a'; ch<='z'; ch++) { if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) { MASK_SET(cs, ch); MASK_SET(cs, (ch-('a'-'A'))); } } + + Scm_TreeIterInit(&iter, &cs->large, NULL); + while ((e = Scm_TreeIterNext(&iter)) != NULL) { + for (c = e->key; c <= e->value; c++) { + uch = Scm_CharUpcase(c); + lch = Scm_CharDowncase(c); + Scm_CharSetAddRange(cs, uch, uch); + Scm_CharSetAddRange(cs, lch, lch); + } + } return SCM_OBJ(cs); } diff --git a/src/regexp.c b/src/regexp.c index 6730406..3c89819 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -2260,21 +2260,17 @@ struct match_ctx { static int match_ci(const char **input, const unsigned char **code, int length) { - unsigned char inch, c; - int csize, i; + ScmChar inch, c; + int csize; do { - inch = *(*input)++; - c = *(*code)++; - if ((csize = SCM_CHAR_NFOLLOWS(inch)) == 0) { - if (c != SCM_CHAR_DOWNCASE(inch)) return FALSE; - } else { - if (c != inch) return FALSE; - for (i=0; i<csize; i++) { - if ((unsigned char)*(*code)++ != (unsigned char)*(*input)++) - return FALSE; - } - } - length -= (csize+1); + SCM_CHAR_GET(*input, inch); + csize = SCM_CHAR_NBYTES(inch); + *input += csize; + SCM_CHAR_GET(*code, c); + *code += SCM_CHAR_NBYTES(c); + if (Scm_CharDowncase(inch) != c) + return FALSE; + length -= csize; } while (length > 0); return TRUE; } diff --git a/test/utf-8.scm b/test/utf-8.scm index 14d2388..d7566ae 100644 --- a/test/utf-8.scm +++ b/test/utf-8.scm @@ -619,4 +619,87 @@ (rxmatch-after m) ;; memoizes start and length (list (rxmatch-before m) (rxmatch-substring m)))) +;;------------------------------------------------------------------- +(test* "regexp/unicode-ci (aa)" "λ" + (cond ((rxmatch #/λ/i "λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (aA)" "Λ" + (cond ((rxmatch #/λ/i "Λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (Aa)" "λ" + (cond ((rxmatch #/Λ/i "λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (AA)" "Λ" + (cond ((rxmatch #/Λ/i "Λ") => rxmatch-substring) + (else #f))) + +(test* "regexp/unicode-ci (uncase + backref, aa)" "λλ" + (cond ((rxmatch #/(λ)(?i:\1)/ "λλ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (uncase + backref, aA)" "λΛ" + (cond ((rxmatch #/(λ)(?i:\1)/ "λΛ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (uncase + backref, Aa)" "Λλ" + (cond ((rxmatch #/(Λ)(?i:\1)/ "Λλ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (uncase + backref, AA)" "ΛΛ" + (cond ((rxmatch #/(Λ)(?i:\1)/ "ΛΛ") => rxmatch-substring) + (else #f))) + +(test* "regexp/unicode-ci (charset, aa)" "λ" + (cond ((rxmatch #/[λ]/i "λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (charset, aA)" "Λ" + (cond ((rxmatch #/[λ]/i "Λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (charset, Aa)" "λ" + (cond ((rxmatch #/[Λ]/i "λ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci (charset, AA)" "Λ" + (cond ((rxmatch #/[Λ]/i "Λ") => rxmatch-substring) + (else #f))) + +(test* "regexp/unicode-ci" "ΒΓ" + (cond ((rxmatch #/βγ/i "ΑΒΓΔ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" "βΓ" + (cond ((rxmatch #/Βγ/i "ΑβΓΔ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" "βγ" + (cond ((rxmatch #/ΒΓ/i "ΑβγΔ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch #/Βγ/ "ΑβΓΔ") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch #/ΒΓ/ "ΑΒγΔ") => rxmatch-substring) + (else #f))) + +(test* "regexp/unicode-ci" "ОНА" + (cond ((rxmatch #/о[а-я]а/i "ОНА") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" "она" + (cond ((rxmatch #/О[А-Я]А/i "она") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" "они" + (cond ((rxmatch #/[а-пР-Я][А-Пр-я]./i "они") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch #/о[а-я]а/ "ОНА") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch #/О[А-Я]А/ "она") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch #/[а-пР-Я][А-Пр-я]./ "они") => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" "она" + (cond ((rxmatch (string->regexp "о[А-Я]а" :case-fold #t) "она") + => rxmatch-substring) + (else #f))) +(test* "regexp/unicode-ci" #f + (cond ((rxmatch (string->regexp "о[А-Я]а") "она") + => rxmatch-substring) + (else #f))) + (test-end)