変換テーブルソース生成用のスクリプトを登録
@@ -0,0 +1,192 @@ | ||
1 | +;; TODO: 整理 | |
2 | +(require :asdf) | |
3 | + | |
4 | +;; | |
5 | +(unless (= (length sb-ext:*posix-argv*) 3) | |
6 | + (format *error-output* "~&Usage: sbcl --script gen-table.lisp DATA_DIR OUTPUT_TABLE_FILE~%") | |
7 | + (sb-ext:quit)) | |
8 | + | |
9 | +(defparameter *data-dir* (pathname (second sb-ext:*posix-argv*))) | |
10 | +(defparameter *table-hh* (pathname (third sb-ext:*posix-argv*))) | |
11 | + | |
12 | +;; | |
13 | +(defun load-local-system (package &optional (package-directory #P"./")) | |
14 | + (let #.`((asdf:*central-registry* (directory package-directory)) | |
15 | + ;; or #+ASDF2 | |
16 | + ,@(when #.#1=(find-symbol "*DEFAULT-SOURCE-REGISTRIES*" :asdf) | |
17 | + `((,#1# nil)))) | |
18 | + (asdf:load-system package))) | |
19 | + | |
20 | +(defmacro each-file-line ((line filepath &rest keys) &body body) | |
21 | + `(with-open-file (#1=#:in ,filepath ,@keys) | |
22 | + (let (,line) | |
23 | + (loop while (setf ,line (read-line #1# nil nil nil)) | |
24 | + DO (locally ,@body))))) | |
25 | + | |
26 | +(defun s (&rest args) | |
27 | + "ARGSを連接した文字列に変換する" | |
28 | + (with-output-to-string (s) | |
29 | + (dolist (a args) | |
30 | + (typecase a | |
31 | + (string (write-string a s)) | |
32 | + (character (write-char a s)) | |
33 | + (otherwise (princ a s)))))) | |
34 | + | |
35 | +(defun flatten (lst &aux acc) | |
36 | + (labels ((self (x) | |
37 | + (if (consp x) | |
38 | + (progn (self (car x)) (self (cdr x))) | |
39 | + (when x | |
40 | + (push x acc))))) | |
41 | + (self lst) | |
42 | + (nreverse acc))) | |
43 | + | |
44 | +(load-local-system :dict #P"lib/dict-0.0.2/") | |
45 | +(load-local-system :dawg #P"lib/cl-dawg-0.2.2-unf/") | |
46 | + | |
47 | +;; | |
48 | +(defun read-attr-def (path &aux acc) | |
49 | + (each-file-line (line path) | |
50 | + (push (list (subseq line 3) (parse-integer line :end 2 :radix 16)) acc)) | |
51 | + (sort (nreverse acc) #'string< :key #'first)) | |
52 | + | |
53 | +(defun read-map-def (path &aux acc) | |
54 | + (each-file-line (line path) | |
55 | + (let ((p (position #\Tab line))) | |
56 | + (push (list (subseq line 0 p) (subseq line (1+ p))) acc))) | |
57 | + (sort (nreverse acc) #'string< :key #'first)) | |
58 | + | |
59 | +(let ((*default-pathname-defaults* (probe-file *data-dir*))) | |
60 | + (defparameter *cac* | |
61 | + (read-map-def "canonical-composition.def")) | |
62 | + | |
63 | + (defparameter *cad* | |
64 | + (read-map-def "canonical-decomposition.def")) | |
65 | + | |
66 | + (defparameter *cod* | |
67 | + (read-map-def "compatibility-decomposition.def")) | |
68 | + | |
69 | + (defparameter *ccc* | |
70 | + (read-attr-def "canonical-combining-class.def")) | |
71 | + | |
72 | + (defparameter *nic* | |
73 | + (read-attr-def "nfc-illegal-char.def")) | |
74 | + | |
75 | + (defparameter *nfic* | |
76 | + (read-attr-def "nfkc-illegal-char.def"))) | |
77 | + | |
78 | +;; | |
79 | +(defun add-prefix (prefix) | |
80 | + (lambda (s) | |
81 | + (s prefix (car s)))) | |
82 | + | |
83 | +(defun cat (strs) | |
84 | + (reduce (lambda (acc s) | |
85 | + (declare (simple-string s acc)) | |
86 | + (let ((p (search s acc))) | |
87 | + (if (null p) | |
88 | + (concatenate 'string s acc) | |
89 | + acc))) | |
90 | + strs | |
91 | + :initial-value "")) | |
92 | + | |
93 | +(defparameter *keys* | |
94 | + (flatten | |
95 | + (list (mapcar (add-prefix "0") *cac*) | |
96 | + (mapcar (add-prefix "1") *cad*) | |
97 | + (mapcar (add-prefix "2") *cod*) | |
98 | + (mapcar (add-prefix "3") *ccc*) | |
99 | + (mapcar (add-prefix "4") *nic*) | |
100 | + (mapcar (add-prefix "5") *nfic*)))) | |
101 | + | |
102 | +(defparameter *strs* | |
103 | + (cat | |
104 | + (sort | |
105 | + (flatten | |
106 | + (list (mapcar #'second *cac*) | |
107 | + (mapcar #'second *cad*) | |
108 | + (mapcar #'second *cod*))) | |
109 | + #'> :key #'length))) | |
110 | + | |
111 | +(defparameter *octets* (sb-ext:string-to-octets *strs*)) | |
112 | + | |
113 | +(with-open-file (out "/tmp/unf.str.dat" :direction :output | |
114 | + :if-exists :supersede | |
115 | + :element-type '(unsigned-byte 8)) | |
116 | + (write-sequence *octets* out) | |
117 | + 'done) | |
118 | + | |
119 | +(defparameter *vals* | |
120 | + (flatten | |
121 | + (list | |
122 | + (loop FOR as IN (list *cac* *cad* *cod*) | |
123 | + COLLECT | |
124 | + (loop FOR (_ v) IN as | |
125 | + FOR bv = (string-to-octets v) | |
126 | + FOR p = (search bv *octets*) | |
127 | + COLLECT (progn | |
128 | + (assert (and (<= (integer-length p) 18) | |
129 | + (<= (integer-length (length bv)) 6))) | |
130 | + (dpb (length bv) (byte 6 18) p)))) | |
131 | + | |
132 | + (loop FOR (_ attr) IN *ccc* COLLECT attr) | |
133 | + | |
134 | + (loop REPEAT (+ (length *nic*) (length *nfic*)) COLLECT 0)))) | |
135 | + | |
136 | +(defparameter *kvs* (mapcar (lambda (x y) | |
137 | + (cons (s x (code-char 0)) y)) | |
138 | + *keys* *vals*)) | |
139 | + | |
140 | +;; | |
141 | +(dawg:build :input *kvs* :output "/tmp/unf.key.idx") | |
142 | + | |
143 | +(defun gen-source (path) | |
144 | + (with-open-file (out path :direction :output :if-exists :supersede) | |
145 | + (format out "#ifndef UNF_TABLE_HH~%") | |
146 | + (format out "#define UNF_TABLE_HH~%") | |
147 | + (format out "namespace UNF {~%") | |
148 | + (format out "namespace TABLE {~%") | |
149 | + | |
150 | + (with-open-file (in "/tmp/unf.key.idx" :element-type '(unsigned-byte 32)) | |
151 | + (let ((base (ldb (byte 24 0) (progn #1=(read-byte in nil nil) #1# #1#)))) | |
152 | + (format out "const unsigned CANONICAL_COM_ROOT = ~d;~%" (+ base (char-code #\0))) | |
153 | + (format out "const unsigned CANONICAL_DECOM_ROOT = ~d;~%" (+ base (char-code #\1))) | |
154 | + (format out "const unsigned COMPATIBILITY_DECOM_ROOT = ~d;~%" (+ base (char-code #\2))) | |
155 | + (format out "const unsigned CANONICAL_CLASS_ROOT = ~d;~%" (+ base (char-code #\3))) | |
156 | + (format out "const unsigned NFC_ILLEGAL_ROOT = ~d;~%" (+ base (char-code #\4))) | |
157 | + (format out "const unsigned NFKC_ILLEGAL_ROOT = ~d;~%" (+ base (char-code #\5))))) | |
158 | + | |
159 | + (with-open-file (in "/tmp/unf.key.idx" :element-type '(unsigned-byte 32)) | |
160 | + (format out "~%const unsigned NODES[]={") | |
161 | + (read-byte in nil nil) | |
162 | + (read-byte in nil nil) | |
163 | + (loop FOR v = (read-byte in nil nil) | |
164 | + WHILE v | |
165 | + FOR i FROM 0 | |
166 | + DO | |
167 | + (when (zerop (mod i 10)) | |
168 | + (terpri out)) | |
169 | + (format out "0x~8,'0x" v) | |
170 | + (when (listen in) | |
171 | + (format out ","))) | |
172 | + (format out "};~%")) | |
173 | + | |
174 | + (with-open-file (in "/tmp/unf.str.dat" :element-type '(signed-byte 8)) | |
175 | + (format out "~%const char STRINGS[]={") | |
176 | + (loop FOR c = (read-byte in nil nil) | |
177 | + WHILE c | |
178 | + FOR i FROM 0 | |
179 | + DO | |
180 | + (when (zerop (mod i 20)) | |
181 | + (terpri out)) | |
182 | + (format out "~4d" c) | |
183 | + (when (listen in) | |
184 | + (format out ","))) | |
185 | + (format out "};~%")) | |
186 | + | |
187 | + (format out "}~%") | |
188 | + (format out "}~%") | |
189 | + (format out "#endif~%"))) | |
190 | + | |
191 | +;; | |
192 | +(gen-source *table-hh*) |
@@ -1,3 +1,9 @@ | ||
1 | 1 | - unfの開発版 |
2 | 2 | - 変換テーブル生成ソース/データを含む |
3 | 3 | - 本番用(?)のREADMEおよびMakefileには'.production'が付いている |
4 | + | |
5 | +[変換テーブル生成方法] | |
6 | +# ※ SBCLが必要 | |
7 | +# ※ gen-table.lispはいろいろ未整理 | |
8 | +$ cd lisp | |
9 | +$ sbcl --script gen-table.lisp ../data/ ../src/unf/table.hh |