減色プログラム
Revisão | 4986e10d08106c1adcd49e5dc918735fa3ca7736 (tree) |
---|---|
Hora | 2011-05-22 07:22:26 |
Autor | berupon <berupon@gmai...> |
Commiter | berupon |
optimized
@@ -38,11 +38,18 @@ struct Color4d | ||
38 | 38 | |
39 | 39 | double dot_product(const Color4d& rhs) { |
40 | 40 | // http://www.icnet.ne.jp/~nsystem/simd_tobira/dpps.html |
41 | +#if 1 | |
42 | + Color4d result = (*this) * rhs; | |
43 | + __m128d v = _mm_add_pd(result.v[0], result.v[1]); | |
44 | + v = _mm_hadd_pd(v, v); | |
45 | + return v.m128d_f64[0]; | |
46 | +#else | |
41 | 47 | double result = 0; |
42 | 48 | for (int i=0; i<3; i++) { |
43 | 49 | result += (*this)[i] * rhs[i]; |
44 | 50 | } |
45 | 51 | return result; |
52 | +#endif | |
46 | 53 | } |
47 | 54 | |
48 | 55 | Color4d& operator += (const Color4d& rhs) { |
@@ -51,16 +58,16 @@ struct Color4d | ||
51 | 58 | return *this; |
52 | 59 | } |
53 | 60 | |
61 | +/* | |
54 | 62 | Color4d& operator += (const Color4f& rhs) { |
55 | 63 | v[0] = _mm_add_pd(v[0], _mm_cvtps_pd(rhs.v)); |
56 | 64 | v[1] = _mm_add_pd(v[1], _mm_cvtps_pd(_mm_movehl_ps(rhs.v, rhs.v))); |
57 | 65 | return *this; |
58 | 66 | } |
67 | +*/ | |
59 | 68 | |
60 | 69 | Color4d operator + (const Color4d& rhs) { |
61 | - Color4d result(*this); | |
62 | - result += rhs; | |
63 | - return result; | |
70 | + return Color4d(*this) += rhs; | |
64 | 71 | } |
65 | 72 | |
66 | 73 | Color4d& operator -= (const Color4d& rhs) { |
@@ -70,9 +77,7 @@ struct Color4d | ||
70 | 77 | } |
71 | 78 | |
72 | 79 | Color4d operator - (const Color4d& rhs) { |
73 | - Color4d result(*this); | |
74 | - result -= rhs; | |
75 | - return result; | |
80 | + return Color4d(*this) -= rhs; | |
76 | 81 | } |
77 | 82 | |
78 | 83 | Color4d& operator *= (const Color4d& rhs) { |
@@ -82,9 +87,7 @@ struct Color4d | ||
82 | 87 | } |
83 | 88 | |
84 | 89 | Color4d operator * (const Color4d& rhs) { |
85 | - Color4d result(*this); | |
86 | - result *= rhs; | |
87 | - return result; | |
90 | + return Color4d(*this) *= rhs; | |
88 | 91 | } |
89 | 92 | |
90 | 93 | Color4d& operator *= (double scalar) { |
@@ -95,11 +98,9 @@ struct Color4d | ||
95 | 98 | } |
96 | 99 | |
97 | 100 | Color4d operator * (double scalar) { |
98 | - Color4d result(*this); | |
99 | - result *= scalar; | |
100 | - return result; | |
101 | + return Color4d(*this) *= scalar; | |
101 | 102 | } |
102 | - | |
103 | + | |
103 | 104 | double& operator[] (int idx) { |
104 | 105 | return ((double*)&v)[idx]; |
105 | 106 | } |
@@ -108,13 +109,22 @@ struct Color4d | ||
108 | 109 | } |
109 | 110 | |
110 | 111 | double norm_squared() { |
112 | +#if 1 | |
113 | + __m128d t = _mm_add_pd( | |
114 | + _mm_mul_pd(v[0], v[0]), | |
115 | + _mm_mul_pd(v[1], v[1]) | |
116 | + ); | |
117 | + t = _mm_hadd_pd(t, t); | |
118 | + return t.m128d_f64[0]; | |
119 | +#else | |
111 | 120 | double result = 0; |
112 | 121 | for (int i=0; i<3; i++) { |
113 | 122 | result += (*this)[i] * (*this)[i]; |
114 | 123 | } |
115 | 124 | return result; |
125 | +#endif | |
116 | 126 | } |
117 | - | |
127 | + | |
118 | 128 | void zero() { |
119 | 129 | v[0] = _mm_setzero_pd(); |
120 | 130 | v[1] = _mm_setzero_pd(); |
@@ -122,7 +132,10 @@ struct Color4d | ||
122 | 132 | }; |
123 | 133 | |
124 | 134 | inline Color4d operator * (double scalar, const Color4d& c) { |
125 | - Color4d tmp = c; | |
126 | - return tmp * scalar; | |
135 | + return Color4d(c) *= scalar; | |
136 | +} | |
137 | + | |
138 | +inline Color4d operator * (const Color4d& c, double scalar) { | |
139 | + return Color4d(c) *= scalar; | |
127 | 140 | } |
128 | 141 |
@@ -1,6 +1,6 @@ | ||
1 | 1 | #pragma once |
2 | 2 | |
3 | -#include <emmintrin.h> | |
3 | +#include <intrin.h> | |
4 | 4 | |
5 | 5 | // http://www2.kobe-u.ac.jp/~lerl2/l_cc_p_10.1.008/doc/main_cls/mergedProjects/intref_cls/ |
6 | 6 |
@@ -293,18 +293,17 @@ void compute_initial_s( | ||
293 | 293 | ) |
294 | 294 | { |
295 | 295 | size_t palette_size = s.width_; |
296 | - int coarse_width = coarse_variables.width_; | |
297 | - int coarse_height = coarse_variables.height_; | |
298 | - int center_x = (b.width_-1)/2, center_y = (b.height_-1)/2; | |
299 | - | |
300 | - Color center_b = b_value(b,0,0,0,0); | |
301 | 296 | Color zero_vector; |
302 | 297 | zero_vector.zero(); |
303 | 298 | for (size_t v=0; v<palette_size; ++v) { |
304 | - for (size_t alpha=v; alpha<palette_size; ++alpha) { | |
305 | - s[alpha][v] = zero_vector; | |
299 | + for (size_t v2=0; v2<v+1; ++v2) { | |
300 | + s[v][v2] = zero_vector; | |
306 | 301 | } |
307 | 302 | } |
303 | + const int coarse_width = coarse_variables.width_; | |
304 | + const int coarse_height = coarse_variables.height_; | |
305 | + const int center_x = (b.width_-1)/2, center_y = (b.height_-1)/2; | |
306 | + const Color center_b = b_value(b,0,0,0,0); | |
308 | 307 | for (int i_y=0; i_y<coarse_height; ++i_y) { |
309 | 308 | for (int i_x=0; i_x<coarse_width; ++i_x) { |
310 | 309 | const double* p_icv = &coarse_variables(i_x, i_y, 0); |
@@ -313,15 +312,20 @@ void compute_initial_s( | ||
313 | 312 | for (size_t j_y=max<int>(0, i_y - center_y); j_y<max_j_y; ++j_y) { |
314 | 313 | for (int j_x=max<int>(0, i_x - center_x); j_x<max_j_x; ++j_x) { |
315 | 314 | if (i_x == j_x && i_y == j_y) continue; |
316 | - Color b_ij = b_value(b,i_x,i_y,j_x,j_y); | |
315 | + const Color b_ij = b_value(b,i_x,i_y,j_x,j_y); | |
316 | + const double* p_jcv = &coarse_variables(j_x, j_y, 0); | |
317 | 317 | for (size_t v=0; v<palette_size; ++v) { |
318 | - Color b_ij2 = b_ij * p_icv[v]; | |
319 | - const double* p_jcv = &coarse_variables(j_x, j_y, v); | |
318 | + const Color b_ij2 = b_ij * p_icv[v]; | |
319 | + const double* p_jcv2 = p_jcv++; | |
320 | + double jcv = *p_jcv2; | |
320 | 321 | Color* ps = s.pBuff_ + v * palette_size + v; |
321 | 322 | // TODO: 変更画像、縦方向ではなく横方向に操作する。後で転置。 |
322 | 323 | for (size_t alpha=v; alpha<palette_size; ++alpha) { |
323 | - *ps += (*p_jcv++) * b_ij2; | |
324 | + ++p_jcv2; | |
325 | + double njcv = *p_jcv2; | |
326 | + *ps += jcv * b_ij2; | |
324 | 327 | ps += palette_size; |
328 | + jcv = njcv; | |
325 | 329 | } |
326 | 330 | } |
327 | 331 | } |
@@ -343,27 +347,34 @@ void update_s( | ||
343 | 347 | const double delta |
344 | 348 | ) |
345 | 349 | { |
346 | - const size_t palette_size = s.width_; | |
347 | - const int coarse_width = coarse_variables.width_; | |
348 | - const int coarse_height = coarse_variables.height_; | |
350 | + const size_t palette_size = s.width_; | |
349 | 351 | const int center_x = (b.width_-1) / 2; |
350 | 352 | const int center_y = (b.height_-1) / 2; |
351 | - const int max_i_x = min(coarse_width, j_x + center_x + 1); | |
352 | - const int max_i_y = min<int>(coarse_height, j_y + center_y + 1); | |
353 | + const size_t max_i_x = min<int>(coarse_variables.width_, j_x + center_x + 1); | |
354 | + const size_t max_i_y = min<int>(coarse_variables.height_, j_y + center_y + 1); | |
353 | 355 | for (size_t i_y=max(0, j_y - center_y); i_y<max_i_y; ++i_y) { |
354 | 356 | for (size_t i_x=max(0, j_x - center_x); i_x<max_i_x; ++i_x) { |
355 | - const Color delta_b_ij = delta * b_value(b,i_x,i_y,j_x,j_y); | |
356 | 357 | if (i_x == j_x && i_y == j_y) continue; |
358 | + const Color delta_b_ij = delta * b_value(b,i_x,i_y,j_x,j_y); | |
357 | 359 | Color* ps = s[alpha]; |
358 | 360 | const double* p_cv = &coarse_variables(i_x, i_y, 0); |
361 | + double cv = *p_cv; | |
359 | 362 | for (size_t v=0; v<=alpha; ++v) { |
360 | - ps[v] += (*p_cv++) * delta_b_ij; | |
363 | + ++p_cv; | |
364 | + double ncv = *p_cv; | |
365 | + *ps += cv * delta_b_ij; | |
366 | + ++ps; | |
367 | + cv = ncv; | |
361 | 368 | } |
362 | 369 | --p_cv; |
363 | - ps += alpha; | |
370 | + --ps; | |
371 | + cv = *p_cv; | |
364 | 372 | for (size_t v=alpha; v<palette_size; ++v) { |
365 | - *ps += (*p_cv++) * delta_b_ij; | |
373 | + ++p_cv; | |
374 | + double ncv = *p_cv; | |
375 | + *ps += cv * delta_b_ij; | |
366 | 376 | ps += palette_size; |
377 | + cv = ncv; | |
367 | 378 | } |
368 | 379 | } |
369 | 380 | } |
@@ -539,6 +550,8 @@ void spatial_color_quant( | ||
539 | 550 | |
540 | 551 | // Compute 2*sum(j in extended neighborhood of i, j != i) b_ij |
541 | 552 | |
553 | + const int radius_width = (b.width_ - 1)/2; | |
554 | + const int radius_height = (b.height_ - 1)/2; | |
542 | 555 | while (!visit_queue.empty()) { |
543 | 556 | // If we get to 10% above initial size, just revisit them all |
544 | 557 | if (visit_queue.size() > coarse_variables.width_*coarse_variables.height_*11.0/10) { |
@@ -552,16 +565,45 @@ void spatial_color_quant( | ||
552 | 565 | // Compute (25) |
553 | 566 | Color p_i; |
554 | 567 | p_i.zero(); |
555 | - for (int y=0; y<b.height_; ++y) { | |
556 | - int j_y = y - center_y + i_y; | |
557 | - if (j_y < 0 || j_y >= coarse_variables.height_) continue; | |
558 | - for (int x=0; x<b.width_; ++x) { | |
559 | - int j_x = x - center_x + i_x; | |
560 | - if (i_x == j_x && i_y == j_y) continue; | |
561 | - if (j_x < 0 || j_x >= coarse_variables.width_) continue; | |
562 | - Color b_ij = b_value(b, i_x, i_y, j_x, j_y); | |
563 | - Color j_pal = (*j_palette_sum)[j_y][j_x]; | |
564 | - p_i += b_ij * j_pal; | |
568 | + int sy = i_y - center_y; | |
569 | + int ey = sy + b.height_; | |
570 | + sy = max(sy, 0); | |
571 | + ey = min<int>(ey, coarse_variables.height_); | |
572 | + int ky = sy - i_y + radius_height; | |
573 | + if (ky < 0) { | |
574 | + sy += -ky; | |
575 | + } | |
576 | + ky = ey - 1 - i_y + radius_height; | |
577 | + if (ky >= b.height_) { | |
578 | + ey -= (ky - b.height_) + 1; | |
579 | + } | |
580 | + int sx = i_x - center_x; | |
581 | + int ex = sx + b.width_; | |
582 | + sx = max(sx, 0); | |
583 | + ex = min<int>(ex, coarse_variables.width_); | |
584 | + int kx = sx - i_x + radius_width; | |
585 | + if (kx < 0) { | |
586 | + sx += -kx; | |
587 | + } | |
588 | + kx = ex - 1 - i_x + radius_width; | |
589 | + if (kx >= b.width_) { | |
590 | + ex -= (kx - b.width_) + 1; | |
591 | + } | |
592 | + for (int y=sy; y<ey; ++y) { | |
593 | + int k_y = y - i_y + radius_height; | |
594 | + Color* cb = b[k_y] + sx - i_x + radius_width; | |
595 | + Color* cp = (*j_palette_sum)[y]+sx; | |
596 | + if (i_y == y) { | |
597 | + for (int x=sx; x<ex; ++x) { | |
598 | + Color c = (*cb++) * (*cp++); | |
599 | + if (i_x != x) { | |
600 | + p_i += c; | |
601 | + } | |
602 | + } | |
603 | + }else { | |
604 | + for (int x=sx; x<ex; ++x) { | |
605 | + p_i += (*cb++) * (*cp++); | |
606 | + } | |
565 | 607 | } |
566 | 608 | } |
567 | 609 | p_i *= 2.0; |
@@ -569,13 +611,13 @@ void spatial_color_quant( | ||
569 | 611 | |
570 | 612 | double max_meanfield_log = -numeric_limits<double>::infinity(); |
571 | 613 | double meanfield_sum = 0.0; |
614 | + double minus_inv_temperature = -1.0 / temperature; | |
572 | 615 | for (size_t v=0; v<num_colors; ++v) { |
573 | 616 | // Update m_{pi(i)v}^I according to (23) |
574 | 617 | // We can subtract an arbitrary factor to prevent overflow, |
575 | 618 | // since only the weight relative to the sum matters, so we |
576 | 619 | // will choose a value that makes the maximum e^100. |
577 | - Color p_i2; p_i2 = p_i; | |
578 | - double m = -(palette[v].dot_product(p_i2 + middle_b.direct_product(palette[v]))) / temperature; | |
620 | + double m = palette[v].dot_product(p_i + middle_b.direct_product(palette[v])) * minus_inv_temperature; | |
579 | 621 | meanfield_logs[v] = m; |
580 | 622 | if (m > max_meanfield_log) { |
581 | 623 | max_meanfield_log = m; |