// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
/**************************************************************************
**
**  svd3
**
**  Quick singular value decomposition as described by:
**  A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
**  Computing the Singular Value Decomposition of 3x3 matrices
**  with minimal branching and elementary floating point operations,
**  University of Wisconsin - Madison technical report TR1690, May 2011
**
**  Implemented by: Kui Wu
**  kwu@cs.utah.edu
**  Modified by: Wei Dong and Rishabh Singh
**
**  May 2018
**
**************************************************************************/

#pragma once

#include <cmath>

#include "open3d/core/CUDAUtils.h"  // OPEN3D_DEVICE, OPEN3D_FORCE_INLINE
#include "open3d/core/linalg/kernel/Matrix.h"

#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
#include <cuda.h>
#endif

#include "math.h"

// Magic constants are IEEE-754 float bit patterns: gone is 1.0f,
// gsine_pi_over_eight is sin(pi/8), gcosine_pi_over_eight is cos(pi/8).
#define gone 1065353216
#define gsine_pi_over_eight 1053028117

#define gcosine_pi_over_eight 1064076127
#define gtiny_number 1.e-20
#define gfour_gamma_squared 5.8284273147583007813

#ifndef __CUDACC__
using std::abs;
using std::max;
using std::sqrt;
#endif

#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
// Under NVCC, keep CUDA's round-to-nearest intrinsics (these self-referential
// defines are deliberate no-ops: a macro never re-expands inside itself).
#define __fadd_rn(x, y) __fadd_rn(x, y)
#define __fsub_rn(x, y) __fsub_rn(x, y)
#define __frsqrt_rn(x) __frsqrt_rn(x)

#define __dadd_rn(x, y) __dadd_rn(x, y)
#define __dsub_rn(x, y) __dsub_rn(x, y)
#define __drsqrt_rn(x) __drcp_rn(__dsqrt_rn(x))
#else
// On the host, fall back to plain arithmetic and 1 / sqrt.
#define __fadd_rn(x, y) (x + y)
#define __fsub_rn(x, y) (x - y)
#define __frsqrt_rn(x) (1.0 / sqrt(x))

#define __dadd_rn(x, y) (x + y)
#define __dsub_rn(x, y) (x - y)
#define __drsqrt_rn(x) (1.0 / sqrt(x))

#define __add_rn(x, y) (x + y)
#define __sub_rn(x, y) (x - y)
#define __rsqrt_rn(x) (1.0 / sqrt(x))
#endif

namespace open3d {
namespace core {
namespace linalg {
namespace kernel {

template <typename scalar_t>
union un {
    scalar_t f;
    unsigned int ui;
};
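
// The un union exposes the same storage both as a scalar (.f) for arithmetic
// and as raw bits (.ui) for the branchless mask-and-select logic below, where
// the mask 0xffffffff plays the role of "true" and 0 of "false". Note that
// .ui is a 32-bit view, so on the double path the bit tricks operate only on
// the low word of the 64-bit value.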

template <typename scalar_t>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3,
                                              scalar_t *U_3x3,
                                              scalar_t *S_3x1,
                                              scalar_t *V_3x3);
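
// Illustrative usage (a sketch, not part of the Open3D API surface; the local
// array names are made up). svd3x3 factors a row-major 3x3 matrix A so that
// A ~= U * diag(S) * V^T, with U and V returned as rotations:
//
//     float A[9] = {2.f, 0.f, 0.f,
//                   0.f, 1.f, 0.f,
//                   0.f, 0.f, 3.f};
//     float U[9], S[3], V[9];
//     open3d::core::linalg::kernel::svd3x3(A, U, S, V);
//     // S holds the singular values sorted by decreasing magnitude; because
//     // U and V are kept proper rotations, the last value may be negative
//     // when det(A) < 0.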

template <>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const double *A_3x3,
                                              double *U_3x3,
                                              double *S_3x1,
                                              double *V_3x3) {
    double gsmall_number = 1.e-12;

    un<double> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
    un<double> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
    un<double> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
    un<double> Sc, Ss, Sch, Ssh;
    un<double> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
    un<double> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
    un<double> Sqvs, Sqvvx, Sqvvy, Sqvvz;

    Sa11.f = A_3x3[0];
    Sa12.f = A_3x3[1];
    Sa13.f = A_3x3[2];
    Sa21.f = A_3x3[3];
    Sa22.f = A_3x3[4];
    Sa23.f = A_3x3[5];
    Sa31.f = A_3x3[6];
    Sa32.f = A_3x3[7];
    Sa33.f = A_3x3[8];

    //###########################################################
    // Compute normal equations matrix
    //###########################################################
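    // The code below forms the symmetric matrix S = A^T * A (the normal
    // equations matrix); only its lower triangle (s11, s21, s31, s22, s32,
    // s33) is stored, since the Jacobi eigensolver that follows needs only a
    // symmetric matrix.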

    Ss11.f = Sa11.f * Sa11.f;
    Stmp1.f = Sa21.f * Sa21.f;
    Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
    Stmp1.f = Sa31.f * Sa31.f;
    Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);

    Ss21.f = Sa12.f * Sa11.f;
    Stmp1.f = Sa22.f * Sa21.f;
    Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
    Stmp1.f = Sa32.f * Sa31.f;
    Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);

    Ss31.f = Sa13.f * Sa11.f;
    Stmp1.f = Sa23.f * Sa21.f;
    Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
    Stmp1.f = Sa33.f * Sa31.f;
    Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);

    Ss22.f = Sa12.f * Sa12.f;
    Stmp1.f = Sa22.f * Sa22.f;
    Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
    Stmp1.f = Sa32.f * Sa32.f;
    Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);

    Ss32.f = Sa13.f * Sa12.f;
    Stmp1.f = Sa23.f * Sa22.f;
    Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
    Stmp1.f = Sa33.f * Sa32.f;
    Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);

    Ss33.f = Sa13.f * Sa13.f;
    Stmp1.f = Sa23.f * Sa23.f;
    Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
    Stmp1.f = Sa33.f * Sa33.f;
    Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);

    Sqvs.f = 1.f;
    Sqvvx.f = 0.f;
    Sqvvy.f = 0.f;
    Sqvvz.f = 0.f;

    //###########################################################
    // Solve symmetric eigenproblem using Jacobi iteration
    //###########################################################
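    // Four fixed sweeps, each applying Givens conjugations in the cyclic
    // order (p,q) = (1,2), (2,3), (1,3). Following the McAdams et al. report,
    // no trigonometry is used: (ch, sh) is either the normalized half-angle
    // pair or the constant pi/8 rotation, selected by a branchless mask.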
    for (int i = 0; i < 4; i++) {
        Ssh.f = Ss21.f * 0.5f;
        Stmp5.f = __dsub_rn(Ss11.f, Ss22.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __drsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __dadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif
        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Ss33.f = Ss33.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss33.f = Ss33.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss31.f;
        Stmp2.f = Ss.f * Ss32.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss31.f = __dadd_rn(Stmp2.f, Ss31.f);
        Ss32.f = __dsub_rn(Ss32.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss22.f * Stmp2.f;
        Stmp3.f = Ss11.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss11.f = __dadd_rn(Ss11.f, Stmp1.f);
        Ss22.f = __dadd_rn(Ss22.f, Stmp3.f);
        Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __dadd_rn(Ss21.f, Ss21.f);
        Ss21.f = Ss21.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss11.f = __dadd_rn(Ss11.f, Stmp2.f);
        Ss21.f = __dsub_rn(Ss21.f, Stmp5.f);
        Ss22.f = __dsub_rn(Ss22.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################
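        // Right-multiply the accumulated quaternion q = (qvs; qvvx, qvvy,
        // qvvz) by the Givens quaternion (ch; 0, 0, sh) of this (1,2)
        // conjugation; the axis that receives the sh term changes with the
        // (p,q) pair in the later blocks.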

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvz.f = __dadd_rn(Sqvvz.f, Ssh.f);
        Sqvs.f = __dsub_rn(Sqvs.f, Stmp3.f);
        Sqvvx.f = __dadd_rn(Sqvvx.f, Stmp2.f);
        Sqvvy.f = __dsub_rn(Sqvvy.f, Stmp1.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif

        //###########################################################
        // (1->3)
        //###########################################################
        Ssh.f = Ss32.f * 0.5f;
        Stmp5.f = __dsub_rn(Ss22.f, Ss33.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __drsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __dadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Ss11.f = Ss11.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss11.f = Ss11.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss21.f;
        Stmp2.f = Ss.f * Ss31.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss21.f = __dadd_rn(Stmp2.f, Ss21.f);
        Ss31.f = __dsub_rn(Ss31.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss33.f * Stmp2.f;
        Stmp3.f = Ss22.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss22.f = __dadd_rn(Ss22.f, Stmp1.f);
        Ss33.f = __dadd_rn(Ss33.f, Stmp3.f);
        Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __dadd_rn(Ss32.f, Ss32.f);
        Ss32.f = Ss32.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss22.f = __dadd_rn(Ss22.f, Stmp2.f);
        Ss32.f = __dsub_rn(Ss32.f, Stmp5.f);
        Ss33.f = __dsub_rn(Ss33.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvx.f = __dadd_rn(Sqvvx.f, Ssh.f);
        Sqvs.f = __dsub_rn(Sqvs.f, Stmp1.f);
        Sqvvy.f = __dadd_rn(Sqvvy.f, Stmp3.f);
        Sqvvz.f = __dsub_rn(Sqvvz.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif
#if 1
        //###########################################################
        // 1 -> 2
        //###########################################################

        Ssh.f = Ss31.f * 0.5f;
        Stmp5.f = __dsub_rn(Ss33.f, Ss11.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __drsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __dadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
        Ss22.f = Ss22.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss22.f = Ss22.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss32.f;
        Stmp2.f = Ss.f * Ss21.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss32.f = __dadd_rn(Stmp2.f, Ss32.f);
        Ss21.f = __dsub_rn(Ss21.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss11.f * Stmp2.f;
        Stmp3.f = Ss33.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss33.f = __dadd_rn(Ss33.f, Stmp1.f);
        Ss11.f = __dadd_rn(Ss11.f, Stmp3.f);
        Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __dadd_rn(Ss31.f, Ss31.f);
        Ss31.f = Ss31.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss33.f = __dadd_rn(Ss33.f, Stmp2.f);
        Ss31.f = __dsub_rn(Ss31.f, Stmp5.f);
        Ss11.f = __dsub_rn(Ss11.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvy.f = __dadd_rn(Sqvvy.f, Ssh.f);
        Sqvs.f = __dsub_rn(Sqvs.f, Stmp2.f);
        Sqvvz.f = __dadd_rn(Sqvvz.f, Stmp1.f);
        Sqvvx.f = __dsub_rn(Sqvvx.f, Stmp3.f);
#endif
    }

    //###########################################################
    // Normalize quaternion for matrix V
    //###########################################################
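    // 1/||q|| comes from __drsqrt_rn refined by one Newton-Raphson step,
    // y' = y * (1.5 - 0.5 * x * y * y), spelled out in the Stmp1..Stmp4
    // operations below.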

    Stmp2.f = Sqvs.f * Sqvs.f;
    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvy.f * Sqvvy.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvz.f * Sqvvz.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);

    Stmp1.f = __drsqrt_rn(Stmp2.f);
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);

    Sqvs.f = Sqvs.f * Stmp1.f;
    Sqvvx.f = Sqvvx.f * Stmp1.f;
    Sqvvy.f = Sqvvy.f * Stmp1.f;
    Sqvvz.f = Sqvvz.f * Stmp1.f;

    //###########################################################
    // Transform quaternion to matrix V
    //###########################################################
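    // Standard quaternion-to-rotation conversion for q = (w; x, y, z): the
    // diagonal entries combine w^2, x^2, y^2, z^2, and the off-diagonal
    // entries are of the form 2(xy +- wz), assembled below with the rounded
    // add/sub macros.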

    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = Sqvvy.f * Sqvvy.f;
    Stmp3.f = Sqvvz.f * Sqvvz.f;
    Sv11.f = Sqvs.f * Sqvs.f;
    Sv22.f = __dsub_rn(Sv11.f, Stmp1.f);
    Sv33.f = __dsub_rn(Sv22.f, Stmp2.f);
    Sv33.f = __dadd_rn(Sv33.f, Stmp3.f);
    Sv22.f = __dadd_rn(Sv22.f, Stmp2.f);
    Sv22.f = __dsub_rn(Sv22.f, Stmp3.f);
    Sv11.f = __dadd_rn(Sv11.f, Stmp1.f);
    Sv11.f = __dsub_rn(Sv11.f, Stmp2.f);
    Sv11.f = __dsub_rn(Sv11.f, Stmp3.f);
    Stmp1.f = __dadd_rn(Sqvvx.f, Sqvvx.f);
    Stmp2.f = __dadd_rn(Sqvvy.f, Sqvvy.f);
    Stmp3.f = __dadd_rn(Sqvvz.f, Sqvvz.f);
    Sv32.f = Sqvs.f * Stmp1.f;
    Sv13.f = Sqvs.f * Stmp2.f;
    Sv21.f = Sqvs.f * Stmp3.f;
    Stmp1.f = Sqvvy.f * Stmp1.f;
    Stmp2.f = Sqvvz.f * Stmp2.f;
    Stmp3.f = Sqvvx.f * Stmp3.f;
    Sv12.f = __dsub_rn(Stmp1.f, Sv21.f);
    Sv23.f = __dsub_rn(Stmp2.f, Sv32.f);
    Sv31.f = __dsub_rn(Stmp3.f, Sv13.f);
    Sv21.f = __dadd_rn(Stmp1.f, Sv21.f);
    Sv32.f = __dadd_rn(Stmp2.f, Sv32.f);
    Sv13.f = __dadd_rn(Stmp3.f, Sv13.f);

    //###########################################################
    // Multiply (from the right) with V
    //###########################################################

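    // A <- A * V. The columns of A*V are mutually orthogonal (they are the
    // columns of U scaled by the singular values), so the QR factorization
    // further down yields an orthogonal U and a diagonal R up to round-off.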
    Stmp2.f = Sa12.f;
    Stmp3.f = Sa13.f;
    Sa12.f = Sv12.f * Sa11.f;
    Sa13.f = Sv13.f * Sa11.f;
    Sa11.f = Sv11.f * Sa11.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);

    Stmp2.f = Sa22.f;
    Stmp3.f = Sa23.f;
    Sa22.f = Sv12.f * Sa21.f;
    Sa23.f = Sv13.f * Sa21.f;
    Sa21.f = Sv11.f * Sa21.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);

    Stmp2.f = Sa32.f;
    Stmp3.f = Sa33.f;
    Sa32.f = Sv12.f * Sa31.f;
    Sa33.f = Sv13.f * Sa31.f;
    Sa31.f = Sv11.f * Sa31.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Permute columns such that the singular values are sorted
    //###########################################################
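    // Branchless sorting of the squared column norms (Stmp1, Stmp2, Stmp3):
    // each conditional swap uses the XOR trick x ^= m, y ^= m with
    // m = (x ^ y) & mask, and every swap is paired with a conditional column
    // negation so that det(V) stays +1.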

    Stmp1.f = Sa11.f * Sa11.f;
    Stmp4.f = Sa21.f * Sa21.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp4.f = Sa31.f * Sa31.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);

    Stmp2.f = Sa12.f * Sa12.f;
    Stmp4.f = Sa22.f * Sa22.f;
    Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
    Stmp4.f = Sa32.f * Sa32.f;
    Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);

    Stmp3.f = Sa13.f * Sa13.f;
    Stmp4.f = Sa23.f * Sa23.f;
    Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
    Stmp4.f = Sa33.f * Sa33.f;
    Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);

    // Swap columns 1-2 if necessary

    Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;

    // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
    // is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);

    Sa12.f = Sa12.f * Stmp4.f;
    Sa22.f = Sa22.f * Stmp4.f;
    Sa32.f = Sa32.f * Stmp4.f;

    Sv12.f = Sv12.f * Stmp4.f;
    Sv22.f = Sv22.f * Stmp4.f;
    Sv32.f = Sv32.f * Stmp4.f;

    // Swap columns 1-3 if necessary

    Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 1-3 have been swapped, negate 1st column of A and V so that V
    // is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);

    Sa11.f = Sa11.f * Stmp4.f;
    Sa21.f = Sa21.f * Stmp4.f;
    Sa31.f = Sa31.f * Stmp4.f;

    Sv11.f = Sv11.f * Stmp4.f;
    Sv21.f = Sv21.f * Stmp4.f;
    Sv31.f = Sv31.f * Stmp4.f;

    // Swap columns 2-3 if necessary

    Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa12.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa22.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa32.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv12.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv22.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv32.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
    // is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);

    Sa13.f = Sa13.f * Stmp4.f;
    Sa23.f = Sa23.f * Stmp4.f;
    Sa33.f = Sa33.f * Stmp4.f;

    Sv13.f = Sv13.f * Stmp4.f;
    Sv23.f = Sv23.f * Stmp4.f;
    Sv33.f = Sv33.f * Stmp4.f;

    //###########################################################
    // Construct QR factorization of A*V (=U*D) using Givens rotations
    //###########################################################
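    // Three Givens rotations annihilate a21, a31, and a32 in turn; their
    // product is accumulated into U, and the diagonal left in A holds the
    // singular values. Each (ch, sh) pair is normalized with an rsqrt plus
    // one Newton step, as in the quaternion normalization above.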

    Su11.f = 1.f;
    Su12.f = 0.f;
    Su13.f = 0.f;
    Su21.f = 0.f;
    Su22.f = 1.f;
    Su23.f = 0.f;
    Su31.f = 0.f;
    Su32.f = 0.f;
    Su33.f = 1.f;

    Ssh.f = Sa21.f * Sa21.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa21.ui;

    Stmp5.f = 0.f;
    Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
    Sch.f = max(Sch.f, Sa11.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __dadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __dsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __dadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa21.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
    Sa21.f = __dsub_rn(Sa21.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa22.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
    Sa22.f = __dsub_rn(Sa22.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa23.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
    Sa23.f = __dsub_rn(Sa23.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su12.f;
    Su11.f = Sc.f * Su11.f;
    Su12.f = Sc.f * Su12.f;
    Su11.f = __dadd_rn(Su11.f, Stmp2.f);
    Su12.f = __dsub_rn(Su12.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su22.f;
    Su21.f = Sc.f * Su21.f;
    Su22.f = Sc.f * Su22.f;
    Su21.f = __dadd_rn(Su21.f, Stmp2.f);
    Su22.f = __dsub_rn(Su22.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su32.f;
    Su31.f = Sc.f * Su31.f;
    Su32.f = Sc.f * Su32.f;
    Su31.f = __dadd_rn(Su31.f, Stmp2.f);
    Su32.f = __dsub_rn(Su32.f, Stmp1.f);

    // Second Givens rotation

    Ssh.f = Sa31.f * Sa31.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa31.ui;

    Stmp5.f = 0.f;
    Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
    Sch.f = max(Sch.f, Sa11.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __dadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __dsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __dadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
    Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
    Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
    Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su13.f;
    Su11.f = Sc.f * Su11.f;
    Su13.f = Sc.f * Su13.f;
    Su11.f = __dadd_rn(Su11.f, Stmp2.f);
    Su13.f = __dsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su23.f;
    Su21.f = Sc.f * Su21.f;
    Su23.f = Sc.f * Su23.f;
    Su21.f = __dadd_rn(Su21.f, Stmp2.f);
    Su23.f = __dsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su33.f;
    Su31.f = Sc.f * Su31.f;
    Su33.f = Sc.f * Su33.f;
    Su31.f = __dadd_rn(Su31.f, Stmp2.f);
    Su33.f = __dsub_rn(Su33.f, Stmp1.f);

    // Third Givens Rotation

    Ssh.f = Sa32.f * Sa32.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa32.ui;

    Stmp5.f = 0.f;
    Sch.f = __dsub_rn(Stmp5.f, Sa22.f);
    Sch.f = max(Sch.f, Sa22.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __dadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __drsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __dsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __dadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa21.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa21.f = __dadd_rn(Sa21.f, Stmp2.f);
    Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa22.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa22.f = __dadd_rn(Sa22.f, Stmp2.f);
    Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa23.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa23.f = __dadd_rn(Sa23.f, Stmp2.f);
    Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su12.f;
    Stmp2.f = Ss.f * Su13.f;
    Su12.f = Sc.f * Su12.f;
    Su13.f = Sc.f * Su13.f;
    Su12.f = __dadd_rn(Su12.f, Stmp2.f);
    Su13.f = __dsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su22.f;
    Stmp2.f = Ss.f * Su23.f;
    Su22.f = Sc.f * Su22.f;
    Su23.f = Sc.f * Su23.f;
    Su22.f = __dadd_rn(Su22.f, Stmp2.f);
    Su23.f = __dsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su32.f;
    Stmp2.f = Ss.f * Su33.f;
    Su32.f = Sc.f * Su32.f;
    Su33.f = Sc.f * Su33.f;
    Su32.f = __dadd_rn(Su32.f, Stmp2.f);
    Su33.f = __dsub_rn(Su33.f, Stmp1.f);

    V_3x3[0] = Sv11.f;
    V_3x3[1] = Sv12.f;
    V_3x3[2] = Sv13.f;
    V_3x3[3] = Sv21.f;
    V_3x3[4] = Sv22.f;
    V_3x3[5] = Sv23.f;
    V_3x3[6] = Sv31.f;
    V_3x3[7] = Sv32.f;
    V_3x3[8] = Sv33.f;

    U_3x3[0] = Su11.f;
    U_3x3[1] = Su12.f;
    U_3x3[2] = Su13.f;
    U_3x3[3] = Su21.f;
    U_3x3[4] = Su22.f;
    U_3x3[5] = Su23.f;
    U_3x3[6] = Su31.f;
    U_3x3[7] = Su32.f;
    U_3x3[8] = Su33.f;

    S_3x1[0] = Sa11.f;
    // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
    S_3x1[1] = Sa22.f;
    // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
    S_3x1[2] = Sa33.f;
}
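
// A minimal reconstruction check for the specialization above (an
// illustrative sketch, not part of Open3D; the helper name is hypothetical):
//
//     void check_svd3x3(const double A[9]) {
//         double U[9], S[3], V[9];
//         open3d::core::linalg::kernel::svd3x3(A, U, S, V);
//         // Rebuild U * diag(S) * V^T and compare entrywise with A.
//         for (int i = 0; i < 3; ++i) {
//             for (int j = 0; j < 3; ++j) {
//                 double r = 0.0;
//                 for (int k = 0; k < 3; ++k) {
//                     r += U[i * 3 + k] * S[k] * V[j * 3 + k];
//                 }
//                 assert(std::abs(r - A[i * 3 + j]) < 1e-9);
//             }
//         }
//     }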

template <>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const float *A_3x3,
                                              float *U_3x3,
                                              float *S_3x1,
                                              float *V_3x3) {
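    // This float specialization mirrors the double path above line for line,
    // swapping the __d*_rn macros for their __f*_rn counterparts and
    // un<double> for un<float>; see the comments there for the algorithm.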
    float gsmall_number = 1.e-12;

    un<float> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
    un<float> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
    un<float> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
    un<float> Sc, Ss, Sch, Ssh;
    un<float> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
    un<float> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
    un<float> Sqvs, Sqvvx, Sqvvy, Sqvvz;

    Sa11.f = A_3x3[0];
    Sa12.f = A_3x3[1];
    Sa13.f = A_3x3[2];
    Sa21.f = A_3x3[3];
    Sa22.f = A_3x3[4];
    Sa23.f = A_3x3[5];
    Sa31.f = A_3x3[6];
    Sa32.f = A_3x3[7];
    Sa33.f = A_3x3[8];

    //###########################################################
    // Compute normal equations matrix
    //###########################################################

    Ss11.f = Sa11.f * Sa11.f;
    Stmp1.f = Sa21.f * Sa21.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
    Stmp1.f = Sa31.f * Sa31.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);

    Ss21.f = Sa12.f * Sa11.f;
    Stmp1.f = Sa22.f * Sa21.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
    Stmp1.f = Sa32.f * Sa31.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);

    Ss31.f = Sa13.f * Sa11.f;
    Stmp1.f = Sa23.f * Sa21.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
    Stmp1.f = Sa33.f * Sa31.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);

    Ss22.f = Sa12.f * Sa12.f;
    Stmp1.f = Sa22.f * Sa22.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
    Stmp1.f = Sa32.f * Sa32.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);

    Ss32.f = Sa13.f * Sa12.f;
    Stmp1.f = Sa23.f * Sa22.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
    Stmp1.f = Sa33.f * Sa32.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);

    Ss33.f = Sa13.f * Sa13.f;
    Stmp1.f = Sa23.f * Sa23.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
    Stmp1.f = Sa33.f * Sa33.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);

    Sqvs.f = 1.f;
    Sqvvx.f = 0.f;
    Sqvvy.f = 0.f;
    Sqvvz.f = 0.f;

    //###########################################################
    // Solve symmetric eigenproblem using Jacobi iteration
    //###########################################################
    for (int i = 0; i < 4; i++) {
        Ssh.f = Ss21.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif
        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss33.f = Ss33.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss33.f = Ss33.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss31.f;
        Stmp2.f = Ss.f * Ss32.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss22.f * Stmp2.f;
        Stmp3.f = Ss11.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
        Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
        Ss21.f = Ss21.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
        Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
        Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
        Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif

        //###########################################################
        // (1->3)
        //###########################################################
        Ssh.f = Ss32.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss11.f = Ss11.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss11.f = Ss11.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss21.f;
        Stmp2.f = Ss.f * Ss31.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss33.f * Stmp2.f;
        Stmp3.f = Ss22.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
        Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
        Ss32.f = Ss32.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
        Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
        Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
        Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif
#if 1
        //###########################################################
        // 1 -> 2
        //###########################################################

        Ssh.f = Ss31.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss22.f = Ss22.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss22.f = Ss22.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss32.f;
        Stmp2.f = Ss.f * Ss21.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss11.f * Stmp2.f;
        Stmp3.f = Ss33.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
        Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
        Ss31.f = Ss31.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
        Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
        Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
        Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
#endif
    }

    //###########################################################
    // Normalize quaternion for matrix V
    //###########################################################

    Stmp2.f = Sqvs.f * Sqvs.f;
    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvy.f * Sqvvy.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvz.f * Sqvvz.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);

    Stmp1.f = __frsqrt_rn(Stmp2.f);
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sqvs.f = Sqvs.f * Stmp1.f;
    Sqvvx.f = Sqvvx.f * Stmp1.f;
    Sqvvy.f = Sqvvy.f * Stmp1.f;
    Sqvvz.f = Sqvvz.f * Stmp1.f;

    //###########################################################
    // Transform quaternion to matrix V
    //###########################################################

    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = Sqvvy.f * Sqvvy.f;
    Stmp3.f = Sqvvz.f * Sqvvz.f;
    Sv11.f = Sqvs.f * Sqvs.f;
    Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
    Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
    Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
    Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
    Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
    Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
    Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
    Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
    Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
    Sv32.f = Sqvs.f * Stmp1.f;
    Sv13.f = Sqvs.f * Stmp2.f;
    Sv21.f = Sqvs.f * Stmp3.f;
    Stmp1.f = Sqvvy.f * Stmp1.f;
    Stmp2.f = Sqvvz.f * Stmp2.f;
    Stmp3.f = Sqvvx.f * Stmp3.f;
    Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
    Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
    Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
    Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
    Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
    Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);

    //###########################################################
    // Multiply (from the right) with V
    //###########################################################

1578 Stmp2.f = Sa12.f;
1579 Stmp3.f = Sa13.f;
1580 Sa12.f = Sv12.f * Sa11.f;
1581 Sa13.f = Sv13.f * Sa11.f;
1582 Sa11.f = Sv11.f * Sa11.f;
1583 Stmp1.f = Sv21.f * Stmp2.f;
1584 Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1585 Stmp1.f = Sv31.f * Stmp3.f;
1586 Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1587 Stmp1.f = Sv22.f * Stmp2.f;
1588 Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1589 Stmp1.f = Sv32.f * Stmp3.f;
1590 Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1591 Stmp1.f = Sv23.f * Stmp2.f;
1592 Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1593 Stmp1.f = Sv33.f * Stmp3.f;
1594 Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1595
1596 Stmp2.f = Sa22.f;
1597 Stmp3.f = Sa23.f;
1598 Sa22.f = Sv12.f * Sa21.f;
1599 Sa23.f = Sv13.f * Sa21.f;
1600 Sa21.f = Sv11.f * Sa21.f;
1601 Stmp1.f = Sv21.f * Stmp2.f;
1602 Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1603 Stmp1.f = Sv31.f * Stmp3.f;
1604 Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1605 Stmp1.f = Sv22.f * Stmp2.f;
1606 Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1607 Stmp1.f = Sv32.f * Stmp3.f;
1608 Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1609 Stmp1.f = Sv23.f * Stmp2.f;
1610 Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1611 Stmp1.f = Sv33.f * Stmp3.f;
1612 Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1613
1614 Stmp2.f = Sa32.f;
1615 Stmp3.f = Sa33.f;
1616 Sa32.f = Sv12.f * Sa31.f;
1617 Sa33.f = Sv13.f * Sa31.f;
1618 Sa31.f = Sv11.f * Sa31.f;
1619 Stmp1.f = Sv21.f * Stmp2.f;
1620 Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1621 Stmp1.f = Sv31.f * Stmp3.f;
1622 Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1623 Stmp1.f = Sv22.f * Stmp2.f;
1624 Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1625 Stmp1.f = Sv32.f * Stmp3.f;
1626 Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1627 Stmp1.f = Sv23.f * Stmp2.f;
1628 Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1629 Stmp1.f = Sv33.f * Stmp3.f;
1630 Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1631
1632 //###########################################################
1633 // Permute columns such that the singular values are sorted
1634 //###########################################################
1635
1636 Stmp1.f = Sa11.f * Sa11.f;
1637 Stmp4.f = Sa21.f * Sa21.f;
1638 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1639 Stmp4.f = Sa31.f * Sa31.f;
1640 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1641
1642 Stmp2.f = Sa12.f * Sa12.f;
1643 Stmp4.f = Sa22.f * Sa22.f;
1644 Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1645 Stmp4.f = Sa32.f * Sa32.f;
1646 Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1647
1648 Stmp3.f = Sa13.f * Sa13.f;
1649 Stmp4.f = Sa23.f * Sa23.f;
1650 Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1651 Stmp4.f = Sa33.f * Sa33.f;
1652 Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1653
1654 // Swap columns 1-2 if necessary
1655
1656 Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
1657 Stmp5.ui = Sa11.ui ^ Sa12.ui;
1658 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1659 Sa11.ui = Sa11.ui ^ Stmp5.ui;
1660 Sa12.ui = Sa12.ui ^ Stmp5.ui;
1661
1662 Stmp5.ui = Sa21.ui ^ Sa22.ui;
1663 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1664 Sa21.ui = Sa21.ui ^ Stmp5.ui;
1665 Sa22.ui = Sa22.ui ^ Stmp5.ui;
1666
1667 Stmp5.ui = Sa31.ui ^ Sa32.ui;
1668 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1669 Sa31.ui = Sa31.ui ^ Stmp5.ui;
1670 Sa32.ui = Sa32.ui ^ Stmp5.ui;
1671
1672 Stmp5.ui = Sv11.ui ^ Sv12.ui;
1673 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1674 Sv11.ui = Sv11.ui ^ Stmp5.ui;
1675 Sv12.ui = Sv12.ui ^ Stmp5.ui;
1676
1677 Stmp5.ui = Sv21.ui ^ Sv22.ui;
1678 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1679 Sv21.ui = Sv21.ui ^ Stmp5.ui;
1680 Sv22.ui = Sv22.ui ^ Stmp5.ui;
1681
1682 Stmp5.ui = Sv31.ui ^ Sv32.ui;
1683 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1684 Sv31.ui = Sv31.ui ^ Stmp5.ui;
1685 Sv32.ui = Sv32.ui ^ Stmp5.ui;
1686
1687 Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
1688 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1689 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1690 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1691
1692 // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
1693 // is still a rotation
1694
1695 Stmp5.f = -2.f;
1696 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1697 Stmp4.f = 1.f;
1698 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1699
1700 Sa12.f = Sa12.f * Stmp4.f;
1701 Sa22.f = Sa22.f * Stmp4.f;
1702 Sa32.f = Sa32.f * Stmp4.f;
1703
1704 Sv12.f = Sv12.f * Stmp4.f;
1705 Sv22.f = Sv22.f * Stmp4.f;
1706 Sv32.f = Sv32.f * Stmp4.f;
1707
1708 // Swap columns 1-3 if necessary
1709
1710 Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
1711 Stmp5.ui = Sa11.ui ^ Sa13.ui;
1712 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1713 Sa11.ui = Sa11.ui ^ Stmp5.ui;
1714 Sa13.ui = Sa13.ui ^ Stmp5.ui;
1715
1716 Stmp5.ui = Sa21.ui ^ Sa23.ui;
1717 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1718 Sa21.ui = Sa21.ui ^ Stmp5.ui;
1719 Sa23.ui = Sa23.ui ^ Stmp5.ui;
1720
1721 Stmp5.ui = Sa31.ui ^ Sa33.ui;
1722 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1723 Sa31.ui = Sa31.ui ^ Stmp5.ui;
1724 Sa33.ui = Sa33.ui ^ Stmp5.ui;
1725
1726 Stmp5.ui = Sv11.ui ^ Sv13.ui;
1727 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1728 Sv11.ui = Sv11.ui ^ Stmp5.ui;
1729 Sv13.ui = Sv13.ui ^ Stmp5.ui;
1730
1731 Stmp5.ui = Sv21.ui ^ Sv23.ui;
1732 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1733 Sv21.ui = Sv21.ui ^ Stmp5.ui;
1734 Sv23.ui = Sv23.ui ^ Stmp5.ui;
1735
1736 Stmp5.ui = Sv31.ui ^ Sv33.ui;
1737 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1738 Sv31.ui = Sv31.ui ^ Stmp5.ui;
1739 Sv33.ui = Sv33.ui ^ Stmp5.ui;
1740
1741 Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
1742 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1743 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1744 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1745
1746 // If columns 1-3 have been swapped, negate 1st column of A and V so that V
1747 // is still a rotation
1748
1749 Stmp5.f = -2.f;
1750 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1751 Stmp4.f = 1.f;
1752 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1753
1754 Sa11.f = Sa11.f * Stmp4.f;
1755 Sa21.f = Sa21.f * Stmp4.f;
1756 Sa31.f = Sa31.f * Stmp4.f;
1757
1758 Sv11.f = Sv11.f * Stmp4.f;
1759 Sv21.f = Sv21.f * Stmp4.f;
1760 Sv31.f = Sv31.f * Stmp4.f;
1761
1762 // Swap columns 2-3 if necessary
1763
1764 Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
1765 Stmp5.ui = Sa12.ui ^ Sa13.ui;
1766 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1767 Sa12.ui = Sa12.ui ^ Stmp5.ui;
1768 Sa13.ui = Sa13.ui ^ Stmp5.ui;
1769
1770 Stmp5.ui = Sa22.ui ^ Sa23.ui;
1771 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1772 Sa22.ui = Sa22.ui ^ Stmp5.ui;
1773 Sa23.ui = Sa23.ui ^ Stmp5.ui;
1774
1775 Stmp5.ui = Sa32.ui ^ Sa33.ui;
1776 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1777 Sa32.ui = Sa32.ui ^ Stmp5.ui;
1778 Sa33.ui = Sa33.ui ^ Stmp5.ui;
1779
1780 Stmp5.ui = Sv12.ui ^ Sv13.ui;
1781 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1782 Sv12.ui = Sv12.ui ^ Stmp5.ui;
1783 Sv13.ui = Sv13.ui ^ Stmp5.ui;
1784
1785 Stmp5.ui = Sv22.ui ^ Sv23.ui;
1786 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1787 Sv22.ui = Sv22.ui ^ Stmp5.ui;
1788 Sv23.ui = Sv23.ui ^ Stmp5.ui;
1789
1790 Stmp5.ui = Sv32.ui ^ Sv33.ui;
1791 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1792 Sv32.ui = Sv32.ui ^ Stmp5.ui;
1793 Sv33.ui = Sv33.ui ^ Stmp5.ui;
1794
1795 Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
1796 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1797 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1798 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1799
1800 // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
1801 // is still a rotation
1802
1803 Stmp5.f = -2.f;
1804 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1805 Stmp4.f = 1.f;
1806 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1807
1808 Sa13.f = Sa13.f * Stmp4.f;
1809 Sa23.f = Sa23.f * Stmp4.f;
1810 Sa33.f = Sa33.f * Stmp4.f;
1811
1812 Sv13.f = Sv13.f * Stmp4.f;
1813 Sv23.f = Sv23.f * Stmp4.f;
1814 Sv33.f = Sv33.f * Stmp4.f;
1815
1816 //###########################################################
1817 // Construct QR factorization of A*V (=U*D) using Givens rotations
1818 //###########################################################
1819
1820 Su11.f = 1.f;
1821 Su12.f = 0.f;
1822 Su13.f = 0.f;
1823 Su21.f = 0.f;
1824 Su22.f = 1.f;
1825 Su23.f = 0.f;
1826 Su31.f = 0.f;
1827 Su32.f = 0.f;
1828 Su33.f = 1.f;
1829
1830 Ssh.f = Sa21.f * Sa21.f;
1831 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1832 Ssh.ui = Ssh.ui & Sa21.ui;
1833
1834 Stmp5.f = 0.f;
1835 Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1836 Sch.f = max(Sch.f, Sa11.f);
1837 Sch.f = max(Sch.f, gsmall_number);
1838 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1839
1840 Stmp1.f = Sch.f * Sch.f;
1841 Stmp2.f = Ssh.f * Ssh.f;
1842 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1843 Stmp1.f = __frsqrt_rn(Stmp2.f);
1844
1845 Stmp4.f = Stmp1.f * 0.5f;
1846 Stmp3.f = Stmp1.f * Stmp4.f;
1847 Stmp3.f = Stmp1.f * Stmp3.f;
1848 Stmp3.f = Stmp2.f * Stmp3.f;
1849 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1850 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1851 Stmp1.f = Stmp1.f * Stmp2.f;
1852
1853 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1854
1855 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1856 Stmp2.ui = ~Stmp5.ui & Sch.ui;
1857 Sch.ui = Stmp5.ui & Sch.ui;
1858 Ssh.ui = Stmp5.ui & Ssh.ui;
1859 Sch.ui = Sch.ui | Stmp1.ui;
1860 Ssh.ui = Ssh.ui | Stmp2.ui;
1861
1862 Stmp1.f = Sch.f * Sch.f;
1863 Stmp2.f = Ssh.f * Ssh.f;
1864 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1865 Stmp1.f = __frsqrt_rn(Stmp2.f);
1866
1867 Stmp4.f = Stmp1.f * 0.5f;
1868 Stmp3.f = Stmp1.f * Stmp4.f;
1869 Stmp3.f = Stmp1.f * Stmp3.f;
1870 Stmp3.f = Stmp2.f * Stmp3.f;
1871 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1872 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1873
1874 Sch.f = Sch.f * Stmp1.f;
1875 Ssh.f = Ssh.f * Stmp1.f;
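    // Renormalize (ch, sh) to unit length using the same refined
    // reciprocal-square-root scheme.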
1876
1877 Sc.f = Sch.f * Sch.f;
1878 Ss.f = Ssh.f * Ssh.f;
1879 Sc.f = __fsub_rn(Sc.f, Ss.f);
1880 Ss.f = Ssh.f * Sch.f;
1881 Ss.f = __fadd_rn(Ss.f, Ss.f);
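    // Double-angle formulas give the actual rotation coefficients:
    // c = ch^2 - sh^2, s = 2 * sh * ch.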
1882
1883 //###########################################################
1884 // Rotate matrix A
1885 //###########################################################
1886
1887 Stmp1.f = Ss.f * Sa11.f;
1888 Stmp2.f = Ss.f * Sa21.f;
1889 Sa11.f = Sc.f * Sa11.f;
1890 Sa21.f = Sc.f * Sa21.f;
1891 Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1892 Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);
1893
1894 Stmp1.f = Ss.f * Sa12.f;
1895 Stmp2.f = Ss.f * Sa22.f;
1896 Sa12.f = Sc.f * Sa12.f;
1897 Sa22.f = Sc.f * Sa22.f;
1898 Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
1899 Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);
1900
1901 Stmp1.f = Ss.f * Sa13.f;
1902 Stmp2.f = Ss.f * Sa23.f;
1903 Sa13.f = Sc.f * Sa13.f;
1904 Sa23.f = Sc.f * Sa23.f;
1905 Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
1906 Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);
1907
1908 //###########################################################
1909 // Update matrix U
1910 //###########################################################
1911
1912 Stmp1.f = Ss.f * Su11.f;
1913 Stmp2.f = Ss.f * Su12.f;
1914 Su11.f = Sc.f * Su11.f;
1915 Su12.f = Sc.f * Su12.f;
1916 Su11.f = __fadd_rn(Su11.f, Stmp2.f);
1917 Su12.f = __fsub_rn(Su12.f, Stmp1.f);
1918
1919 Stmp1.f = Ss.f * Su21.f;
1920 Stmp2.f = Ss.f * Su22.f;
1921 Su21.f = Sc.f * Su21.f;
1922 Su22.f = Sc.f * Su22.f;
1923 Su21.f = __fadd_rn(Su21.f, Stmp2.f);
1924 Su22.f = __fsub_rn(Su22.f, Stmp1.f);
1925
1926 Stmp1.f = Ss.f * Su31.f;
1927 Stmp2.f = Ss.f * Su32.f;
1928 Su31.f = Sc.f * Su31.f;
1929 Su32.f = Sc.f * Su32.f;
1930 Su31.f = __fadd_rn(Su31.f, Stmp2.f);
1931 Su32.f = __fsub_rn(Su32.f, Stmp1.f);
1932
1933 // Second Givens rotation: annihilate a31 by rotating in the (1,3) plane
1934
1935 Ssh.f = Sa31.f * Sa31.f;
1936 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1937 Ssh.ui = Ssh.ui & Sa31.ui;
1938
1939 Stmp5.f = 0.f;
1940 Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1941 Sch.f = max(Sch.f, Sa11.f);
1942 Sch.f = max(Sch.f, gsmall_number);
1943 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1944
1945 Stmp1.f = Sch.f * Sch.f;
1946 Stmp2.f = Ssh.f * Ssh.f;
1947 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1948 Stmp1.f = __frsqrt_rn(Stmp2.f);
1949
1950 Stmp4.f = Stmp1.f * 0.5f;
1951 Stmp3.f = Stmp1.f * Stmp4.f;
1952 Stmp3.f = Stmp1.f * Stmp3.f;
1953 Stmp3.f = Stmp2.f * Stmp3.f;
1954 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1955 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1956 Stmp1.f = Stmp1.f * Stmp2.f;
1957
1958 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1959
1960 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1961 Stmp2.ui = ~Stmp5.ui & Sch.ui;
1962 Sch.ui = Stmp5.ui & Sch.ui;
1963 Ssh.ui = Stmp5.ui & Ssh.ui;
1964 Sch.ui = Sch.ui | Stmp1.ui;
1965 Ssh.ui = Ssh.ui | Stmp2.ui;
1966
1967 Stmp1.f = Sch.f * Sch.f;
1968 Stmp2.f = Ssh.f * Ssh.f;
1969 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1970 Stmp1.f = __frsqrt_rn(Stmp2.f);
1971
1972 Stmp4.f = Stmp1.f * 0.5f;
1973 Stmp3.f = Stmp1.f * Stmp4.f;
1974 Stmp3.f = Stmp1.f * Stmp3.f;
1975 Stmp3.f = Stmp2.f * Stmp3.f;
1976 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1977 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1978
1979 Sch.f = Sch.f * Stmp1.f;
1980 Ssh.f = Ssh.f * Stmp1.f;
1981
1982 Sc.f = Sch.f * Sch.f;
1983 Ss.f = Ssh.f * Ssh.f;
1984 Sc.f = __fsub_rn(Sc.f, Ss.f);
1985 Ss.f = Ssh.f * Sch.f;
1986 Ss.f = __fadd_rn(Ss.f, Ss.f);
1987
1988 //###########################################################
1989 // Rotate matrix A
1990 //###########################################################
1991
1992 Stmp1.f = Ss.f * Sa11.f;
1993 Stmp2.f = Ss.f * Sa31.f;
1994 Sa11.f = Sc.f * Sa11.f;
1995 Sa31.f = Sc.f * Sa31.f;
1996 Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1997 Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
1998
1999 Stmp1.f = Ss.f * Sa12.f;
2000 Stmp2.f = Ss.f * Sa32.f;
2001 Sa12.f = Sc.f * Sa12.f;
2002 Sa32.f = Sc.f * Sa32.f;
2003 Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
2004 Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2005
2006 Stmp1.f = Ss.f * Sa13.f;
2007 Stmp2.f = Ss.f * Sa33.f;
2008 Sa13.f = Sc.f * Sa13.f;
2009 Sa33.f = Sc.f * Sa33.f;
2010 Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
2011 Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2012
2013 //###########################################################
2014 // Update matrix U
2015 //###########################################################
2016
2017 Stmp1.f = Ss.f * Su11.f;
2018 Stmp2.f = Ss.f * Su13.f;
2019 Su11.f = Sc.f * Su11.f;
2020 Su13.f = Sc.f * Su13.f;
2021 Su11.f = __fadd_rn(Su11.f, Stmp2.f);
2022 Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2023
2024 Stmp1.f = Ss.f * Su21.f;
2025 Stmp2.f = Ss.f * Su23.f;
2026 Su21.f = Sc.f * Su21.f;
2027 Su23.f = Sc.f * Su23.f;
2028 Su21.f = __fadd_rn(Su21.f, Stmp2.f);
2029 Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2030
2031 Stmp1.f = Ss.f * Su31.f;
2032 Stmp2.f = Ss.f * Su33.f;
2033 Su31.f = Sc.f * Su31.f;
2034 Su33.f = Sc.f * Su33.f;
2035 Su31.f = __fadd_rn(Su31.f, Stmp2.f);
2036 Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2037
2038 // Third Givens rotation: annihilate a32 by rotating in the (2,3) plane
2039
2040 Ssh.f = Sa32.f * Sa32.f;
2041 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
2042 Ssh.ui = Ssh.ui & Sa32.ui;
2043
2044 Stmp5.f = 0.f;
2045 Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
2046 Sch.f = max(Sch.f, Sa22.f);
2047 Sch.f = max(Sch.f, gsmall_number);
2048 Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
2049
2050 Stmp1.f = Sch.f * Sch.f;
2051 Stmp2.f = Ssh.f * Ssh.f;
2052 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2053 Stmp1.f = __frsqrt_rn(Stmp2.f);
2054
2055 Stmp4.f = Stmp1.f * 0.5f;
2056 Stmp3.f = Stmp1.f * Stmp4.f;
2057 Stmp3.f = Stmp1.f * Stmp3.f;
2058 Stmp3.f = Stmp2.f * Stmp3.f;
2059 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2060 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2061 Stmp1.f = Stmp1.f * Stmp2.f;
2062
2063 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
2064
2065 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
2066 Stmp2.ui = ~Stmp5.ui & Sch.ui;
2067 Sch.ui = Stmp5.ui & Sch.ui;
2068 Ssh.ui = Stmp5.ui & Ssh.ui;
2069 Sch.ui = Sch.ui | Stmp1.ui;
2070 Ssh.ui = Ssh.ui | Stmp2.ui;
2071
2072 Stmp1.f = Sch.f * Sch.f;
2073 Stmp2.f = Ssh.f * Ssh.f;
2074 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2075 Stmp1.f = __frsqrt_rn(Stmp2.f);
2076
2077 Stmp4.f = Stmp1.f * 0.5f;
2078 Stmp3.f = Stmp1.f * Stmp4.f;
2079 Stmp3.f = Stmp1.f * Stmp3.f;
2080 Stmp3.f = Stmp2.f * Stmp3.f;
2081 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2082 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2083
2084 Sch.f = Sch.f * Stmp1.f;
2085 Ssh.f = Ssh.f * Stmp1.f;
2086
2087 Sc.f = Sch.f * Sch.f;
2088 Ss.f = Ssh.f * Ssh.f;
2089 Sc.f = __fsub_rn(Sc.f, Ss.f);
2090 Ss.f = Ssh.f * Sch.f;
2091 Ss.f = __fadd_rn(Ss.f, Ss.f);
2092
2093 //###########################################################
2094 // Rotate matrix A
2095 //###########################################################
2096
2097 Stmp1.f = Ss.f * Sa21.f;
2098 Stmp2.f = Ss.f * Sa31.f;
2099 Sa21.f = Sc.f * Sa21.f;
2100 Sa31.f = Sc.f * Sa31.f;
2101 Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
2102 Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2103
2104 Stmp1.f = Ss.f * Sa22.f;
2105 Stmp2.f = Ss.f * Sa32.f;
2106 Sa22.f = Sc.f * Sa22.f;
2107 Sa32.f = Sc.f * Sa32.f;
2108 Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
2109 Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2110
2111 Stmp1.f = Ss.f * Sa23.f;
2112 Stmp2.f = Ss.f * Sa33.f;
2113 Sa23.f = Sc.f * Sa23.f;
2114 Sa33.f = Sc.f * Sa33.f;
2115 Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
2116 Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2117
2118 //###########################################################
2119 // Update matrix U
2120 //###########################################################
2121
2122 Stmp1.f = Ss.f * Su12.f;
2123 Stmp2.f = Ss.f * Su13.f;
2124 Su12.f = Sc.f * Su12.f;
2125 Su13.f = Sc.f * Su13.f;
2126 Su12.f = __fadd_rn(Su12.f, Stmp2.f);
2127 Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2128
2129 Stmp1.f = Ss.f * Su22.f;
2130 Stmp2.f = Ss.f * Su23.f;
2131 Su22.f = Sc.f * Su22.f;
2132 Su23.f = Sc.f * Su23.f;
2133 Su22.f = __fadd_rn(Su22.f, Stmp2.f);
2134 Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2135
2136 Stmp1.f = Ss.f * Su32.f;
2137 Stmp2.f = Ss.f * Su33.f;
2138 Su32.f = Sc.f * Su32.f;
2139 Su33.f = Sc.f * Su33.f;
2140 Su32.f = __fadd_rn(Su32.f, Stmp2.f);
2141 Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2142
2143 V_3x3[0] = Sv11.f;
2144 V_3x3[1] = Sv12.f;
2145 V_3x3[2] = Sv13.f;
2146 V_3x3[3] = Sv21.f;
2147 V_3x3[4] = Sv22.f;
2148 V_3x3[5] = Sv23.f;
2149 V_3x3[6] = Sv31.f;
2150 V_3x3[7] = Sv32.f;
2151 V_3x3[8] = Sv33.f;
2152
2153 U_3x3[0] = Su11.f;
2154 U_3x3[1] = Su12.f;
2155 U_3x3[2] = Su13.f;
2156 U_3x3[3] = Su21.f;
2157 U_3x3[4] = Su22.f;
2158 U_3x3[5] = Su23.f;
2159 U_3x3[6] = Su31.f;
2160 U_3x3[7] = Su32.f;
2161 U_3x3[8] = Su33.f;
2162
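    // A has now been reduced to (numerically) upper-triangular form; its
    // diagonal carries the singular values. When det(A) < 0 one of them
    // may be negative, since U and V are both kept proper rotations.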
2163 S_3x1[0] = Sa11.f;
2164 // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
2165 S_3x1[1] = Sa22.f;
2166 // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
2167 S_3x1[2] = Sa33.f;
2168}
2169
2170 template <typename scalar_t>
2171 OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(
2172         const scalar_t *A_3x3,  // input A {3,3}
2173 const scalar_t *B_3x1, // input b {3,1}
2174 scalar_t *X_3x1) // output x {3,1}
2175{
2176 scalar_t U[9];
2177 scalar_t V[9];
2178 scalar_t S[3];
2179 svd3x3(A_3x3, U, S, V);
2180
2181 //###########################################################
2182 // Sigma^+
2183 //###########################################################
2184 const scalar_t epsilon = 1e-10;
2185 S[0] = abs(S[0]) < epsilon ? 0 : 1.0 / S[0];
2186 S[1] = abs(S[1]) < epsilon ? 0 : 1.0 / S[1];
2187 S[2] = abs(S[2]) < epsilon ? 0 : 1.0 / S[2];
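    // Zeroing (rather than inverting) singular values below epsilon makes
    // this a pseudoinverse solve: rank-deficient systems return the
    // minimum-norm least-squares solution instead of dividing by ~0.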
2188
2189 //###########################################################
2190 // (Sigma^+) * UT
2191 //###########################################################
2192 scalar_t S_UT[9];
2193
2194 S_UT[0] = U[0] * S[0];
2195 S_UT[1] = U[3] * S[0];
2196 S_UT[2] = U[6] * S[0];
2197 S_UT[3] = U[1] * S[1];
2198 S_UT[4] = U[4] * S[1];
2199 S_UT[5] = U[7] * S[1];
2200 S_UT[6] = U[2] * S[2];
2201 S_UT[7] = U[5] * S[2];
2202 S_UT[8] = U[8] * S[2];
2203
2204 //###########################################################
2205 // Ainv = V * [(Sigma^+) * UT]
2206 //###########################################################
2207 scalar_t Ainv[9] = {0};
2208 matmul3x3_3x3(V, S_UT, Ainv);
2209
2210 //###########################################################
2211 // x = Ainv * b
2212 //###########################################################
2213
2214 matmul3x3_3x1(Ainv, B_3x1, X_3x1);
2215}
2216
2217} // namespace kernel
2218} // namespace linalg
2219} // namespace core
2220} // namespace open3d