/* This file is part of the Random Ball Cover (RBC) library.
 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
 */

#ifndef KERNELS_CU
#define KERNELS_CU

#include <cuda.h>
#include "defs.h"
#include "kernels.h"
#include <stdio.h>

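/* Conventions assumed throughout this file (all defined in defs.h; the
 * concrete values below are inferred from the hard-coded offsets in the
 * k-NN kernels rather than restated from defs.h itself):
 *   BLOCK_SIZE  -- thread-block tile width (16, given the +16/+32 offsets);
 *   K           -- number of neighbors returned (32, since K+BLOCK_SIZE==48);
 *   IDX(i,j,ld) -- row-major index into a padded matrix with leading
 *                  dimension ld;
 *   DIST(a,b)   -- per-coordinate distance contribution;
 *   MAX_REAL    -- sentinel "infinite" distance; DUMMY_IDX -- sentinel index.
 */
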
// This kernel does the same thing as nnKernel, except it only considers the
// (query, DB) pairs specified by the compPlan.
__global__ void planNNKernel(const matrix Q, const unint *qMap, const matrix X, const intMatrix xMap, real *dMins, unint *dMinIDs, compPlan cP, unint qStartPos ){
  unint qB = qStartPos + blockIdx.y * BLOCK_SIZE; //indexes Q
  unint xB; //X (DB) block
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i,j,k;
  unint groupIts;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ unint minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  unint g; //query group of q
  unint xG; //DB group currently being examined
  unint numGroups;
  unint groupCount;

  g = cP.qToQGroup[qB];
  numGroups = cP.numGroups[g];

  min[offQ][offX] = MAX_REAL;
  __syncthreads();


  for(i=0; i<numGroups; i++){ //iterate over DB groups
    xG = cP.qGroupToXGroup[IDX( g, i, cP.ld )];
    groupCount = cP.groupCountX[IDX( g, i, cP.ld )];
    groupIts = (groupCount+BLOCK_SIZE-1)/BLOCK_SIZE;

    for(j=0; j<groupIts; j++){ //iterate over elements of group
      xB = j*BLOCK_SIZE;

      real ans = 0;
      for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){ //iterate over cols to compute distances

        Xs[offX][offQ] = X.mat[IDX( xMap.mat[IDX( xG, xB+offQ, xMap.ld )], cB+offX, X.ld )];
        Qs[offX][offQ] = ( (qMap[qB+offQ]==DUMMY_IDX) ? 0 : Q.mat[IDX( qMap[qB+offQ], cB+offX, Q.ld )] );
        __syncthreads();

        for(k=0; k<BLOCK_SIZE; k++)
          ans += DIST( Xs[k][offX], Qs[k][offQ] );

        __syncthreads();
      }

      //Compare to the previous min and store into shared mem if needed.
      if(xB+offX<groupCount && ans<min[offQ][offX]){
        min[offQ][offX] = ans;
        minPos[offQ][offX] = xMap.mat[IDX( xG, xB+offX, xMap.ld )];
      }
      __syncthreads();
    }
  }

  //Reduce across threads
  for(i=BLOCK_SIZE/2; i>0; i/=2){
    if( offX<i ){
      if( min[offQ][offX+i] < min[offQ][offX] ){
        min[offQ][offX] = min[offQ][offX+i];
        minPos[offQ][offX] = minPos[offQ][offX+i];
      }
    }
    __syncthreads();
  }

  if(offX==0 && qMap[qB+offQ]!=DUMMY_IDX){
    dMins[qMap[qB+offQ]] = min[offQ][0];
    dMinIDs[qMap[qB+offQ]] = minPos[offQ][0];
  }
}

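/* A minimal host-side launch sketch for planNNKernel (hypothetical names,
 * not code from this file: numPadded is the query count rounded up to a
 * multiple of BLOCK_SIZE, and the d-prefixed arguments are device-resident):
 *
 *   dim3 block( BLOCK_SIZE, BLOCK_SIZE );
 *   dim3 grid( 1, numPadded/BLOCK_SIZE );
 *   planNNKernel<<<grid, block>>>( dQ, dqMap, dX, dxMap, ddMins, ddMinIDs, dcP, 0 );
 *
 * Each block along blockIdx.y handles BLOCK_SIZE consecutive queries; qMap
 * entries equal to DUMMY_IDX mark padding rows whose results are dropped.
 */
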
//K-NN version of planNNKernel: maintains the K nearest neighbors of each
//query, restricted to the DB groups specified by the compPlan.
__global__ void planKNNKernel(const matrix Q, const unint *qMap, const matrix X, const intMatrix xMap, matrix dMins, intMatrix dMinIDs, compPlan cP, unint qStartPos ){
  unint qB = qStartPos + blockIdx.y * BLOCK_SIZE; //indexes Q
  unint xB; //X (DB) block
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i,j,k;
  unint groupIts;

  //Each row holds the K current nearest neighbors followed by a
  //BLOCK_SIZE-wide staging area for new candidates.
  __shared__ real dNN[BLOCK_SIZE][K+BLOCK_SIZE];
  __shared__ unint idNN[BLOCK_SIZE][K+BLOCK_SIZE];

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  unint g; //query group of q
  unint xG; //DB group currently being examined
  unint numGroups;
  unint groupCount;

  g = cP.qToQGroup[qB];
  numGroups = cP.numGroups[g];

  //Initialize all K slots; the +16 offset assumes BLOCK_SIZE==16 and K==32.
  dNN[offQ][offX] = MAX_REAL;
  dNN[offQ][offX+16] = MAX_REAL;
  idNN[offQ][offX] = DUMMY_IDX;
  idNN[offQ][offX+16] = DUMMY_IDX;
  __syncthreads();

  for(i=0; i<numGroups; i++){ //iterate over DB groups
    xG = cP.qGroupToXGroup[IDX( g, i, cP.ld )];
    groupCount = cP.groupCountX[IDX( g, i, cP.ld )];
    groupIts = (groupCount+BLOCK_SIZE-1)/BLOCK_SIZE;

    for(j=0; j<groupIts; j++){ //iterate over elements of group
      xB = j*BLOCK_SIZE;

      real ans = 0;
      for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){ //iterate over cols to compute distances

        Xs[offX][offQ] = X.mat[IDX( xMap.mat[IDX( xG, xB+offQ, xMap.ld )], cB+offX, X.ld )];
        Qs[offX][offQ] = ( (qMap[qB+offQ]==DUMMY_IDX) ? 0 : Q.mat[IDX( qMap[qB+offQ], cB+offX, Q.ld )] );
        __syncthreads();

        for(k=0; k<BLOCK_SIZE; k++)
          ans += DIST( Xs[k][offX], Qs[k][offQ] );

        __syncthreads();
      }

      //Stage the new candidates in the tail (offset K==32), sort them, then
      //merge them into the current K nearest neighbors.
      dNN[offQ][offX+32] = (xB+offX<groupCount)? ans:MAX_REAL;
      idNN[offQ][offX+32] = (xB+offX<groupCount)? xMap.mat[IDX( xG, xB+offX, xMap.ld )]: DUMMY_IDX;
      __syncthreads();

      sort16off( dNN, idNN );
      __syncthreads();

      merge32x16( dNN, idNN );
    }
  }
  __syncthreads();

  if(qMap[qB+offQ]!=DUMMY_IDX){
    dMins.mat[IDX(qMap[qB+offQ], offX, dMins.ld)] = dNN[offQ][offX];
    dMins.mat[IDX(qMap[qB+offQ], offX+16, dMins.ld)] = dNN[offQ][offX+16];
    dMinIDs.mat[IDX(qMap[qB+offQ], offX, dMinIDs.ld)] = idNN[offQ][offX];
    dMinIDs.mat[IDX(qMap[qB+offQ], offX+16, dMinIDs.ld)] = idNN[offQ][offX+16];
  }
}


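/* Note on the k-NN buffering scheme (used by planKNNKernel above and
 * knnKernel below): each query row keeps a shared buffer of width
 * K+BLOCK_SIZE. Slots [0,K) hold the best K (distance, index) pairs seen
 * so far; slots [K,K+BLOCK_SIZE) stage the current block's BLOCK_SIZE
 * candidates. sort16off sorts the staged tail, and merge32x16 then merges
 * tail and head so that the best K pairs again occupy the front slots.
 */
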
//Brute-force 1-NN: finds the nearest point in X for each query in Q.
__global__ void nnKernel(const matrix Q, unint numDone, const matrix X, real *dMins, unint *dMinIDs){

  unint qB = blockIdx.y * BLOCK_SIZE + numDone; //indexes Q
  unint xB; //indexes X
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i;
  real ans;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ unint minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  min[offQ][offX] = MAX_REAL;
  __syncthreads();

  for(xB=0; xB<X.pr; xB+=BLOCK_SIZE){
    ans = 0;
    for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){

      //Each thread loads one element of X and one of Q into shared memory.
      Xs[offX][offQ] = X.mat[IDX( xB+offQ, cB+offX, X.ld )];
      Qs[offX][offQ] = Q.mat[IDX( qB+offQ, cB+offX, Q.ld )];

      __syncthreads();

      for(i=0;i<BLOCK_SIZE;i++)
        ans += DIST( Xs[i][offX], Qs[i][offQ] );

      __syncthreads();
    }

    if( xB+offX<X.r && ans<min[offQ][offX] ){
      minPos[offQ][offX] = xB+offX;
      min[offQ][offX] = ans;
    }
  }
  __syncthreads();


  //Reduce across threads
  for(i=BLOCK_SIZE/2; i>0; i/=2){
    if(offX<i){
      if(min[offQ][offX+i]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+i];
        minPos[offQ][offX] = minPos[offQ][offX+i];
      }
    }
    __syncthreads();
  }

  if(offX==0){
    dMins[qB+offQ] = min[offQ][0];
    dMinIDs[qB+offQ] = minPos[offQ][0];
  }
}

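/* A hedged launch sketch for nnKernel (hypothetical names; numDone lets a
 * large query set be processed in chunks over several launches):
 *
 *   dim3 block( BLOCK_SIZE, BLOCK_SIZE );
 *   dim3 grid( 1, Q.pr/BLOCK_SIZE ); //one block per BLOCK_SIZE queries
 *   nnKernel<<<grid, block>>>( dQ, 0, dX, ddMins, ddMinIDs );
 */
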
//Brute-force k-NN: finds the K nearest points in X for each query in Q.
__global__ void knnKernel(const matrix Q, unint numDone, const matrix X, matrix dMins, intMatrix dMinIDs){

  unint qB = blockIdx.y * BLOCK_SIZE + numDone; //indexes Q
  unint xB; //indexes X
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i;
  real ans;

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  //Each row holds the K current nearest neighbors followed by a
  //BLOCK_SIZE-wide staging area for new candidates.
  __shared__ real dNN[BLOCK_SIZE][K+BLOCK_SIZE];
  __shared__ unint idNN[BLOCK_SIZE][K+BLOCK_SIZE];

  //Initialize all K slots; the +16 offset assumes BLOCK_SIZE==16 and K==32.
  dNN[offQ][offX] = MAX_REAL;
  dNN[offQ][offX+16] = MAX_REAL;
  idNN[offQ][offX] = DUMMY_IDX;
  idNN[offQ][offX+16] = DUMMY_IDX;

  __syncthreads();

  for(xB=0; xB<X.pr; xB+=BLOCK_SIZE){
    ans = 0;
    for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){

      //Each thread loads one element of X and one of Q into shared memory.
      Xs[offX][offQ] = X.mat[IDX( xB+offQ, cB+offX, X.ld )];
      Qs[offX][offQ] = Q.mat[IDX( qB+offQ, cB+offX, Q.ld )];
      __syncthreads();

      for(i=0;i<BLOCK_SIZE;i++)
        ans += DIST( Xs[i][offX], Qs[i][offQ] );

      __syncthreads();
    }

    //Stage the new candidates in the tail (offset K==32), sort, then merge.
    dNN[offQ][offX+32] = (xB+offX<X.r)? ans:MAX_REAL;
    idNN[offQ][offX+32] = xB + offX;
    __syncthreads();

    sort16off( dNN, idNN );
    __syncthreads();

    merge32x16( dNN, idNN );
  }
  __syncthreads();

  dMins.mat[IDX(qB+offQ, offX, dMins.ld)] = dNN[offQ][offX];
  dMins.mat[IDX(qB+offQ, offX+16, dMins.ld)] = dNN[offQ][offX+16];
  dMinIDs.mat[IDX(qB+offQ, offX, dMinIDs.ld)] = idNN[offQ][offX];
  dMinIDs.mat[IDX(qB+offQ, offX+16, dMinIDs.ld)] = idNN[offQ][offX+16];

}

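/* Result layout: row q of dMins holds the K smallest distances found for
 * query q (ascending, assuming the sort/merge network below keeps the head
 * sorted), and row q of dMinIDs holds the matching indices into X. A
 * sketch of copying one query's results back (hypothetical names):
 *
 *   real nn[K];
 *   cudaMemcpy( nn, &ddMins.mat[ IDX( q, 0, ddMins.ld ) ],
 *               K*sizeof(real), cudaMemcpyDeviceToHost );
 */
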
__global__ void dist1Kernel(const matrix Q, unint qStart, const matrix X, unint xStart, matrix D){
  unint c, i, j;

  unint qB = blockIdx.y*BLOCK_SIZE + qStart;
  unint q  = threadIdx.y;
  unint xB = blockIdx.x*BLOCK_SIZE + xStart;
  unint x  = threadIdx.x;

  real ans = 0;

  //This thread is responsible for computing the distance between Q[qB+q] and X[xB+x].

  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];


  for(i=0 ; i<Q.pc/BLOCK_SIZE ; i++){
    c = i*BLOCK_SIZE; //current column block

    Qs[x][q] = Q.mat[ IDX(qB+q, c+x, Q.ld) ];
    Xs[x][q] = X.mat[ IDX(xB+q, c+x, X.ld) ];

    __syncthreads();

    for(j=0 ; j<BLOCK_SIZE ; j++)
      ans += DIST( Qs[j][q], Xs[j][x] );

    __syncthreads();
  }

  D.mat[ IDX( qB+q, xB+x, D.ld ) ] = ans;

}


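/* dist1Kernel uses a 2-D grid: blockIdx.x tiles X and blockIdx.y tiles Q.
 * A hedged launch sketch (hypothetical names):
 *
 *   dim3 block( BLOCK_SIZE, BLOCK_SIZE );
 *   dim3 grid( X.pr/BLOCK_SIZE, Q.pr/BLOCK_SIZE );
 *   dist1Kernel<<<grid, block>>>( dQ, 0, dX, 0, dD );
 *
 * The column loop assumes Q.pc is a multiple of BLOCK_SIZE, which the
 * padded column count presumably guarantees.
 */
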
//For each row of D, finds a range r such that roughly cntWant entries of
//that row are <= r, via bisection between the row's min and max.
__global__ void findRangeKernel(const matrix D, unint numDone, real *ranges, unint cntWant){

  unint row = blockIdx.y*(BLOCK_SIZE/4)+threadIdx.y + numDone;
  unint ro = threadIdx.y;
  unint co = threadIdx.x;
  unint i, c;
  real t;

  const unint LB = (90*cntWant)/100;
  const unint UB = cntWant;

  __shared__ real smin[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ real smax[BLOCK_SIZE/4][4*BLOCK_SIZE];

  real min = MAX_REAL;
  real max = 0;
  for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
    if( c+co < D.c ){
      t = D.mat[ IDX( row, c+co, D.ld ) ];
      min = MIN(t,min);
      max = MAX(t,max);
    }
  }

  smin[ro][co] = min;
  smax[ro][co] = max;
  __syncthreads();

  for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
    if( co < i ){
      smin[ro][co] = MIN( smin[ro][co], smin[ro][co+i] );
      smax[ro][co] = MAX( smax[ro][co], smax[ro][co+i] );
    }
    __syncthreads();
  }

  //Now start range counting.

  unint itcount = 0;
  unint cnt;
  real rg;
  __shared__ unint scnt[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ char cont[BLOCK_SIZE/4];

  if(co==0)
    cont[ro] = 1;

  do{
    itcount++;
    __syncthreads();

    if( cont[ro] ) //if this row has already converged, leave rg as it was
      rg = ( smax[ro][0] + smin[ro][0] ) / ((real)2.0);

    cnt = 0;
    for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
      cnt += (c+co < D.c && row < D.r && D.mat[ IDX( row, c+co, D.ld ) ] <= rg);
    }

    scnt[ro][co] = cnt;
    __syncthreads();

    for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
      if( co < i ){
        scnt[ro][co] += scnt[ro][co+i];
      }
      __syncthreads();
    }

    //Shrink the bracket toward the side containing the desired count.
    if(co==0){
      if( scnt[ro][0] < cntWant )
        smin[ro][0] = rg;
      else
        smax[ro][0] = rg;
    }

    //cont[ro] is set iff this row needs to continue.
    if(co==0)
      cont[ro] = row<D.r && ( scnt[ro][0] < LB || scnt[ro][0] > UB );
    __syncthreads();

    //Determine whether *any* of the rows needs to continue.
    for(i=BLOCK_SIZE/8 ; i>0 ; i/=2){
      if( ro < i && co==0)
        cont[ro] |= cont[ro+i];
      __syncthreads();
    }

  } while(cont[0]);

  if(co==0 && row<D.r )
    ranges[row] = rg;

}

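/* The do-while above is a bisection: [smin[ro][0], smax[ro][0]] always
 * brackets an acceptable radius, and each iteration halves the interval
 * until the count lands in [LB, UB] = [0.9*cntWant, cntWant]. For example
 * (illustrative numbers only), with cntWant=100 and row distances spanning
 * [0,8], the row might test rg=4 (count 350, too high, so smax=4), then
 * rg=2 (count 60, too low, so smin=2), then rg=3 (count 95, accepted).
 * All rows in the block keep iterating until every row has converged
 * (the cont[] OR-reduction).
 */
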
//Marks ir[row][col] iff the distance from query row to DB point col is
//strictly within the query's range.
__global__ void rangeSearchKernel(const matrix D, unint xOff, unint yOff, const real *ranges, charMatrix ir){
  unint col = blockIdx.x*BLOCK_SIZE + threadIdx.x + xOff;
  unint row = blockIdx.y*BLOCK_SIZE + threadIdx.y + yOff;

  ir.mat[IDX( row, col, ir.ld )] = D.mat[IDX( row, col, D.ld )] < ranges[row];

}

//For each query, counts the number of DB points lying within its range.
__global__ void rangeCountKernel(const matrix Q, unint numDone, const matrix X, real *ranges, unint *counts){
  unint q = blockIdx.y*BLOCK_SIZE + numDone;
  unint qo = threadIdx.y;
  unint xo = threadIdx.x;

  real rg = ranges[q+qo];

  unint r,c,i;

  __shared__ unint scnt[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real qs[BLOCK_SIZE][BLOCK_SIZE];

  unint cnt = 0;
  for( r=0; r<X.pr; r+=BLOCK_SIZE ){

    real dist = 0;
    for( c=0; c<X.pc; c+=BLOCK_SIZE){
      xs[xo][qo] = X.mat[IDX( r+qo, c+xo, X.ld )];
      qs[xo][qo] = Q.mat[IDX( q+qo, c+xo, Q.ld )];
      __syncthreads();

      for( i=0; i<BLOCK_SIZE; i++)
        dist += DIST( xs[i][xo], qs[i][qo] );

      __syncthreads();

    }
    cnt += r+xo<X.r && dist<rg;

  }

  scnt[qo][xo] = cnt;
  __syncthreads();

  for( i=BLOCK_SIZE/2; i>0; i/=2 ){
    if( xo<i ){
      scnt[qo][xo] += scnt[qo][xo+i];
    }
    __syncthreads();
  }

  if( xo==0 && q+qo<Q.r )
    counts[q+qo] = scnt[qo][0];
}

//Sorts the BLOCK_SIZE staged candidates in slots [K,K+16) of each row,
//using a fixed 16-input sorting network (x[][48]: 48 == K+BLOCK_SIZE).
__device__ void sort16off(real x[][48], unint xi[][48]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  if(i%2==0)
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  if(i%4<2)
    mmGateI( x[j]+K+i, x[j]+K+i+2, xi[j]+K+i, xi[j]+K+i+2 );
  __syncthreads();

  if(i%4==1)
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  if(i%8<4)
    mmGateI( x[j]+K+i, x[j]+K+i+4, xi[j]+K+i, xi[j]+K+i+4 );
  __syncthreads();

  if(i%8==2 || i%8==3)
    mmGateI( x[j]+K+i, x[j]+K+i+2, xi[j]+K+i, xi[j]+K+i+2 );
  __syncthreads();

  if( i%2 && i%8 != 7 )
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  //Elements 0-7 and 8-15 are now each sorted; merge the two halves.
  if( i<8)
    mmGateI( x[j]+K+i, x[j]+K+i+8, xi[j]+K+i, xi[j]+K+i+8 );
  __syncthreads();

  if( i>3 && i<8 )
    mmGateI( x[j]+K+i, x[j]+K+i+4, xi[j]+K+i, xi[j]+K+i+4 );
  __syncthreads();

  int os = (i/2)*4+2 + i%2;
  if(i<6)
    mmGateI( x[j]+K+os, x[j]+K+os+2, xi[j]+K+os, xi[j]+K+os+2 );
  __syncthreads();

  if( i%2 && i<15)
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
}


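/* sort16 sorts the 16 (distance, index) pairs of each row in place using a
 * fixed 16-input sorting network of pairwise compare-exchanges (mmGateI);
 * the 16 threads of a row each drive one lane of the network. It is the
 * same network as sort16off, minus the K offset into the row.
 */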
__device__ void sort16(real x[][16], unint xi[][16]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  if(i%2==0)
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  if(i%4<2)
    mmGateI( x[j]+i, x[j]+i+2, xi[j]+i, xi[j]+i+2 );
  __syncthreads();

  if(i%4==1)
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  if(i%8<4)
    mmGateI( x[j]+i, x[j]+i+4, xi[j]+i, xi[j]+i+4 );
  __syncthreads();

  if(i%8==2 || i%8==3)
    mmGateI( x[j]+i, x[j]+i+2, xi[j]+i, xi[j]+i+2 );
  __syncthreads();

  if( i%2 && i%8 != 7 )
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  //Elements 0-7 and 8-15 are now each sorted; merge the two halves.
  if( i<8)
    mmGateI( x[j]+i, x[j]+i+8, xi[j]+i, xi[j]+i+8 );
  __syncthreads();

  if( i>3 && i<8 )
    mmGateI( x[j]+i, x[j]+i+4, xi[j]+i, xi[j]+i+4 );
  __syncthreads();

  int os = (i/2)*4+2 + i%2;
  if(i<6)
    mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );
  __syncthreads();

  if( i%2 && i<15)
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );

}


//Merges the sorted 32-element head with the sorted 16-element tail of each
//row; afterwards the smallest 32 pairs occupy slots 0-31 in sorted order.
__device__ void merge32x16(real x[][48], unint xi[][48]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  mmGateI( x[j]+i, x[j]+i+32, xi[j]+i, xi[j]+i+32 );
  __syncthreads();

  mmGateI( x[j]+i+16, x[j]+i+32, xi[j]+i+16, xi[j]+i+32 );
  __syncthreads();

  int os = (i<8)? 24: 0;
  mmGateI( x[j]+os+i, x[j]+os+i+8, xi[j]+os+i, xi[j]+os+i+8 );
  __syncthreads();

  os = (i/4)*8+4 + i%4;
  mmGateI( x[j]+os, x[j]+os+4, xi[j]+os, xi[j]+os+4 );
  if(i<4)
    mmGateI( x[j]+36+i, x[j]+36+i+4, xi[j]+36+i, xi[j]+36+i+4 );
  __syncthreads();

  os = (i/2)*4+2 + i%2;
  mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );

  os = (i/2)*4+34 + i%2;
  if(i<6)
    mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );
  __syncthreads();

  os = 2*i+1;
  mmGateI( x[j]+os, x[j]+os+1, xi[j]+os, xi[j]+os+1 );

  os = 2*i+33;
  if(i<7)
    mmGateI( x[j]+os, x[j]+os+1, xi[j]+os, xi[j]+os+1 );

}


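/* Compare-exchange gate: afterwards *x holds the smaller distance and *y
 * the larger, with the index pair (*xi,*yi) permuted the same way; e.g.
 * (x,y,xi,yi) = (5,3,0,1) becomes (3,5,1,0). MINi/MAXi select between the
 * indices according to the comparison of *x and *y (presumably per defs.h).
 */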
__device__ void mmGateI(real *x, real *y, unint *xi, unint *yi){
  int ti = MINi( *x, *y, *xi, *yi );
  *yi = MAXi( *x, *y, *xi, *yi );
  *xi = ti;
  real t = MIN( *x, *y );
  *y = MAX( *x, *y );
  *x = t;
}

#endif