Welcome to Intel® Software Network Quick Login | Join | Help |
Search in Intel® Software Network Forums
in Go

(pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Last post 04-24-2008, 9:29 AM by dario.mx@gmail.com. 9 replies.
Sort Posts: Previous Next
 04-14-2008, 8:18 AM 30252727  

(pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Hallo,

I am trying to improve the performance of a very simple program by using SSE2. The program snippet only needs to calculate the distance between all pairs of a set of N points (I know I could take advantage of symmetry, but for now I am focusing on low-level vectorization).

I am using intrinsics and have examined the generated assembler a bit (though I am far from being an expert), and it looks fine. However, the run time is only 10% less than that of the original version. What am I missing?

Thanks.


Program is the following:
#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include <math.h>

#define SSE2_ALIGNED __attribute__ ((aligned (16)))
#define print_y(x) printf("%d\n",(int) x)
#define print_n(x) x
#define print print_n

int D;
float *X,*Y;

/* Squared Euclidean distance between points i and j of the global X/Y
   arrays, rounded with rint() and handed to the print macro. */
inline static void dist(int i,int j)
{
    const float dx = X[i] - X[j];
    const float dy = Y[i] - Y[j];

    /* Kept as one expression so the sum is formed exactly as before. */
    print(rint(dx*dx + dy*dy));
}

inline static void dist_sse(int i)
{
float d[8] SSE2_ALIGNED;
int j;
__m128 xmm0 = _mm_set1_ps(X[i]);
__m128 xmm1 = xmm0;
__m128 xmm2 = _mm_set1_ps(Y[i]);
__m128 xmm3 = xmm2;
__m128 xmm4,xmm5,xmm6,xmm7;
for(j=0; j<D;j+=8)
{
xmm4 =_mm_load_ps(X+j);
xmm5 =_mm_load_ps(X+j+4);
xmm6 =_mm_load_ps(Y+j);
xmm7 =_mm_load_ps(Y+j+4);
xmm4 = _mm_sub_ps(xmm0,xmm4);
xmm5 = _mm_sub_ps(xmm1,xmm5);
xmm6 = _mm_sub_ps(xmm2,xmm6);
xmm7 = _mm_sub_ps(xmm3,xmm7);
xmm4 = _mm_mul_ps(xmm4,xmm4);
xmm5 = _mm_mul_ps(xmm5,xmm5);
xmm6 = _mm_mul_ps(xmm6,xmm6);
xmm7 = _mm_mul_ps(xmm7,xmm7);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm5 = _mm_add_ps(xmm5,xmm7);
_mm_store_ps(d,xmm4);
_mm_store_ps(d+4,xmm5);
print(rint(d[0]));
print(rint(d[1]));
print(rint(d[2]));
print(rint(d[3]));
print(rint(d[4]));
print(rint(d[5]));
print(rint(d[6]));
print(rint(d[7]));
}
}

/*
 * Driver: parses <opc> and <D>, allocates and fills the coordinate arrays,
 * then evaluates every pair of points with the scalar routine (opc == 0)
 * or with the SSE routine (any other value).
 *
 * Returns 0 on success, 1 on a wrong argument count, 2 when D is rejected
 * and 3 when an allocation fails.
 */
int main(int argc, char * argv[])
{
    int i,j,opc;
    int status = 0;

    if ( argc != 3 )
    {
        fprintf(stderr,"\nUsage: %s <opc=0|1> <D>\n\n",argv[0]);
        return 1;
    }

    opc = atoi(argv[1]);
    D = atoi(argv[2]);

    if ( D %8 != 0 )
    {
        fprintf(stderr,"\nDimension %d must be multiple of 8: \n\n",D);
        return 2;
    }

    /* A negative multiple of 8 passes the test above and would be converted
       to a huge size_t in the byte counts computed below. */
    if ( D < 0 )
    {
        fprintf(stderr,"\nDimension %d must not be negative\n\n",D);
        return 2;
    }

    /* The SSE routine uses aligned loads, so it needs 16-byte aligned
       arrays; the scalar routine works with plain malloc. */
    if ( opc == 0 )
    {
        X = (float *) malloc(D * sizeof(float));
        Y = (float *) malloc(D * sizeof(float));
    }
    else
    {
        X = (float *) _mm_malloc(D * sizeof(float), 16);
        Y = (float *) _mm_malloc(D * sizeof(float), 16);
    }

    /* With D == 0 a null result is legitimate and nothing is dereferenced. */
    if ( D > 0 && ( X == NULL || Y == NULL ) )
    {
        fprintf(stderr,"\nCould not allocate %d points\n\n",D);
        status = 3;
    }
    else
    {
        for(i=0;i<D;i++)
        {
            X[i] = i;
            Y[i] = D - i;
        }

        if ( opc == 0 )
        {
            for(i=0;i<D;i++)
                for(j=0;j<D;j++)
                    dist(i,j);
        }
        else
        {
            for(i=0;i<D;i++)
                dist_sse(i);
        }
    }

    /* Release each buffer with the deallocator matching its allocator. */
    if ( opc == 0 )
    {
        free(X);
        free(Y);
    }
    else
    {
        if ( X != NULL )
            _mm_free(X);
        if ( Y != NULL )
            _mm_free(Y);
    }

    return status;
}

I am compiling with:

CC = gcc
CFLAGS = -O3 -Wall -march=pentium-m -msse2

all: kk

And generated assembler is:

	.file	"kk.c"
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "\12Usage: %s <opc=0|1> <D>\12\12\0"
.align 4
LC1:
.ascii "\12Dimension %d must be multiple of 8: \12\12\0"
.text
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
# main() as compiled by gcc from the C listing above. The only calls are to
# library routines (_atoi, _malloc, _fprintf, _rint, ...): the bodies of
# dist() and dist_sse() appear inline below.
_main:
pushl %ebp
movl $16, %eax
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $108, %esp
movl 12(%ebp), %ebx
andl $-16, %esp
call __alloca
call ___main
# argc (8(%ebp)) must be 3; otherwise print the LC0 usage text and return 1.
cmpl $3, 8(%ebp)
je L2
call ___getreent
movl (%ebx), %esi
movl $LC0, %ecx
movl %ecx, 4(%esp)
movl %esi, 8(%esp)
movl 12(%eax), %edx
movl %edx, (%esp)
call _fprintf
movl $1, %eax
leal -12(%ebp), %esp
# Shared epilogue for the two error returns (%eax already holds the status).
L90:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
# opc = atoi(argv[1]) is kept in %edi; D = atoi(argv[2]) is stored to _D and
# copied to %ecx.
L2:
movl 4(%ebx), %edi
movl %edi, (%esp)
call _atoi
movl %eax, %edi
movl 8(%ebx), %eax
movl %eax, (%esp)
call _atoi
movl %eax, _D
# Any of the low three bits of D set -> not a multiple of 8 -> error at L82.
testb $7, %al
movl %eax, %ecx
jne L82
# opc == 0 -> plain malloc path at L83. Otherwise the aligned allocation is
# expanded inline (L84 for X, L85 for Y) and skipped when the byte count is 0.
testl %edi, %edi
je L83
xorl %edx, %edx
sall $2, %eax
jne L84
L7:
movl %edx, _X
movl %ecx, %eax
xorl %edx, %edx
sall $2, %eax
jne L85
L12:
movl %edx, _Y
movl %edx, %ebx
# Fill loop with i in %edx: X[i] = (float) i, Y[i] = (float) (D - i).
L5:
xorl %edx, %edx
cmpl %ecx, %edx
jge L59
movl _X, %esi
.p2align 4,,15
L19:
movl %ecx, %eax
cvtsi2ss %edx, %xmm1
subl %edx, %eax
cvtsi2ss %eax, %xmm0
movss %xmm1, (%esi,%edx,4)
movss %xmm0, (%ebx,%edx,4)
incl %edx
cmpl %ecx, %edx
jl L19
# opc != 0 -> SSE path at L20; otherwise the scalar double loop follows
# (i in %esi, j in %ebx).
L59:
testl %edi, %edi
jne L20
xorl %esi, %esi
cmpl %ecx, %esi
jge L30
.p2align 4,,15
L92:
xorl %ebx, %ebx
cmpl %ecx, %ebx
jge L63
.p2align 4,,15
# Scalar body on the x87 stack: (X[i]-X[j])^2 + (Y[i]-Y[j])^2 is stored as a
# double argument for rint(); the returned value is popped and discarded.
# _X, _Y and _D are reloaded from memory on every iteration.
L91:
movl _X, %edx
movl _Y, %edi
flds (%edx,%ebx,4)
flds (%edi,%ebx,4)
fxch %st(1)
incl %ebx
fsubrs (%edx,%esi,4)
fxch %st(1)
fsubrs (%edi,%esi,4)
fxch %st(1)
fmul %st(0), %st
fxch %st(1)
fmul %st(0), %st
faddp %st, %st(1)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %ebx
jl L91
L63:
incl %esi
cmpl %ecx, %esi
jl L92
# Common successful exit: return 0.
L30:
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
# opc == 0: X = malloc(D*4); Y = malloc(D*4); then join the fill loop at L5.
L83:
leal 0(,%eax,4), %eax
movl %eax, (%esp)
call _malloc
movl %eax, _X
movl _D, %esi
sall $2, %esi
movl %esi, (%esp)
call _malloc
movl %eax, _Y
movl _D, %ecx
movl %eax, %ebx
jmp L5
# SSE path: outer loop over i in %edi.
L20:
xorl %edi, %edi
cmpl %ecx, %edi
jge L30
.p2align 4,,15
# Broadcast X[i] and Y[i] to all four lanes (shufps $0) and spill both
# vectors to the stack at -88(%ebp) and -104(%ebp); j starts at 0 in %esi.
L75:
movl _X, %edx
movl (%edx,%edi,4), %eax
movl %eax, -60(%ebp)
movl (%ebx,%edi,4), %esi
movss -60(%ebp), %xmm2
movl %esi, -64(%ebp)
xorl %esi, %esi
shufps $0, %xmm2, %xmm2
movss -64(%ebp), %xmm4
cmpl %ecx, %esi
movaps %xmm2, -88(%ebp)
shufps $0, %xmm4, %xmm4
movaps %xmm4, -104(%ebp)
jl L76
jmp L66
.p2align 4,,7
L67:
movl _X, %edx
movl _Y, %ebx
# Inner body: j advances by 8. The broadcast vectors are reloaded from their
# stack slots, eight squared distances are computed with packed sub/mul/add
# and written to the stack buffer at -56(%ebp) and -40(%ebp).
L76:
movaps -88(%ebp), %xmm7
leal 0(,%esi,4), %ecx
leal (%ecx,%edx), %edx
movaps (%edx), %xmm5
addl %ebx, %ecx
addl $8, %esi
movaps (%ecx), %xmm6
movaps -104(%ebp), %xmm4
subps %xmm5, %xmm7
movaps %xmm7, %xmm5
movaps 16(%edx), %xmm3
mulps %xmm5, %xmm5
subps %xmm6, %xmm4
movaps 16(%ecx), %xmm1
movaps %xmm4, %xmm7
mulps %xmm4, %xmm7
movaps -88(%ebp), %xmm2
addps %xmm7, %xmm5
movaps -104(%ebp), %xmm0
movaps %xmm5, -56(%ebp)
flds -56(%ebp)
subps %xmm3, %xmm2
subps %xmm1, %xmm0
movaps %xmm2, %xmm3
movaps %xmm0, %xmm6
mulps %xmm2, %xmm3
mulps %xmm0, %xmm6
addps %xmm6, %xmm3
movaps %xmm3, -40(%ebp)
# Eight scalar rint() calls, one per buffer element: each float goes through
# the x87 stack, is stored as a double argument, and the result is discarded.
fstpl (%esp)
call _rint
fstp %st(0)
flds -52(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -48(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -44(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -40(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -36(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -32(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
flds -28(%ebp)
fstpl (%esp)
call _rint
fstp %st(0)
movl _D, %ecx
cmpl %ecx, %esi
jl L67
# Next i; _Y is reloaded into %ebx before returning to L75.
L66:
incl %edi
cmpl %ecx, %edi
jge L30
movl _Y, %ebx
jmp L75
# Inlined aligned allocation for Y: malloc(bytes + 16), round the result up
# to a 16-byte boundary and keep the raw malloc pointer in the 4 bytes just
# below the aligned address. A null malloc result is passed through as null.
L85:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L78
leal 16(%eax), %ecx
andl $-16, %ecx
movl %ecx, %edx
movl %eax, -4(%ecx)
L78:
movl _D, %ecx
jmp L12
# Same inlined aligned allocation, for X.
L84:
addl $16, %eax
movl %eax, (%esp)
call _malloc
testl %eax, %eax
movl %eax, %edx
je L77
leal 16(%eax), %ebx
andl $-16, %ebx
movl %ebx, %edx
movl %eax, -4(%ebx)
L77:
movl _D, %ecx
jmp L7
# D is not a multiple of 8: print the LC1 message and return 2 through the
# shared epilogue at L90.
L82:
call ___getreent
movl _D, %ecx
movl $LC1, %edx
movl %edx, 4(%esp)
movl %ecx, 8(%esp)
movl 12(%eax), %ebx
movl %ebx, (%esp)
call _fprintf
movl $2, %eax
leal -12(%ebp), %esp
jmp L90
.comm _D, 16 # 4
.comm _X, 16 # 4
.comm _Y, 16 # 4
.def _atoi; .scl 3; .type 32; .endef
.def ___getreent; .scl 3; .type 32; .endef
.def _fprintf; .scl 3; .type 32; .endef
.def _rint; .scl 3; .type 32; .endef
.def _malloc; .scl 3; .type 32; .endef


 
 04-22-2008, 3:02 PM 30253278 in reply to 30252727  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

This topic had already been covered extensively in the following public e-mail thread:
http://gcc.gnu.org/ml/gcc-help/2008-04/msg00073.html
It was never clear why a major speedup was expected with printf() in the inner loop, nor whether a vectorizing compiler such as g++ 4.3 or icpc was tried (with printf removed so as to attempt auto-vectorization).
C99 math functions (e.g. rint) presumably are available in g++, with -std=gnu99.
 
 04-22-2008, 3:22 PM 30253282 in reply to 30253278  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

I will read the post ... but meanwhile I can tell you that the printf is conditionally removed from the code. I used it only for debugging the correctness of the vectorized version; of course, for the real timing, I removed it.

thanks.
 
 04-22-2008, 3:28 PM 30253283 in reply to 30253278  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

LOL

The post you mentioned is just the one I made first, before coming here to the Intel forums!!!

By "extensively covered", do you mean the topic was left unanswered? ;-1 Because that's what happened. I applied all the suggestions I got, and still saw no significant improvement. The modified version is the one I posted here, at Intel.

There you have the assembler generated by GCC ... there we can see the SSE2 instructions. So, what is wrong with this picture?!

I would expect that, if there is one place on earth with people being experts in Intel assembler, that place must be here ... hehe. So, please check the assembler and tell me what is wrong.

Thanks.

 
 04-22-2008, 3:59 PM 30253285 in reply to 30253283  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

People on the gcc-help list did their best to help you define what you wanted to do.  If you wanted only to make a non-vector cross between a macro and inline function of a style understood only by gcc, with parallel SSE inside that function, perhaps what you showed was what you wanted.
Otherwise, you could have posted a shorter example, which could be compiled by standard compilers, illustrating your interest.
The rint() function is not recognized as a vectorizable function by either gcc or icc.  If you meant it as a substitute for sqrtf(), the latter can be vectorized in line by icc, but apparently not yet by gcc.
 
 04-22-2008, 7:55 PM 30253297 in reply to 30253285  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Certainly I appreciate all the advice the gcc-help community provided me. Indeed, I followed all of it.

I was not asking for a way to achieve the vectorization ... I already did it. Nor was I looking for automatic vectorization ... I coded it manually. We can see the assembler there, generated directly from intrinsic functions ... it includes SSE2 of course (thus, this is not a problem of whether the compiler automatically vectorized or not).

What I am asking for, kindly of course hehe, is for advice about why my manually coded vector version is not improving significantly the performance (it offers a gain of 10% in runtime, which is very poor considering I am vectorizing the whole thing ... I would expect a gain of 75%)

Thanks.
 
 04-23-2008, 8:06 AM 30253342 in reply to 30253297  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Hi,

Try changing the allocation scheme to allocating one aligned block for all the data instead of many allocations. This will improve data locality in the caches and might gain performance.

The code here uses rint(). This part is not parallelized and is similar for the two versions. I don’t know what it does so it’s impossible to estimate its duration relative to the other operations. It might be another reason for the low speedup.

Regards

 
 04-23-2008, 11:23 PM 30253393 in reply to 30253297  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Just to add a few more comments to explain the likely reasons for your observation of lack of speed up.

In addition to what my colleagues have pointed out about calling an external function like rint: the code you've shown incurs significant overhead from calling scalar external functions. For the purpose of rounding, using another integer conversion technique might make more sense than throwing a bunch of rint calls at the end of each hand-vectorized SSE loop. Secondly, depending on the parameter "D" you use when testing, it is possible that in some portion of your loop iterations the rounding of floating-point to integer may experience exceptions. These can incur different amounts of delay in x87 code and in SSE code.

I did a quick test by simplifying your code somewhat to use the intrinsic of cvtps2pi for rounding. I also modified the scalar c code into two versions to compare the overhead of using rint vs. a simple type cast conversion.

Using a fixed value of D= 1024 (10^6 scalar loops, each loop has 2 mul, 1 add, 2 subtract), and compiled with simply /O2 on an ICC and MSC, 

the scalar distance calculation with rint took ~ 40 M cycles

a modified scalar calculation with type cast conversion took ~ 17 M cycles

the modified SSE code with _mm_cvtps_pi32 took ~ 5 M cycles.

There certainly will be variances with different compilers, using an external function like rint vs. type cast convert vs. intrinsic convert. The value of D you choose and the method of timing measurement will make further variance on your measured speedup.

For your reference, the modifications I made are based on accumulating the results of each evaluation of

 int dist(int i,int j)
{ float xd = X[i] - X[j]; 
 float yd = Y[i] - Y[j]; 
 int z = rint(xd*xd + yd*yd);
 return z;
}

 int distB(int i,int j)  // compare overhead of rint
{ float xd = X[i] - X[j]; 
 float yd = Y[i] - Y[j]; 
 return (int) (xd*xd + yd*yd);
}

The SSE version includes replacing the rint with _mm_cvtps_pi32 and a bunch of _mm_add_pi32, so that the different loop structure of vectorized code have the same amount of add relative to the double-nested loop of scalar code and accumulated result.


sjkuo
 
 04-24-2008, 9:15 AM 30253419 in reply to 30253342  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Jalo,

You may be looking at an old version of the program. The initial posts I put on the gcc-help list showed an AOS (Array Of Structures). Following the suggestions, I changed that to an SOA (Structure Of Arrays). I now have a few aligned arrays with all the data.

I am going to post a new version, which includes the latest suggestions I received here. You may want to take a look at it.

Thanks.
 
 04-24-2008, 9:29 AM 30253421 in reply to 30253393  

Re: (pain reloaded) Why performace isn't much better in euclidean distance with SSE2?

Jallo,

It seems that the rint function introduced a lot of noise ;-1 It was not an essential part of my problem, so I removed it for the sake of this proof of concept. I can now see a 30% reduction in execution time. But still, I would expect more, given that SSE2 is performing 4 operations at a time, right?

Unless maybe I am missing something ... as usual ;-|

Thanks for your attention (below c and assembler)

PS: I don't know why, but when I used an if to repeat the code for calling one function or the other, instead of a function pointer, the gain was worse than 30%. I know that using a function pointer may prevent inlining of the functions, so I tried to avoid it ... with the surprise that the SSE2 gain was smaller! That's the reason I kept that part. Although this may be an interesting point on its own, I think that's another topic, not related to this serial vs. vectorized debate.



#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>

int D;
float *X,*Y,*Z;

/* Scalar reference: writes into Z[j] the squared distance between point i
   and point j, for every j in [0, D). */
inline static void dist(int i)
{
  int j = 0;

  while (j < D)
  {
    const float dx = X[i] - X[j];
    const float dy = Y[i] - Y[j];

    Z[j] = dx*dx + dy*dy;
    ++j;
  }
}

/* SSE version: writes into Z[j] the squared distance between point i and
   point j for every j in [0, D), eight points per iteration. X, Y and Z are
   accessed with aligned loads/stores, so all three must be 16-byte aligned. */
inline static void dist_sse(int i)
{
  int j;

  /* X[i] and Y[i] replicated into all four lanes. */
  const __m128 xi = _mm_set1_ps(X[i]);
  const __m128 yi = _mm_set1_ps(Y[i]);

  for (j = 0; j < D; j += 8)
  {
    /* Per-axis differences for points j..j+3 (lo) and j+4..j+7 (hi). */
    const __m128 dx_lo = _mm_sub_ps(xi, _mm_load_ps(X + j));
    const __m128 dx_hi = _mm_sub_ps(xi, _mm_load_ps(X + j + 4));
    const __m128 dy_lo = _mm_sub_ps(yi, _mm_load_ps(Y + j));
    const __m128 dy_hi = _mm_sub_ps(yi, _mm_load_ps(Y + j + 4));

    /* Both sums are formed before either store, as in the original order. */
    const __m128 sum_lo = _mm_add_ps(_mm_mul_ps(dx_lo, dx_lo),
                                     _mm_mul_ps(dy_lo, dy_lo));
    const __m128 sum_hi = _mm_add_ps(_mm_mul_ps(dx_hi, dx_hi),
                                     _mm_mul_ps(dy_hi, dy_hi));

    _mm_store_ps(Z + j, sum_lo);
    _mm_store_ps(Z + j + 4, sum_hi);
  }
}

/*
 * Driver: parses <opc>, <D> and <debug>, allocates X, Y and Z, fills X and Y,
 * then for every point i runs either dist() (opc == 0) or dist_sse() (any
 * other value) through a function pointer. When debug is non-zero, the D
 * values left in Z are printed after each call.
 *
 * Returns 0 on success, 1 on a wrong argument count, 2 when D is rejected
 * and 3 when an allocation fails.
 */
int main(int argc, char * argv[])
{
  int i,j,opc,debug;
  int status = 0;
  void (*opc_func)(int);

  if ( argc != 4 )
  {
    fprintf(stderr,"\nUsage: %s <opc=0|1> <D> <debug=0|1>\n\n",argv[0]);
    return 1;
  }

  opc = atoi(argv[1]);
  D = atoi(argv[2]);
  debug = atoi(argv[3]);

  if ( D %8 != 0 )
  {
    fprintf(stderr,"\nDimension %d must be multiple of 8: \n\n",D);
    return 2;
  }

  /* A negative multiple of 8 passes the test above and would be converted
     to a huge size_t in the byte counts computed below. */
  if ( D < 0 )
  {
    fprintf(stderr,"\nDimension %d must not be negative\n\n",D);
    return 2;
  }

  /* dist_sse() uses aligned loads and stores, so it needs 16-byte aligned
     arrays; dist() works with plain malloc. */
  if ( opc == 0 )
  {
    X = (float *) malloc(D * sizeof(float));
    Y = (float *) malloc(D * sizeof(float));
    Z = (float *) malloc(D * sizeof(float));
  }
  else
  {
    X = (float *) _mm_malloc(D * sizeof(float), 16);
    Y = (float *) _mm_malloc(D * sizeof(float), 16);
    Z = (float *) _mm_malloc(D * sizeof(float), 16);
  }

  /* With D == 0 a null result is legitimate and nothing is dereferenced. */
  if ( D > 0 && ( X == NULL || Y == NULL || Z == NULL ) )
  {
    fprintf(stderr,"\nCould not allocate %d points\n\n",D);
    status = 3;
  }
  else
  {
    for(i=0;i<D;i++)
    {
      X[i] = i;
      Y[i] = D - i;
    }

    opc_func = opc == 0? dist : dist_sse;
    for(i=0;i<D;i++)
    {
      opc_func(i);
      if ( debug )
      {
        for(j=0; j<D; j++)
          printf("%f\n",Z[j]);
      }
    }
  }

  /* Release each buffer with the deallocator matching its allocator. */
  if ( opc == 0 )
  {
    free(X);
    free(Y);
    free(Z);
  }
  else
  {
    if ( X != NULL )
      _mm_free(X);
    if ( Y != NULL )
      _mm_free(Y);
    if ( Z != NULL )
      _mm_free(Z);
  }

  return status;
}


    .file    "kk.c"
    .text
    .p2align 4,,15
    .def    _dist;    .scl    3;    .type    32;    .endef
# dist(i) from the C listing above, compiled to scalar x87 code.
# Registers: %edi = i, %esi = D, %ecx = X, %edx = Y, %ebx = Z, %eax = j.
_dist:
    pushl    %ebp
    xorl    %eax, %eax
    movl    %esp, %ebp
    pushl    %edi
    movl    8(%ebp), %edi
    pushl    %esi
    movl    _D, %esi
    pushl    %ebx
    cmpl    %esi, %eax
    jge    L7
    movl    _X, %ecx
    movl    _Y, %edx
    movl    _Z, %ebx
    .p2align 4,,15
    # One element per iteration:
    #   Z[j] = (X[i]-X[j])^2 + (Y[i]-Y[j])^2
    # X[i] and Y[i] are read from memory again on every pass (fsubrs).
L5:
    flds    (%ecx,%eax,4)
    flds    (%edx,%eax,4)
    fxch    %st(1)
    fsubrs    (%ecx,%edi,4)
    fxch    %st(1)
    fsubrs    (%edx,%edi,4)
    fxch    %st(1)
    fmul    %st(0), %st
    fxch    %st(1)
    fmul    %st(0), %st
    faddp    %st, %st(1)
    fstps    (%ebx,%eax,4)
    incl    %eax
    cmpl    %esi, %eax
    jl    L5
L7:
    popl    %ebx
    popl    %esi
    popl    %edi
    popl    %ebp
    ret
    .p2align 4,,15
    .def    _dist_sse;    .scl    3;    .type    32;    .endef
# dist_sse(i) from the C listing above: eight squared distances per loop
# iteration using packed single-precision SSE instructions.
_dist_sse:
    pushl    %ebp
    movl    %esp, %ebp
    pushl    %esi
    pushl    %ebx
    subl    $8, %esp
    movl    8(%ebp), %esi
    movl    _X, %ebx
    movl    _Y, %ecx
    # X[i] and Y[i] are copied through integer registers to stack slots,
    # loaded with movss and broadcast to all four lanes with shufps $0:
    # %xmm5 = X[i] x4, %xmm4 = Y[i] x4. %esi is then reused as j = 0.
    movl    (%ebx,%esi,4), %edx
    movl    %edx, -12(%ebp)
    movl    (%ecx,%esi,4), %eax
    xorl    %esi, %esi
    movss    -12(%ebp), %xmm5
    movl    %eax, -16(%ebp)
    movss    -16(%ebp), %xmm4
    cmpl    _D, %esi
    shufps    $0, %xmm5, %xmm5
    shufps    $0, %xmm4, %xmm4
    jl    L36
    jmp    L34
    .p2align 4,,7
    # Loop back-edge target: the X and Y pointers are reloaded from their
    # globals on every iteration.
L35:
    movl    _X, %ebx
    movl    _Y, %ecx
    # Loop body: %edx = j*4, %eax = &X[j], %ebx = &Y[j]; j advances by 8.
    # The first four results are stored to Z[j..j+3], the next four to
    # Z[j+4..j+7]. _Z is read from memory twice per iteration and the loop
    # bound D is compared directly against memory.
L36:
    leal    0(,%esi,4), %edx
    movaps    %xmm5, %xmm1
    leal    (%edx,%ebx), %eax
    movaps    (%eax), %xmm2
    leal    (%edx,%ecx), %ebx
    movaps    %xmm5, %xmm6
    movaps    (%ebx), %xmm0
    addl    $8, %esi
    movaps    16(%eax), %xmm3
    subps    %xmm2, %xmm1
    movl    _Z, %eax
    movaps    16(%ebx), %xmm7
    movaps    %xmm1, %xmm2
    movaps    %xmm4, %xmm1
    subps    %xmm0, %xmm1
    movl    %edx, %ebx
    movaps    %xmm1, %xmm0
    mulps    %xmm2, %xmm2
    mulps    %xmm0, %xmm0
    subps    %xmm3, %xmm6
    addl    %eax, %ebx
    addps    %xmm0, %xmm2
    movaps    %xmm6, %xmm3
    movaps    %xmm2, (%ebx)
    movaps    %xmm4, %xmm6
    subps    %xmm7, %xmm6
    movl    _Z, %ecx
    movaps    %xmm6, %xmm1
    mulps    %xmm3, %xmm3
    mulps    %xmm6, %xmm1
    addps    %xmm1, %xmm3
    addl    %ecx, %edx
    movaps    %xmm3, 16(%edx)
    cmpl    _D, %esi
    jl    L35
L34:
    addl    $8, %esp
    popl    %ebx
    popl    %esi
    popl    %ebp
    ret
    .def    ___main;    .scl    2;    .type    32;    .endef
    .section .rdata,"dr"
    .align 4
LC1:
    .ascii "\12Usage: %s <opc=0|1> <D> <debug=0|1>\12\12\0"
LC3:
    .ascii "%f\12\0"
    .align 4
LC2:
    .ascii "\12Dimension %d must be multiple of 8: \12\12\0"
    .text
    .p2align 4,,15
.globl _main
    .def    _main;    .scl    2;    .type    32;    .endef
# main() from the second C listing. dist() and dist_sse() are reached through
# an indirect call (call *%edi), not inlined.
_main:
    pushl    %ebp
    movl    $16, %eax
    movl    %esp, %ebp
    pushl    %edi
    pushl    %esi
    pushl    %ebx
    subl    $28, %esp
    movl    12(%ebp), %ebx
    andl    $-16, %esp
    call    __alloca
    call    ___main
    # argc (8(%ebp)) must be 4; otherwise print the LC1 usage text, return 1.
    cmpl    $4, 8(%ebp)
    je    L38
    call    ___getreent
    movl    (%ebx), %esi
    movl    $LC1, %ecx
    movl    %ecx, 4(%esp)
    movl    %esi, 8(%esp)
    movl    12(%eax), %edx
    movl    %edx, (%esp)
    call    _fprintf
    movl    $1, %eax
    leal    -12(%ebp), %esp
    # Shared epilogue for the two error returns (%eax holds the status).
L99:
    popl    %ebx
    popl    %esi
    popl    %edi
    popl    %ebp
    ret
    .p2align 4,,7
    # Three atoi() calls: opc -> -16(%ebp), D -> _D, debug -> -20(%ebp).
L38:
    movl    4(%ebx), %edx
    movl    %edx, (%esp)
    call    _atoi
    movl    %eax, -16(%ebp)
    movl    8(%ebx), %eax
    movl    %eax, (%esp)
    call    _atoi
    movl    %eax, _D
    movl    12(%ebx), %edi
    movl    %edi, (%esp)
    call    _atoi
    movl    %eax, -20(%ebp)
    movl    _D, %ecx
    # Any of the low three bits of D set -> not a multiple of 8 -> L89.
    testb    $7, %cl
    jne    L89
    # opc == 0 -> plain malloc path at L90. Otherwise the aligned allocation
    # is expanded inline (L91 for X, L92 for Y, L93 for Z), each skipped when
    # the byte count is 0.
    movl    -16(%ebp), %edi
    testl    %edi, %edi
    je    L90
    movl    %ecx, %eax
    xorl    %edx, %edx
    sall    $2, %eax
    jne    L91
L43:
    movl    %edx, _X
    movl    %ecx, %eax
    xorl    %edx, %edx
    sall    $2, %eax
    jne    L92
L48:
    movl    %edx, _Y
    movl    %ecx, %eax
    xorl    %edx, %edx
    sall    $2, %eax
    jne    L93
L53:
    movl    %edx, _Z
    # Fill loop with i in %edx: X[i] = (float) i, Y[i] = (float) (D - i).
L41:
    xorl    %edx, %edx
    cmpl    %ecx, %edx
    jge    L73
    movl    _X, %esi
    movl    _Y, %ebx
    .p2align 4,,15
L60:
    movl    %ecx, %eax
    cvtsi2ss    %edx, %xmm1
    subl    %edx, %eax
    cvtsi2ss    %eax, %xmm0
    movss    %xmm1, (%esi,%edx,4)
    movss    %xmm0, (%ebx,%edx,4)
    incl    %edx
    cmpl    %ecx, %edx
    jl    L60
    # Select the routine: %edi = _dist, replaced by _dist_sse when opc != 0.
L73:
    movl    -16(%ebp), %ebx
    movl    $_dist, %edi
    movl    $_dist_sse, %edx
    testl    %ebx, %ebx
    cmovne    %edx, %edi
    xorl    %esi, %esi
    cmpl    %ecx, %esi
    jge    L75
    .p2align 4,,15
    # Outer loop, i in %esi: indirect call with i as the stack argument, then
    # test the debug flag; debug == 0 continues at L77.
L101:
    movl    %esi, (%esp)
    call    *%edi
    movl    -20(%ebp), %ecx
    testl    %ecx, %ecx
    je    L77
    movl    _D, %eax
    xorl    %ebx, %ebx
    cmpl    %eax, %ebx
    jge    L65
    .p2align 4,,15
    # Debug print loop, j in %ebx: Z[j] is widened to double on the x87 stack
    # and passed to printf with the LC3 format.
L100:
    movl    _Z, %eax
    flds    (%eax,%ebx,4)
    incl    %ebx
    movl    $LC3, (%esp)
    fstpl    4(%esp)
    call    _printf
    movl    _D, %eax
    cmpl    %eax, %ebx
    jl    L100
L65:
    incl    %esi
    cmpl    %eax, %esi
L102:
    jl    L101
    # Successful exit: return 0.
L75:
    leal    -12(%ebp), %esp
    xorl    %eax, %eax
    popl    %ebx
    popl    %esi
    popl    %edi
    popl    %ebp
    ret
    # opc == 0: X, Y and Z each come from malloc(D*4); then join the fill
    # loop at L41.
L90:
    leal    0(,%ecx,4), %ebx
    movl    %ebx, (%esp)
    call    _malloc
    movl    %eax, _X
    movl    _D, %edx
    sall    $2, %edx
    movl    %edx, (%esp)
    call    _malloc
    movl    %eax, _Y
    movl    _D, %eax
    sall    $2, %eax
    movl    %eax, (%esp)
    call    _malloc
    movl    %eax, _Z
    movl    _D, %ecx
    jmp    L41
    .p2align 4,,7
    # debug == 0: advance i and rejoin the loop test at L102.
L77:
    movl    _D, %eax
    incl    %esi
    cmpl    %eax, %esi
    jmp    L102
    # Inlined aligned allocation for Z: malloc(bytes + 16), round the result
    # up to a 16-byte boundary and keep the raw malloc pointer in the 4 bytes
    # just below the aligned address. A null result is passed through.
L93:
    addl    $16, %eax
    movl    %eax, (%esp)
    call    _malloc
    testl    %eax, %eax
    movl    %eax, %edx
    je    L86
    leal    16(%eax), %edi
    andl    $-16, %edi
    movl    %edi, %edx
    movl    %eax, -4(%edi)
L86:
    movl    _D, %ecx
    jmp    L53
    # Same inlined aligned allocation, for Y.
L92:
    addl    $16, %eax
    movl    %eax, (%esp)
    call    _malloc
    testl    %eax, %eax
    movl    %eax, %edx
    je    L85
    leal    16(%eax), %esi
    andl    $-16, %esi
    movl    %esi, %edx
    movl    %eax, -4(%esi)
L85:
    movl    _D, %ecx
    jmp    L48
    # Same inlined aligned allocation, for X.
L91:
    addl    $16, %eax
    movl    %eax, (%esp)
    call    _malloc
    testl    %eax, %eax
    movl    %eax, %edx
    je    L84
    leal    16(%eax), %ecx
    andl    $-16, %ecx
    movl    %ecx, %edx
    movl    %eax, -4(%ecx)
L84:
    movl    _D, %ecx
    jmp    L43
    # D is not a multiple of 8: print the LC2 message and return 2 through
    # the shared epilogue at L99.
L89:
    call    ___getreent
    movl    _D, %esi
    movl    $LC2, %ecx
    movl    %ecx, 4(%esp)
    movl    %esi, 8(%esp)
    movl    12(%eax), %ebx
    movl    %ebx, (%esp)
    call    _fprintf
    movl    $2, %eax
    leal    -12(%ebp), %esp
    jmp    L99
    .comm    _D, 16     # 4
    .comm    _X, 16     # 4
    .comm    _Y, 16     # 4
    .comm    _Z, 16     # 4
    .def    _printf;    .scl    3;    .type    32;    .endef
    .def    _atoi;    .scl    3;    .type    32;    .endef
    .def    ___getreent;    .scl    3;    .type    32;    .endef
    .def    _fprintf;    .scl    3;    .type    32;    .endef
    .def    _malloc;    .scl    3;    .type    32;    .endef

 
View as RSS news feed in XML

Shortcuts


Tags For This Post

...

Community Tags

...