|
|
(pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Last post 04-24-2008, 9:29 AM by dario.mx@gmail.com. 9 replies.
|
|
(pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Hello, I am trying to improve the performance of a very simple program by using SSE2. The program snippet only needs to calculate the distance between all pairs of a set of N points (I know I could take advantage of symmetry, but for now I am focusing on low-level vectorization). I am using intrinsics and have examined the generated assembler a bit (though I am far from being an expert), and it looks fine. However, the run time is only 10% less than the original one. What am I missing? Thanks. The program is the following: #include <stdio.h> #include <stdlib.h> #include <xmmintrin.h> #include <math.h>
/* Request 16-byte alignment for a variable, as needed by the aligned SSE
 * load/store intrinsics (_mm_load_ps / _mm_store_ps). GCC attribute syntax. */
#define SSE2_ALIGNED __attribute__ ((aligned (16)))

/* Debug variant: print the value converted to int, one per line.
 * The argument is parenthesized so the cast applies to the whole expression. */
#define print_y(x) printf("%d\n", (int) (x))
/* Timing variant: evaluate the argument and discard the result. */
#define print_n(x) (x)
/* Active variant; point this at print_y to check results. */
#define print print_n

/* D: number of points; X, Y: coordinate arrays holding D floats each. */
int D;
float *X, *Y;
/*
 * Squared Euclidean distance between points i and j, rounded with rint()
 * and handed to the print() macro.
 */
inline static void dist(int i,int j)
{
    const float dx = X[i] - X[j];
    const float dy = Y[i] - Y[j];

    print(rint(dx * dx + dy * dy));
}
inline static void dist_sse(int i) { float d[8] SSE2_ALIGNED; int j; __m128 xmm0 = _mm_set1_ps(X[i]); __m128 xmm1 = xmm0; __m128 xmm2 = _mm_set1_ps(Y[i]); __m128 xmm3 = xmm2; __m128 xmm4,xmm5,xmm6,xmm7; for(j=0; j<D;j+=8) { xmm4 =_mm_load_ps(X+j); xmm5 =_mm_load_ps(X+j+4); xmm6 =_mm_load_ps(Y+j); xmm7 =_mm_load_ps(Y+j+4); xmm4 = _mm_sub_ps(xmm0,xmm4); xmm5 = _mm_sub_ps(xmm1,xmm5); xmm6 = _mm_sub_ps(xmm2,xmm6); xmm7 = _mm_sub_ps(xmm3,xmm7); xmm4 = _mm_mul_ps(xmm4,xmm4); xmm5 = _mm_mul_ps(xmm5,xmm5); xmm6 = _mm_mul_ps(xmm6,xmm6); xmm7 = _mm_mul_ps(xmm7,xmm7); xmm4 = _mm_add_ps(xmm4,xmm6); xmm5 = _mm_add_ps(xmm5,xmm7); _mm_store_ps(d,xmm4); _mm_store_ps(d+4,xmm5); print(rint(d[0])); print(rint(d[1])); print(rint(d[2])); print(rint(d[3])); print(rint(d[4])); print(rint(d[5])); print(rint(d[6])); print(rint(d[7])); } }
/*
 * Driver: <opc=0|1> selects the scalar (0) or SSE (non-zero) kernel,
 * <D> is the number of points and must be a multiple of 8.
 *
 * Returns 0 on success, 1 on bad usage, 2 on a bad dimension and
 * 3 when the coordinate arrays cannot be allocated.
 */
int main(int argc, char * argv[])
{
    int i,j,opc;
    int rc = 0;

    if ( argc != 3 ) {
        fprintf(stderr,"\nUsage: %s <opc=0|1> <D>\n\n",argv[0]);
        return 1;
    }

    opc = atoi(argv[1]);
    D = atoi(argv[2]);

    if ( D %8 != 0 ) {
        fprintf(stderr,"\nDimension %d must be multiple of 8: \n\n",D);
        return 2;
    }

    /* The SSE kernel uses aligned loads, so it gets 16-byte aligned buffers. */
    if ( opc == 0 ) {
        X = malloc(D * sizeof *X);
        Y = malloc(D * sizeof *Y);
    } else {
        X = _mm_malloc(D * sizeof *X, 16);
        Y = _mm_malloc(D * sizeof *Y, 16);
    }

    /* A zero-size request may legitimately yield NULL, so only D > 0 is an error. */
    if ( D > 0 && (X == NULL || Y == NULL) ) {
        fprintf(stderr,"\nCould not allocate memory for %d points\n\n",D);
        rc = 3;
        goto cleanup;
    }

    for(i=0;i<D;i++) {
        X[i] = i;
        Y[i] = D - i;
    }

    if ( opc == 0 ) {
        for(i=0;i<D;i++) {
            for(j=0;j<D;j++) {
                dist(i,j);
            }
        }
    } else {
        for(i=0;i<D;i++) {
            dist_sse(i);
        }
    }

cleanup:
    /* Release each buffer with the deallocator matching its allocator. */
    if ( opc == 0 ) {
        free(X);
        free(Y);
    } else {
        _mm_free(X);
        _mm_free(Y);
    }
    X = NULL;
    Y = NULL;

    return rc;
}
I am compiling with: CC = gcc CFLAGS = -O3 -Wall -march=pentium-m -msse2
all: kk
And generated assembler is:
.file "kk.c" .def ___main; .scl 2; .type 32; .endef .section .rdata,"dr" LC0: .ascii "\12Usage: %s <opc=0|1> <D>\12\12\0" .align 4 LC1: .ascii "\12Dimension %d must be multiple of 8: \12\12\0" .text .p2align 4,,15 .globl _main .def _main; .scl 2; .type 32; .endef _main: pushl %ebp movl $16, %eax movl %esp, %ebp pushl %edi pushl %esi pushl %ebx subl $108, %esp movl 12(%ebp), %ebx andl $-16, %esp call __alloca call ___main cmpl $3, 8(%ebp) je L2 call ___getreent movl (%ebx), %esi movl $LC0, %ecx movl %ecx, 4(%esp) movl %esi, 8(%esp) movl 12(%eax), %edx movl %edx, (%esp) call _fprintf movl $1, %eax leal -12(%ebp), %esp L90: popl %ebx popl %esi popl %edi popl %ebp ret L2: movl 4(%ebx), %edi movl %edi, (%esp) call _atoi movl %eax, %edi movl 8(%ebx), %eax movl %eax, (%esp) call _atoi movl %eax, _D testb $7, %al movl %eax, %ecx jne L82 testl %edi, %edi je L83 xorl %edx, %edx sall $2, %eax jne L84 L7: movl %edx, _X movl %ecx, %eax xorl %edx, %edx sall $2, %eax jne L85 L12: movl %edx, _Y movl %edx, %ebx L5: xorl %edx, %edx cmpl %ecx, %edx jge L59 movl _X, %esi .p2align 4,,15 L19: movl %ecx, %eax cvtsi2ss %edx, %xmm1 subl %edx, %eax cvtsi2ss %eax, %xmm0 movss %xmm1, (%esi,%edx,4) movss %xmm0, (%ebx,%edx,4) incl %edx cmpl %ecx, %edx jl L19 L59: testl %edi, %edi jne L20 xorl %esi, %esi cmpl %ecx, %esi jge L30 .p2align 4,,15 L92: xorl %ebx, %ebx cmpl %ecx, %ebx jge L63 .p2align 4,,15 L91: movl _X, %edx movl _Y, %edi flds (%edx,%ebx,4) flds (%edi,%ebx,4) fxch %st(1) incl %ebx fsubrs (%edx,%esi,4) fxch %st(1) fsubrs (%edi,%esi,4) fxch %st(1) fmul %st(0), %st fxch %st(1) fmul %st(0), %st faddp %st, %st(1) fstpl (%esp) call _rint fstp %st(0) movl _D, %ecx cmpl %ecx, %ebx jl L91 L63: incl %esi cmpl %ecx, %esi jl L92 L30: leal -12(%ebp), %esp xorl %eax, %eax popl %ebx popl %esi popl %edi popl %ebp ret L83: leal 0(,%eax,4), %eax movl %eax, (%esp) call _malloc movl %eax, _X movl _D, %esi sall $2, %esi movl %esi, (%esp) call _malloc movl %eax, _Y movl _D, %ecx movl %eax, %ebx jmp L5 
L20: xorl %edi, %edi cmpl %ecx, %edi jge L30 .p2align 4,,15 L75: movl _X, %edx movl (%edx,%edi,4), %eax movl %eax, -60(%ebp) movl (%ebx,%edi,4), %esi movss -60(%ebp), %xmm2 movl %esi, -64(%ebp) xorl %esi, %esi shufps $0, %xmm2, %xmm2 movss -64(%ebp), %xmm4 cmpl %ecx, %esi movaps %xmm2, -88(%ebp) shufps $0, %xmm4, %xmm4 movaps %xmm4, -104(%ebp) jl L76 jmp L66 .p2align 4,,7 L67: movl _X, %edx movl _Y, %ebx L76: movaps -88(%ebp), %xmm7 leal 0(,%esi,4), %ecx leal (%ecx,%edx), %edx movaps (%edx), %xmm5 addl %ebx, %ecx addl $8, %esi movaps (%ecx), %xmm6 movaps -104(%ebp), %xmm4 subps %xmm5, %xmm7 movaps %xmm7, %xmm5 movaps 16(%edx), %xmm3 mulps %xmm5, %xmm5 subps %xmm6, %xmm4 movaps 16(%ecx), %xmm1 movaps %xmm4, %xmm7 mulps %xmm4, %xmm7 movaps -88(%ebp), %xmm2 addps %xmm7, %xmm5 movaps -104(%ebp), %xmm0 movaps %xmm5, -56(%ebp) flds -56(%ebp) subps %xmm3, %xmm2 subps %xmm1, %xmm0 movaps %xmm2, %xmm3 movaps %xmm0, %xmm6 mulps %xmm2, %xmm3 mulps %xmm0, %xmm6 addps %xmm6, %xmm3 movaps %xmm3, -40(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -52(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -48(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -44(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -40(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -36(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -32(%ebp) fstpl (%esp) call _rint fstp %st(0) flds -28(%ebp) fstpl (%esp) call _rint fstp %st(0) movl _D, %ecx cmpl %ecx, %esi jl L67 L66: incl %edi cmpl %ecx, %edi jge L30 movl _Y, %ebx jmp L75 L85: addl $16, %eax movl %eax, (%esp) call _malloc testl %eax, %eax movl %eax, %edx je L78 leal 16(%eax), %ecx andl $-16, %ecx movl %ecx, %edx movl %eax, -4(%ecx) L78: movl _D, %ecx jmp L12 L84: addl $16, %eax movl %eax, (%esp) call _malloc testl %eax, %eax movl %eax, %edx je L77 leal 16(%eax), %ebx andl $-16, %ebx movl %ebx, %edx movl %eax, -4(%ebx) L77: movl _D, %ecx jmp L7 L82: call ___getreent movl _D, %ecx movl $LC1, %edx movl %edx, 4(%esp) movl %ecx, 8(%esp) movl 12(%eax), %ebx movl %ebx, 
(%esp) call _fprintf movl $2, %eax leal -12(%ebp), %esp jmp L90 .comm _D, 16 # 4 .comm _X, 16 # 4 .comm _Y, 16 # 4 .def _atoi; .scl 3; .type 32; .endef .def ___getreent; .scl 3; .type 32; .endef .def _fprintf; .scl 3; .type 32; .endef .def _rint; .scl 3; .type 32; .endef .def _malloc; .scl 3; .type 32; .endef
|
|
| |
-
tim18
-
-
-
Joined on 12-27-2003
-
-
Posts 3,323
-
-
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
This topic had already been covered extensively in the following public e-mail thread: http://gcc.gnu.org/ml/gcc-help/2008-04/msg00073.html It was never clear why a major speedup was expected with printf() in the inner loop, nor whether a vectorizing compiler such as g++ 4.3 or icpc was tried (with printf removed so as to attempt auto-vectorization). C99 math functions (e.g. rint) presumably are available in g++, with -std=gnu99.
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Will read the post ... but meanwhile I can tell you that the printf is conditionally removed from the code. I used it only for debugging the correctness of the vectorized version; of course, for the real timing, I removed it.
thanks.
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
LOL
The post you mentioned is just the one I made first, before coming here to the Intel forums !!!
With "extensively covered" you mean the topic was left unanswered? ;-1 Because that's what happened. I applied all the suggestions I got, and still saw no significant improvement. The modified version is the one I posted here, at Intel.
There you have the assembler generated by GCC ... there we can see the SSE2 instructions. So, what is wrong with this picture?!
I would expect that, if there is one place on earth with people being experts in Intel assembler, that place must be here ... hehe. So, please check the assembler and tell me what is wrong.
Thanks.
|
|
| |
-
tim18
-
-
-
Joined on 12-27-2003
-
-
Posts 3,323
-
-
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
People on the gcc-help list did their best to help you define what you wanted to do. If you wanted only to make a non-vector cross between a macro and inline function of a style understood only by gcc, with parallel SSE inside that function, perhaps what you showed was what you wanted. Otherwise, you could have posted a shorter example, which could be compiled by standard compilers, illustrating your interest. The rint() function is not recognized as a vectorizable function by either gcc or icc. If you meant it as a substitute for sqrtf(), the latter can be vectorized in line by icc, but apparently not yet by gcc.
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Certainly I appreciate all the advice the gcc-help community provided me. Indeed, I followed all of it.
I was not asking for a way to achieve the vectorization ... I already did it. Neither was I looking for automatic vectorization ... I coded it manually. We can see the assembler there, generated directly from intrinsic functions ... it includes SSE2 of course (thus, this is not a problem of whether the compiler automatically vectorized or not).
What I am asking for, kindly of course hehe, is for advice about why my manually coded vector version is not improving significantly the performance (it offers a gain of 10% in runtime, which is very poor considering I am vectorizing the whole thing ... I would expect a gain of 75%)
Thanks.
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Hi,
Try changing the allocation scheme to allocating one aligned block for all the data instead of many allocations. This will improve data locality in the caches and might gain performance.
The code here uses rint(). This part is not parallelized and is similar for the two versions. I don’t know what it does so it’s impossible to estimate its duration relative to the other operations. It might be another reason for the low speedup.
Regards
|
|
| |
-
sjkuo
-
-
-
Joined on 12-07-2006
-
-
Posts 25
-
-
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Just to add a few more comments to explain the likely reasons for your observation of lack of speed up.
In addition to what my colleagues have pointed out about calling an external function like rint: the code you've shown incurs significant overhead from calling scalar external functions. For the purpose of rounding, using another integer conversion technique might make more sense than throwing a bunch of rint's at the end of each hand-vectorized SSE loop. Secondly, depending on the parameter "D" you use when testing, it is possible that in some portion of your loop iterations the rounding of floating-point to integer may experience exceptions. That can cause different amounts of delay between x87 code and SSE code.
I did a quick test by simplifying your code somewhat to use the intrinsic of cvtps2pi for rounding. I also modified the scalar c code into two versions to compare the overhead of using rint vs. a simple type cast conversion.
Using a fixed value of D= 1024 (10^6 scalar loops, each loop has 2 mul, 1 add, 2 subtract), and compiled with simply /O2 on an ICC and MSC,
the scalar distance calculation with rint took ~ 40 M cycles
a modified scalar calculation with type cast conversion took ~ 17 M cycles
the modified SSE code with _mm_cvtps_pi32 took ~ 5 M cycles.
There certainly will be variances with different compilers, using an external function like rint vs. type cast convert vs. intrinsic convert. The value of D you choose and the method of timing measurement will make further variance on your measured speedup.
For your reference, the modifications I made are based on accumulating the results of each evaluation of
/* Squared distance between points i and j, rounded by rint() and converted to int. */
int dist(int i,int j)
{
    const float dx = X[i] - X[j];
    const float dy = Y[i] - Y[j];

    return (int) rint(dx * dx + dy * dy);
}
int distB(int i,int j) // compare overhead of rint { float xd = X[i] - X[j]; float yd = Y[i] - Y[j]; return (int) (xd*xd + yd*yd); }
The SSE version includes replacing the rint with _mm_cvtps_pi32 and a bunch of _mm_add_pi32, so that the different loop structure of vectorized code have the same amount of add relative to the double-nested loop of scalar code and accumulated result.
sjkuo
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Jalo,
You may be looking at an old version of the program. The initial posts I put on the gcc-help list showed an AOS (Array Of Structures). Following the suggestions, I changed that to an SOA (Structure Of Arrays). I now have a few aligned arrays with all the data.
I am going to post new version, which included latests suggestions I received here. You may wanna take a look at it.
Thanks.
|
|
| |
|
|
Re: (pain reloaded) Why performance isn't much better in euclidean distance with SSE2?
Jallo,
It seems that the rint function introduced a lot of noise ;-1 It was not an essential part of my problem, so I removed it for the sake of this proof of concept. I can now see a gain of 30% less execution time. But still, I would expect more, given that SSE2 is performing 4 operations at a time, right?
Unless maybe, I am loosing something ... as usual ;-|
Thanks for your attention (below c and assembler)
PS: Dunno why, but when I used an if to repeat the code for calling one function or the other, instead of a function pointer, the gain was worse than 30%. I know that using a function pointer may prevent inlining the functions, so I tried to avoid its usage ... with the surprise that the SSE2 gain was smaller! That's the reason I kept that part. Although this may be an interesting point on its own, I think that's another topic, not related to this serial vs vectorized debate.
#include <stdio.h> #include <stdlib.h> #include <xmmintrin.h>
/* D: number of points (set from argv[2] in main).
 * X, Y: input coordinate arrays; Z: output array written by dist()/dist_sse().
 * main allocates each array with room for D floats. */
int D; float *X,*Y,*Z;
/*
 * Scalar kernel: store in Z[j] the squared distance from point i to
 * point j, for every j in [0, D).
 */
inline static void dist(int i)
{
    int j = 0;

    while (j < D) {
        const float dx = X[i] - X[j];
        const float dy = Y[i] - Y[j];

        Z[j] = dx * dx + dy * dy;
        j++;
    }
}
/*
 * SSE kernel: store in Z[j] the squared distance from point i to point j,
 * for every j in [0, D), processing eight points per iteration.
 *
 * Uses aligned loads/stores (_mm_load_ps / _mm_store_ps) on X, Y and Z and
 * steps j by 8 with no remainder handling.
 *
 * The global pointers and the count are copied into locals first. In the
 * posted assembly the compiler re-reads _X, _Y, _Z and _D from memory on
 * every iteration; locals whose address is never taken can stay in
 * registers for the whole loop.
 */
inline static void dist_sse(int i)
{
    const float *xs = X;
    const float *ys = Y;
    float *zs = Z;
    const int n = D;
    const __m128 xi = _mm_set1_ps(xs[i]);   /* X[i] in all four lanes */
    const __m128 yi = _mm_set1_ps(ys[i]);   /* Y[i] in all four lanes */
    int j;

    for (j = 0; j < n; j += 8) {
        /* Coordinate differences against point i, two 4-float groups each.
         * All four loads complete before either store below. */
        const __m128 dx_lo = _mm_sub_ps(xi, _mm_load_ps(xs + j));
        const __m128 dx_hi = _mm_sub_ps(xi, _mm_load_ps(xs + j + 4));
        const __m128 dy_lo = _mm_sub_ps(yi, _mm_load_ps(ys + j));
        const __m128 dy_hi = _mm_sub_ps(yi, _mm_load_ps(ys + j + 4));

        /* dx*dx + dy*dy per lane, written straight to the output array. */
        _mm_store_ps(zs + j,
                     _mm_add_ps(_mm_mul_ps(dx_lo, dx_lo),
                                _mm_mul_ps(dy_lo, dy_lo)));
        _mm_store_ps(zs + j + 4,
                     _mm_add_ps(_mm_mul_ps(dx_hi, dx_hi),
                                _mm_mul_ps(dy_hi, dy_hi)));
    }
}
/*
 * Driver: <opc=0|1> selects the scalar (0) or SSE (non-zero) kernel,
 * <D> is the number of points and must be a multiple of 8, and a non-zero
 * <debug> prints the Z array after each kernel call.
 *
 * Returns 0 on success, 1 on bad usage, 2 on a bad dimension and
 * 3 when the arrays cannot be allocated.
 */
int main(int argc, char * argv[])
{
    int i,j,opc,debug;
    int rc = 0;
    void (*opc_func)(int);

    if ( argc != 4 ) {
        fprintf(stderr,"\nUsage: %s <opc=0|1> <D> <debug=0|1>\n\n",argv[0]);
        return 1;
    }

    opc = atoi(argv[1]);
    D = atoi(argv[2]);
    debug = atoi(argv[3]);

    if ( D %8 != 0 ) {
        fprintf(stderr,"\nDimension %d must be multiple of 8: \n\n",D);
        return 2;
    }

    /* The SSE kernel uses aligned loads/stores, so it gets 16-byte aligned buffers. */
    if ( opc == 0 ) {
        X = malloc(D * sizeof *X);
        Y = malloc(D * sizeof *Y);
        Z = malloc(D * sizeof *Z);
    } else {
        X = _mm_malloc(D * sizeof *X, 16);
        Y = _mm_malloc(D * sizeof *Y, 16);
        Z = _mm_malloc(D * sizeof *Z, 16);
    }

    /* A zero-size request may legitimately yield NULL, so only D > 0 is an error. */
    if ( D > 0 && (X == NULL || Y == NULL || Z == NULL) ) {
        fprintf(stderr,"\nCould not allocate memory for %d points\n\n",D);
        rc = 3;
        goto cleanup;
    }

    for(i=0;i<D;i++) {
        X[i] = i;
        Y[i] = D - i;
    }

    /* Kept as a function pointer on purpose: the kernel is chosen once. */
    opc_func = opc == 0? dist : dist_sse;

    for(i=0;i<D;i++) {
        opc_func(i);
        if ( debug ) {
            for(j=0; j<D; j++) {
                printf("%f\n",Z[j]);
            }
        }
    }

cleanup:
    /* Release each buffer with the deallocator matching its allocator. */
    if ( opc == 0 ) {
        free(X);
        free(Y);
        free(Z);
    } else {
        _mm_free(X);
        _mm_free(Y);
        _mm_free(Z);
    }
    X = NULL;
    Y = NULL;
    Z = NULL;

    return rc;
}
.file "kk.c" .text .p2align 4,,15 .def _dist; .scl 3; .type 32; .endef _dist: pushl %ebp xorl %eax, %eax movl %esp, %ebp pushl %edi movl 8(%ebp), %edi pushl %esi movl _D, %esi pushl %ebx cmpl %esi, %eax jge L7 movl _X, %ecx movl _Y, %edx movl _Z, %ebx .p2align 4,,15 L5: flds (%ecx,%eax,4) flds (%edx,%eax,4) fxch %st(1) fsubrs (%ecx,%edi,4) fxch %st(1) fsubrs (%edx,%edi,4) fxch %st(1) fmul %st(0), %st fxch %st(1) fmul %st(0), %st faddp %st, %st(1) fstps (%ebx,%eax,4) incl %eax cmpl %esi, %eax jl L5 L7: popl %ebx popl %esi popl %edi popl %ebp ret .p2align 4,,15 .def _dist_sse; .scl 3; .type 32; .endef _dist_sse: pushl %ebp movl %esp, %ebp pushl %esi pushl %ebx subl $8, %esp movl 8(%ebp), %esi movl _X, %ebx movl _Y, %ecx movl (%ebx,%esi,4), %edx movl %edx, -12(%ebp) movl (%ecx,%esi,4), %eax xorl %esi, %esi movss -12(%ebp), %xmm5 movl %eax, -16(%ebp) movss -16(%ebp), %xmm4 cmpl _D, %esi shufps $0, %xmm5, %xmm5 shufps $0, %xmm4, %xmm4 jl L36 jmp L34 .p2align 4,,7 L35: movl _X, %ebx movl _Y, %ecx L36: leal 0(,%esi,4), %edx movaps %xmm5, %xmm1 leal (%edx,%ebx), %eax movaps (%eax), %xmm2 leal (%edx,%ecx), %ebx movaps %xmm5, %xmm6 movaps (%ebx), %xmm0 addl $8, %esi movaps 16(%eax), %xmm3 subps %xmm2, %xmm1 movl _Z, %eax movaps 16(%ebx), %xmm7 movaps %xmm1, %xmm2 movaps %xmm4, %xmm1 subps %xmm0, %xmm1 movl %edx, %ebx movaps %xmm1, %xmm0 mulps %xmm2, %xmm2 mulps %xmm0, %xmm0 subps %xmm3, %xmm6 addl %eax, %ebx addps %xmm0, %xmm2 movaps %xmm6, %xmm3 movaps %xmm2, (%ebx) movaps %xmm4, %xmm6 subps %xmm7, %xmm6 movl _Z, %ecx movaps %xmm6, %xmm1 mulps %xmm3, %xmm3 mulps %xmm6, %xmm1 addps %xmm1, %xmm3 addl %ecx, %edx movaps %xmm3, 16(%edx) cmpl _D, %esi jl L35 L34: addl $8, %esp popl %ebx popl %esi popl %ebp ret .def ___main; .scl 2; .type 32; .endef .section .rdata,"dr" .align 4 LC1: .ascii "\12Usage: %s <opc=0|1> <D> <debug=0|1>\12\12\0" LC3: .ascii "%f\12\0" .align 4 LC2: .ascii "\12Dimension %d must be multiple of 8: \12\12\0" .text .p2align 4,,15 .globl _main .def _main; .scl 
2; .type 32; .endef _main: pushl %ebp movl $16, %eax movl %esp, %ebp pushl %edi pushl %esi pushl %ebx subl $28, %esp movl 12(%ebp), %ebx andl $-16, %esp call __alloca call ___main cmpl $4, 8(%ebp) je L38 call ___getreent movl (%ebx), %esi movl $LC1, %ecx movl %ecx, 4(%esp) movl %esi, 8(%esp) movl 12(%eax), %edx movl %edx, (%esp) call _fprintf movl $1, %eax leal -12(%ebp), %esp L99: popl %ebx popl %esi popl %edi popl %ebp ret .p2align 4,,7 L38: movl 4(%ebx), %edx movl %edx, (%esp) call _atoi movl %eax, -16(%ebp) movl 8(%ebx), %eax movl %eax, (%esp) call _atoi movl %eax, _D movl 12(%ebx), %edi movl %edi, (%esp) call _atoi movl %eax, -20(%ebp) movl _D, %ecx testb $7, %cl jne L89 movl -16(%ebp), %edi testl %edi, %edi je L90 movl %ecx, %eax xorl %edx, %edx sall $2, %eax jne L91 L43: movl %edx, _X movl %ecx, %eax xorl %edx, %edx sall $2, %eax jne L92 L48: movl %edx, _Y movl %ecx, %eax xorl %edx, %edx sall $2, %eax jne L93 L53: movl %edx, _Z L41: xorl %edx, %edx cmpl %ecx, %edx jge L73 movl _X, %esi movl _Y, %ebx .p2align 4,,15 L60: movl %ecx, %eax cvtsi2ss %edx, %xmm1 subl %edx, %eax cvtsi2ss %eax, %xmm0 movss %xmm1, (%esi,%edx,4) movss %xmm0, (%ebx,%edx,4) incl %edx cmpl %ecx, %edx jl L60 L73: movl -16(%ebp), %ebx movl $_dist, %edi movl $_dist_sse, %edx testl %ebx, %ebx cmovne %edx, %edi xorl %esi, %esi cmpl %ecx, %esi jge L75 .p2align 4,,15 L101: movl %esi, (%esp) call *%edi movl -20(%ebp), %ecx testl %ecx, %ecx je L77 movl _D, %eax xorl %ebx, %ebx cmpl %eax, %ebx jge L65 .p2align 4,,15 L100: movl _Z, %eax flds (%eax,%ebx,4) incl %ebx movl $LC3, (%esp) fstpl 4(%esp) call _printf movl _D, %eax cmpl %eax, %ebx jl L100 L65: incl %esi cmpl %eax, %esi L102: jl L101 L75: leal -12(%ebp), %esp xorl %eax, %eax popl %ebx popl %esi popl %edi popl %ebp ret L90: leal 0(,%ecx,4), %ebx movl %ebx, (%esp) call _malloc movl %eax, _X movl _D, %edx sall $2, %edx movl %edx, (%esp) call _malloc movl %eax, _Y movl _D, %eax sall $2, %eax movl %eax, (%esp) call _malloc movl %eax, _Z movl _D, 
%ecx jmp L41 .p2align 4,,7 L77: movl _D, %eax incl %esi cmpl %eax, %esi jmp L102 L93: addl $16, %eax movl %eax, (%esp) call _malloc testl %eax, %eax movl %eax, %edx je L86 leal 16(%eax), %edi andl $-16, %edi movl %edi, %edx movl %eax, -4(%edi) L86: movl _D, %ecx jmp L53 L92: addl $16, %eax movl %eax, (%esp) call _malloc testl %eax, %eax movl %eax, %edx je L85 leal 16(%eax), %esi andl $-16, %esi movl %esi, %edx movl %eax, -4(%esi) L85: movl _D, %ecx jmp L48 L91: addl $16, %eax movl %eax, (%esp) call _malloc testl %eax, %eax movl %eax, %edx je L84 leal 16(%eax), %ecx andl $-16, %ecx movl %ecx, %edx movl %eax, -4(%ecx) L84: movl _D, %ecx jmp L43 L89: call ___getreent movl _D, %esi movl $LC2, %ecx movl %ecx, 4(%esp) movl %esi, 8(%esp) movl 12(%eax), %ebx movl %ebx, (%esp) call _fprintf movl $2, %eax leal -12(%ebp), %esp jmp L99 .comm _D, 16 # 4 .comm _X, 16 # 4 .comm _Y, 16 # 4 .comm _Z, 16 # 4 .def _printf; .scl 3; .type 32; .endef .def _atoi; .scl 3; .type 32; .endef .def ___getreent; .scl 3; .type 32; .endef .def _fprintf; .scl 3; .type 32; .endef .def _malloc; .scl 3; .type 32; .endef
|
|
| |
|
|