[Vm-dev] Is bytecodePrimMultiply correct?

Stefan Marr squeak at stefan-marr.de
Tue Oct 4 19:36:47 UTC 2011


Hi Eliot:

On 04 Oct 2011, at 19:11, Eliot Miranda wrote:

>     so the VMMaker source is probably easier to read.
Not for me ;)


>  First, thanks for the optimization; clearly
> 
> 	(arg = 0 or: [(result // arg) = rcvr and: [self isIntegerValue: result]])
> 
> 


> I'd suspect the definition of isIntegerValue:, which is:
No, not according to my testing with the RoarVM code base. isIntegerValue works fine.
Extra casts do not make a difference, but that might also be because the return type of our isIntegerValue is already our equivalent of sqInt (oop_int_t).
So, I am pretty sure it is not that.

So what remains is: `(result // arg) = rcvr`/`((r / ai)  ==  ri)` or in context:

    oop_int_t ri = rcvr.integerValue();
    oop_int_t ai = arg.integerValue();
    oop_int_t r  = ri * ai;
    if (ai == 0  ||  (((r / ai)  ==  ri)    &&    Oop::isIntegerValue(r))) {

Lets look at some beautiful assembly :-/

This is RoarVM's bytecodePrimMultiply, C++ code for reference below. This is the output with clang and -O2.

I think the critical part is:

0x00032f04  <+0068>  sar    %ebx               // oop_int_t ri = rcvr.integerValue();
0x00032f06  <+0070>  sar    %edx               // oop_int_t ai = arg.integerValue();
0x00032f08  <+0072>  imul   %edx,%ebx          // oop_int_t r  = ri * ai
// The following should be: if (ai == 0  ||  (((r / ai)  ==  ri)    &&    Oop::isIntegerValue(r))) { 
0x00032f0b  <+0075>  lea    (%ebx,%ebx,1),%eax 
0x00032f0e  <+0078>  test   %edx,%edx
0x00032f10  <+0080>  je     0x32f1a <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+90>
0x00032f12  <+0082>  xor    %eax,%ebx
0x00032f14  <+0084>  js     0x3301f <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+351>

I don't fully understand what that means, my assembly is rather rusty...
The full code is at the end of the mail.

But first for comparison my quick fix:

    oop_int_t ri = rcvr.integerValue();
    oop_int_t ai = arg.integerValue();
    long long result_with_overflow = (long long)ri * ai;
    if (ai == 0  || Oop::isIntegerValue(result_with_overflow)) {
      internalPopThenPush(2, Oop::from_int(result_with_overflow));
      fetchNextBytecode();
      return;
    }

This results in the following code, relevant for this snippet:

0x00032ee4  <+0068>  sar    %ebx               // oop_int_t ri = rcvr.integerValue();
0x00032ee6  <+0070>  sar    %edx               // oop_int_t ai = arg.integerValue();
0x00032ee8  <+0072>  mov    %ebx,%eax
0x00032eea  <+0074>  imul   %edx
0x00032eec  <+0076>  test   %ebx,%ebx
0x00032eee  <+0078>  jne    0x32ef5 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+85>
0x00032ef0  <+0080>  lea    (%eax,%eax,1),%ecx
0x00032ef3  <+0083>  jmp    0x32f15 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+117>
0x00032ef5  <+0085>  mov    %eax,%ecx
0x00032ef7  <+0087>  xor    %eax,%ecx
0x00032ef9  <+0089>  mov    %eax,%edi
0x00032efb  <+0091>  sar    $0x1f,%edi
0x00032efe  <+0094>  xor    %edx,%edi
0x00032f00  <+0096>  or     %ecx,%edi
0x00032f02  <+0098>  jne    0x3301a <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+378>
0x00032f08  <+0104>  lea    (%eax,%eax,1),%ecx
0x00032f0b  <+0107>  mov    %ecx,%edx
0x00032f0d  <+0109>  xor    %eax,%edx
0x00032f0f  <+0111>  js     0x3301a <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+378>

Well, I do not really get it.
But it sure looks like my quick fix could be a bit more expensive...



Best regards
Stefan


Here the assembly generated for the original code:

// Starting with the prolog
0x00032ec0  <+0000>  push   %ebp
0x00032ec1  <+0001>  mov    %esp,%ebp
0x00032ec3  <+0003>  push   %ebx
0x00032ec4  <+0004>  push   %edi
0x00032ec5  <+0005>  push   %esi
0x00032ec6  <+0006>  sub    $0x3c,%esp
0x00032ec9  <+0009>  call   0x32ece <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+14>
0x00032ece  <+0014>  pop    %edi
0x00032ecf  <+0015>  mov    0x8(%ebp),%esi
// looks like some test for the Int_Tag
0x00032ed2  <+0018>  testb  $0x1,0x7196(%esi)
0x00032ed9  <+0025>  je     0x3304f <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+399>
0x00032edf  <+0031>  mov    0x7188(%esi),%eax
0x00032ee5  <+0037>  mov    -0x4(%eax),%ebx
0x00032ee8  <+0040>  mov    (%eax),%edx
0x00032eea  <+0042>  mov    %ebx,%ecx
// looks like the second test for the Int_Tag
0x00032eec  <+0044>  and    $0x1,%ecx
0x00032eef  <+0047>  test   %ecx,%edx
0x00032ef1  <+0049>  je     0x32f3c <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+124>
0x00032ef3  <+0051>  test   %ecx,%ecx
0x00032ef5  <+0053>  je     0x33083 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+451>
0x00032efb  <+0059>  test   $0x1,%dl
0x00032efe  <+0062>  je     0x33083 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+451>

0x00032f04  <+0068>  sar    %ebx               // oop_int_t ri = rcvr.integerValue();
0x00032f06  <+0070>  sar    %edx               // oop_int_t ai = arg.integerValue();
0x00032f08  <+0072>  imul   %edx,%ebx          // oop_int_t r  = ri * ai

// The following should be: if (ai == 0  ||  (((r / ai)  ==  ri)    &&    Oop::isIntegerValue(r))) { 
0x00032f0b  <+0075>  lea    (%ebx,%ebx,1),%eax
0x00032f0e  <+0078>  test   %edx,%edx
0x00032f10  <+0080>  je     0x32f1a <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+90>
0x00032f12  <+0082>  xor    %eax,%ebx
0x00032f14  <+0084>  js     0x3301f <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+351>

// The following is the successful case, and the remaining body of bytecodePrimMultiply.
0x00032f1a  <+0090>  or     $0x1,%eax
0x00032f1d  <+0093>  mov    %eax,-0x10(%ebp)
0x00032f20  <+0096>  mov    -0x10(%ebp),%eax
0x00032f23  <+0099>  mov    %eax,0x8(%esp)
0x00032f27  <+0103>  mov    %esi,(%esp)
0x00032f2a  <+0106>  movl   $0x2,0x4(%esp)
0x00032f32  <+0114>  call   0x36d70 <_ZN18Squeak_Interpreter19internalPopThenPushEi3Oop>
0x00032f37  <+0119>  jmp    0x33015 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+341>
0x00032f3c  <+0124>  movb   $0x1,0x4180(%esi)
0x00032f43  <+0131>  mov    0x7184(%esi),%ecx
0x00032f49  <+0137>  mov    %ecx,0x4168(%esi)
0x00032f4f  <+0143>  mov    %eax,0x416c(%esi)
0x00032f55  <+0149>  mov    0x718c(%esi),%eax
0x00032f5b  <+0155>  mov    (%eax),%ecx
0x00032f5d  <+0157>  and    $0x3,%ecx
0x00032f60  <+0160>  mov    %edx,-0x2c(%ebp)
0x00032f63  <+0163>  mov    0x6e3e6(%edi),%edx
0x00032f69  <+0169>  movsbl (%edx,%ecx,1),%ecx
0x00032f6d  <+0173>  shl    $0x2,%ecx
0x00032f70  <+0176>  neg    %ecx
0x00032f72  <+0178>  mov    (%eax,%ecx,1),%ecx
0x00032f75  <+0181>  and    $0xfffffffc,%ecx
0x00032f78  <+0184>  mov    %ecx,0x3c(%esi)
0x00032f7b  <+0187>  mov    %eax,0x4164(%esi)
0x00032f81  <+0193>  movb   $0x1,0x7195(%esi)
0x00032f88  <+0200>  lea    -0x18(%ebp),%eax
0x00032f8b  <+0203>  mov    %eax,(%esp)
0x00032f8e  <+0206>  movl   $0x1,0x4(%esp)
0x00032f96  <+0214>  call   0x75070 <_ZN17Safepoint_AbilityC1Eb>
0x00032f9b  <+0219>  mov    %ebx,-0x20(%ebp)
0x00032f9e  <+0222>  mov    -0x2c(%ebp),%eax
0x00032fa1  <+0225>  mov    %eax,-0x28(%ebp)
0x00032fa4  <+0228>  mov    -0x28(%ebp),%eax
0x00032fa7  <+0231>  mov    %eax,0x8(%esp)
0x00032fab  <+0235>  mov    -0x20(%ebp),%eax
0x00032fae  <+0238>  mov    %eax,0x4(%esp)
0x00032fb2  <+0242>  mov    %esi,(%esp)
0x00032fb5  <+0245>  call   0x3ccf0 <_ZN18Squeak_Interpreter22primitiveFloatMultiplyE3OopS0_>
0x00032fba  <+0250>  lea    -0x18(%ebp),%eax
0x00032fbd  <+0253>  mov    %eax,(%esp)
0x00032fc0  <+0256>  call   0x75120 <_ZN17Safepoint_AbilityD1Ev>
0x00032fc5  <+0261>  testb  $0x1,0x7195(%esi)
0x00032fcc  <+0268>  je     0x330b7 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+503>
0x00032fd2  <+0274>  mov    0x72e0(%esi),%eax
0x00032fd8  <+0280>  testb  $0x1,(%eax)
0x00032fdb  <+0283>  jne    0x330eb <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+555>
0x00032fe1  <+0289>  mov    0x4168(%esi),%eax
0x00032fe7  <+0295>  mov    %eax,0x7184(%esi)
0x00032fed  <+0301>  mov    0x416c(%esi),%eax
0x00032ff3  <+0307>  mov    %eax,0x7188(%esi)
0x00032ff9  <+0313>  mov    0x4164(%esi),%eax
0x00032fff  <+0319>  mov    %eax,0x718c(%esi)
0x00033005  <+0325>  movb   $0x1,0x7196(%esi)
0x0003300c  <+0332>  testb  $0x1,0x4180(%esi)
0x00033013  <+0339>  je     0x3301f <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+351>
0x00033015  <+0341>  mov    %esi,(%esp)
0x00033018  <+0344>  call   0x35f30 <_ZN18Squeak_Interpreter17fetchNextBytecodeEv>
0x0003301d  <+0349>  jmp    0x33047 <_ZN18Squeak_Interpreter20bytecodePrimMultiplyEv+391>
0x0003301f  <+0351>  mov    0x8(%esi),%eax
0x00033022  <+0354>  mov    (%eax),%eax
0x00033024  <+0356>  and    $0xfffffffc,%eax
0x00033027  <+0359>  mov    0x60(%eax),%eax
0x0003302a  <+0362>  mov    (%eax),%eax
0x0003302c  <+0364>  and    $0xfffffffc,%eax
0x0003302f  <+0367>  mov    0x44(%eax),%eax
0x00033032  <+0370>  mov    %eax,0x44(%esi)
0x00033035  <+0373>  movl   $0x1,0x71a0(%esi)
0x0003303f  <+0383>  mov    %esi,(%esp)
0x00033042  <+0386>  call   0x36460 <_ZN18Squeak_Interpreter10normalSendEv>
0x00033047  <+0391>  add    $0x3c,%esp
0x0003304a  <+0394>  pop    %esi
0x0003304b  <+0395>  pop    %edi
0x0003304c  <+0396>  pop    %ebx
0x0003304d  <+0397>  pop    %ebp
0x0003304e  <+0398>  ret    
0x0003304f  <+0399>  lea    0x5e81e(%edi),%eax
0x00033055  <+0405>  mov    %eax,0x10(%esp)
0x00033059  <+0409>  lea    0x62a4e(%edi),%eax
0x0003305f  <+0415>  mov    %eax,0xc(%esp)
0x00033063  <+0419>  lea    0x615ae(%edi),%eax
0x00033069  <+0425>  mov    %eax,0x4(%esp)
0x0003306d  <+0429>  lea    0x62a3e(%edi),%eax
0x00033073  <+0435>  mov    %eax,(%esp)
0x00033076  <+0438>  movl   $0xd1,0x8(%esp)
0x0003307e  <+0446>  call   0x6db20 <_Z14assert_failurePKcS0_iS0_S0_>
0x00033083  <+0451>  lea    0x5e81e(%edi),%eax
0x00033089  <+0457>  mov    %eax,0x10(%esp)
0x0003308d  <+0461>  lea    0x62b61(%edi),%eax
0x00033093  <+0467>  mov    %eax,0xc(%esp)
0x00033097  <+0471>  lea    0x62b23(%edi),%eax
0x0003309d  <+0477>  mov    %eax,0x4(%esp)
0x000330a1  <+0481>  lea    0x62b16(%edi),%eax
0x000330a7  <+0487>  mov    %eax,(%esp)
0x000330aa  <+0490>  movl   $0x3f,0x8(%esp)
0x000330b2  <+0498>  call   0x6db20 <_Z14assert_failurePKcS0_iS0_S0_>
0x000330b7  <+0503>  lea    0x5e81e(%edi),%eax
0x000330bd  <+0509>  mov    %eax,0x10(%esp)
0x000330c1  <+0513>  lea    0x6298b(%edi),%eax





C++ code:

void Squeak_Interpreter::bytecodePrimMultiply() {
  Oop rcvr = internalStackValue(1);
  Oop arg  = internalStackValue(0);
  
  if (areIntegers(rcvr, arg)) {    
    oop_int_t ri = rcvr.integerValue();
    oop_int_t ai = arg.integerValue();
    oop_int_t r  = ri * ai;
    if (ai == 0  ||  (((r / ai)  ==  ri)    &&    Oop::isIntegerValue(r))) {
      internalPopThenPush(2, Oop::from_int(r));
      fetchNextBytecode();
      return;
    }
  }
  else {
    successFlag = true;
    externalizeIPandSP();
    {
      Safepoint_Ability sa(true);
      primitiveFloatMultiply(rcvr, arg);
    }
    internalizeIPandSP();
    if (successFlag) {
      fetchNextBytecode();
      return;
    }
  }
  roots.messageSelector = specialSelector(8);
  set_argumentCount(1);
  normalSend();
}

-- 
Stefan Marr
Software Languages Lab
Vrije Universiteit Brussel
Pleinlaan 2 / B-1050 Brussels / Belgium
http://soft.vub.ac.be/~smarr
Phone: +32 2 629 2974
Fax:   +32 2 629 3525



More information about the Vm-dev mailing list