0xV3n0m

Announcement

Welcome To My Personal Blog

1.24.1 Multiplication

Multiplication using addition

A simple example:


unsigned int f(unsigned int a)
{
    return a*8;
};

Multiplication by 8 was replaced with 3 addition instructions, which do the same thing.

It is obvious that the MSVC optimizer decided that this code could be faster.


_TEXT SEGMENT
_a$ = 8                                  ; size = 4
_f PROC
    mov     eax, DWORD PTR _a$[esp-4]    ; load a into EAX
    add     eax, eax                     ; EAX = EAX + EAX = a*2
    add     eax, eax                     ; EAX = EAX + EAX = a*4
    add     eax, eax                     ; EAX = EAX + EAX = a*8
    ret     0
_f ENDP
_TEXT ENDS
END

Multiplication using shifting

Multiplication and division instructions by numbers that are powers of 2 are often replaced with shift instructions.


unsigned int f(unsigned int a)
{
    return a*4;
};


_a$ = 8                                  ; size = 4
_f PROC
    push    ebp
    mov     ebp, esp
    mov     eax, DWORD PTR _a$[ebp]      ; load a
    shl     eax, 2                       ; shift left by 2 bits = multiply by 4
    pop     ebp
    ret     0
_f ENDP

The instruction named SHL is an abbreviation for SHift Left.

Multiplication by 4 is just shifting the number left by 2 bits and adding two zero bits on the right (as the least significant bits).

This is like multiplying 3 by 100 — we simply add two zeros on the right.

This is how the shift left instruction works:

The bits added on the right are always zeros.

To make it clearer in case you got confused:

Let us say:


a = 3

3 in binary:


3 = 00000011

If we do:


shl eax, 2

Meaning shift left twice:


00000011 << 2 = 00001100

The result 00001100 in decimal = 12

Which is indeed: 3 * 4 = 12

Multiplication by 4 on ARM


f PROC
    LSL r0, r0, #2                       ; shift left by 2 bits = multiply by 4
    BX  lr
ENDP

Multiplication by 4 on MIPS


jr      $ra
    sll     $v0, $a0, 2                  ; shift left by 2 = multiply by 4 (delay slot)

SLL means "Shift Left Logical".

Multiplication using shifting, subtracting, and adding

It is still possible to get rid of multiplication when multiplying by numbers like 7 or 17, also using shifting. The math used here is relatively simple.

32-bit


#include <stdint.h>
int f1(int a)
{
return a*7;
};
int f2(int a)
{
return a*28;
};
int f3(int a)
{
return a*17;
};

x86


; a*7
_a$ = 8
_f1 PROC
    mov     ecx, DWORD PTR _a$[esp-4]    ; ECX = a
    lea     eax, DWORD PTR [ecx*8]       ; EAX = ECX*8 = a*8
    sub     eax, ecx                     ; EAX = EAX - ECX = a*8 - a = a*7
    ret     0
_f1 ENDP
; a17
_a $= 8 _f3 PROC mov eax, DWORD PTR _a$ [esp-4]    ; EAX = a
shl     eax, 4                       ; EAX = EAX << 4 = a16
add     eax, DWORD PTR _a$[esp-4]    ; EAX = EAX + a = a16 + a = a17
ret     0
_f3 ENDP

ARM

Keil in ARM mode exploits the second operand shift modifiers:


; a*7
||f1|| PROC
    RSB     r0, r0, r0, LSL #3           ; R0 = (R0 << 3) - R0 = a*8 - a = a*7
    BX      lr
ENDP
; a28
||f2|| PROC
RSB     r0, r0, r0, LSL #3           ; R0 = (R0 << 3) - R0 = a8 - a = a7
LSL     r0, r0, #2                   ; R0 = R0 << 2 = a7 * 4 = a*28
BX      lr
ENDP
; a17
||f3|| PROC
ADD     r0, r0, r0, LSL #4           ; R0 = R0 + (R0 << 4) = a + a16 = a*17
BX      lr
ENDP

But there are no such modifiers in Thumb mode. It also cannot optimize f2():


; a*7
||f1|| PROC
    LSLS    r1, r0, #3                   ; R1 = R0 << 3 = a*8
    SUBS    r0, r1, r0                    ; R0 = R1 - R0 = a*8 - a = a*7
    BX      lr
ENDP
; a*28
||f2|| PROC
MOVS    r1, #0x1c                    ; R1 = 28
MULS    r0, r1, r0                   ; R0 = 28 * a
BX      lr
ENDP
; a17
||f3|| PROC
LSLS    r1, r0, #4                   ; R1 = R0 << 4 = a16
ADDS    r0, r0, r1                   ; R0 = a + a16 = a17
BX      lr
ENDP

MIPS


_f1:
    sll     $v0, $a0, 3                  ; $v0 = $a0 << 3 = a*8
    jr      $ra
    subu    $v0, $a0                     ; $v0 = $v0 - $a0 = a*8 - a = a*7 (delay slot)
_f2:
sll      $v0,$ a0, 5                  ;  $v0 =$ a0 << 5 = a32
sll      $a0, 2 ;$ a0 =  $a0 &lt;&lt; 2 = a*4 jr$ ra
subu     $v0,$ a0                     ; $v0 = a32 - a4 = a28 (delay slot)
_f3:
sll      $v0,$ a0, 4                  ;  $v0 =$ a0 << 4 = a16
jr       $ra addu$ v0,  $a0 ;$ v0 = a16 + a = a*17 (delay slot)

64-bit


#include <stdint.h>
int64_t f1(int64_t a)
{
return a*7;
};
int64_t f2(int64_t a)
{
return a*28;
};
int64_t f3(int64_t a)
{
return a*17;
};

x64


; a*7
f1:
    lea     rax, [0+rdi*8]               ; RAX = RDI*8 = a*8
    sub     rax, rdi                     ; RAX = RAX - RDI = a*8 - a = a*7
    ret
; a28
f2:
lea     rax, [0+rdi4]               ; RAX = RDI4 = a4
sal     rdi, 5                       ; RDI = RDI << 5 = a32
sub     rdi, rax                     ; RDI = a32 - a4 = a28
mov     rax, rdi
ret
; a17
f3:
mov     rax, rdi
sal     rax, 4                       ; RAX = RAX << 4 = a16
add     rax, rdi                     ; RAX = a16 + a = a17
ret

ARM64

GCC 4.9 for ARM64 is also concise, thanks to shift modifiers:


; a*7
f1:
    lsl     x1, x0, 3                    ; X1 = X0 << 3 = a*8
    sub     x0, x1, x0                   ; X0 = X1 - X0 = a*8 - a = a*7
    ret
; a28
f2:
lsl     x1, x0, 5                    ; X1 = X0 << 5 = a32
sub     x0, x1, x0, lsl 2             ; X0 = X1 - (X0 << 2) = a32 - a4 = a*28
ret
; a17
f3:
add     x0, x0, x0, lsl 4            ; X0 = X0 + (X0 << 4) = a + a16 = a*17
ret

Booth’s multiplication algorithm

There was a time when computers were big and expensive to the point that some did not have hardware support for multiplication inside the CPU, like the Data General Nova.

When multiplication was needed, it could be done at the software level, for example using Booth’s multiplication algorithm.

This is a multiplication algorithm that uses only addition and shifts. What modern optimizing compilers do is not the same thing, but the goal (multiplication) and resources (faster operations) are the same.

1.24.2 Division

Division using shifts

An example of division by 4:


unsigned int f(unsigned int a)
{
    return a/4;
};

We get (MSVC 2010):


_a$ = 8
_f PROC
    mov     eax, DWORD PTR _a$[esp-4]
    shr     eax, 2                       ; shift right by 2 bits = divide by 4
    ret     0
_f ENDP

The SHR (SHift Right) instruction in this example shifts the number right by 2 bits.

The two bits that were freed on the left (most significant bits) are filled with zeros.

The two bits on the right (least significant bits) are discarded.

In fact, these two discarded bits are the division remainder.

The SHR instruction works exactly like SHL, but in the opposite direction.

It is easy to understand if you imagine the number 23 in the decimal system.

23 can be easily divided by 10 by simply removing the last digit (3 — this is the remainder).

What remains after the operation is 2 as the quotient.

The remainder is discarded, but that's fine because we are working on integer values anyway, these are not real numbers!

Division by 4 on ARM


f PROC
    LSR     r0, r0, #2                   ; logical shift right by 2 bits = divide by 4
    BX      lr
ENDP

Division by 4 on MIPS


jr      $ra
    srl     $v0, $a0, 2                  ; shift right logical by 2 = divide by 4 (delay slot)

SRL means "Shift Right Logical".

Share

If this article helped you, please share it with others!

CH1.22 - Replacing Arithmetic Instructions to Other Ones

https://v3nn00m.github.io/posts/re4b/chapter1_24/

Author

0xV3n0m

Published at

2026-02-05

License

0xV3n0m's Personal Blog License

Some information may be outdated

CH1.23 - Floating-Point Unit (Part 1)

CH1.21 - More About Strings

0xV3n0m

1.24.1 Multiplication

Multiplication using addition

Multiplication using shifting

Multiplication by 4 on ARM

Multiplication by 4 on MIPS

Multiplication using shifting, subtracting, and adding

32-bit

x86

ARM

MIPS

64-bit

x64

ARM64

Booth’s multiplication algorithm

1.24.2 Division

Division using shifts

Division by 4 on ARM

Division by 4 on MIPS

Table of Contents