Skip to content

Commit

Permalink
Provide support for ARMv4, lacking bx and clz. Unroll the
Browse files Browse the repository at this point in the history
test-and-subtract loop and compute the initial block as address,
shaving off between 5% and 10% on Cortex A9 and 30%+ a Raspberry Pi.
Code written by Matt Thomas and Joerg Sonnenberger.

Differential Revision: http://llvm-reviews.chandlerc.com/D2595

llvm-svn: 200001
  • Loading branch information
jsonn committed Jan 24, 2014
1 parent 7409e84 commit 59611a8
Show file tree
Hide file tree
Showing 3 changed files with 412 additions and 198 deletions.
214 changes: 143 additions & 71 deletions compiler-rt/lib/arm/udivmodsi4.S
Original file line number Diff line number Diff line change
Expand Up @@ -8,89 +8,161 @@
*===----------------------------------------------------------------------===//
*
* This file implements the __udivmodsi4 (32-bit unsigned integer divide and
* modulus) function for the ARM architecture. A naive digit-by-digit
* computation is employed for simplicity.
* modulus) function for the ARM 32-bit architecture.
*
*===----------------------------------------------------------------------===*/

#include "../assembly.h"

#define ESTABLISH_FRAME \
push {r4, r7, lr} ;\
add r7, sp, #4
#define CLEAR_FRAME_AND_RETURN \
pop {r4, r7, pc}

#define a r0
#define b r1
#define i r3
#define r r4
#define q ip
#define one lr

.syntax unified
.align 3
.syntax unified

#ifdef ARM_HAS_BX
#define JMP(r) bx r
#else
#define JMP(r) mov pc, r
#endif

.text
.arm
.p2align 2
DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
#if __ARM_ARCH_EXT_IDIV__
tst r1, r1
beq LOCAL_LABEL(divzero)
beq LOCAL_LABEL(divby0)
mov r3, r0
udiv r0, r3, r1
mls r1, r0, r1, r3
str r1, [r2]
bx lr
LOCAL_LABEL(divzero):
mov r0, #0
bx lr
#else
// We use a simple digit by digit algorithm; before we get into the actual
// divide loop, we must calculate the left-shift amount necessary to align
// the MSB of the divisor with that of the dividend (If this shift is
// negative, then the result is zero, and we early out). We also conjure a
// bit mask of 1 to use in constructing the quotient, and initialize the
// quotient to zero.
ESTABLISH_FRAME
clz r4, a
tst b, b // detect divide-by-zero
clz r3, b
mov q, #0
beq LOCAL_LABEL(return) // return 0 if b is zero.
mov one, #1
subs i, r3, r4
blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)

LOCAL_LABEL(mainLoop):
// This loop basically implements the following:
//
// do {
// if (a >= b << i) {
// a -= b << i;
// q |= 1 << i;
// if (a == 0) break;
// }
// } while (--i)
//
// Note that this does not perform the final iteration (i == 0); by doing it
// this way, we can merge the two branches which is a substantial win for
// such a tight loop on current ARM architectures.
subs r, a, b, lsl i
itt hs
orrhs q, q,one, lsl i
movhs a, r
it ne
subsne i, i, #1
bhi LOCAL_LABEL(mainLoop)

// Do the final test subtraction and update of quotient (i == 0), as it is
// not performed in the main loop.
subs r, a, b
itt hs
orrhs q, #1
movhs a, r

LOCAL_LABEL(return):
// Store the remainder, and move the quotient to r0, then return.
str a, [r2]
mov r0, q
CLEAR_FRAME_AND_RETURN
cmp r1, #1
bcc LOCAL_LABEL(divby0)
beq LOCAL_LABEL(divby1)
cmp r0, r1
bcc LOCAL_LABEL(quotient0)
/*
* Implement division using binary long division algorithm.
*
* r0 is the numerator, r1 the denominator.
*
* The code before JMP computes the correct shift I, so that
* r0 and (r1 << I) have the highest bit set in the same position.
* At the time of JMP, ip := .Ldiv0block - 12 * I.
* This depends on the fixed instruction size of block.
*
* block(shift) implements the test-and-update-quotient core.
* It assumes (r0 << shift) can be computed without overflow and
* that (r0 << shift) < 2 * r1. The quotient is stored in r3.
*/

# ifdef __ARM_FEATURE_CLZ
clz ip, r0
clz r3, r1
/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
sub r3, r3, ip
adr ip, LOCAL_LABEL(div0block)
sub ip, ip, r3, lsl #2
sub ip, ip, r3, lsl #3
mov r3, #0
bx ip
# else
str r4, [sp, #-8]!

mov r4, r0
adr ip, LOCAL_LABEL(div0block)

lsr r3, r4, #16
cmp r3, r1
movhs r4, r3
subhs ip, ip, #(16 * 12)

lsr r3, r4, #8
cmp r3, r1
movhs r4, r3
subhs ip, ip, #(8 * 12)

lsr r3, r4, #4
cmp r3, r1
movhs r4, r3
subhs ip, #(4 * 12)

lsr r3, r4, #2
cmp r3, r1
movhs r4, r3
subhs ip, ip, #(2 * 12)

/* Last block, no need to update r3 or r4. */
cmp r1, r4, lsr #1
subls ip, ip, #(1 * 12)

ldr r4, [sp], #8 /* restore r4, we are done with it. */
mov r3, #0

JMP(ip)
# endif

#define IMM #

#define block(shift) \
cmp r0, r1, lsl IMM shift; \
addhs r3, r3, IMM (1 << shift); \
subhs r0, r0, r1, lsl IMM shift

block(31)
block(30)
block(29)
block(28)
block(27)
block(26)
block(25)
block(24)
block(23)
block(22)
block(21)
block(20)
block(19)
block(18)
block(17)
block(16)
block(15)
block(14)
block(13)
block(12)
block(11)
block(10)
block(9)
block(8)
block(7)
block(6)
block(5)
block(4)
block(3)
block(2)
block(1)
LOCAL_LABEL(div0block):
block(0)

str r0, [r2]
mov r0, r3
JMP(lr)

LOCAL_LABEL(quotient0):
str r0, [r2]
mov r0, #0
JMP(lr)

LOCAL_LABEL(divby1):
mov r3, #0
str r3, [r2]
JMP(lr)
#endif /* __ARM_ARCH_EXT_IDIV__ */

LOCAL_LABEL(divby0):
mov r0, #0
#ifdef __ARM_EABI__
b __aeabi_idiv0
#else
JMP(lr)
#endif

END_COMPILERRT_FUNCTION(__udivmodsi4)
Loading

0 comments on commit 59611a8

Please sign in to comment.