This is an enhancement request. While u16, u32 and u64 numbers get divided by a small constant divisor using a fast algorithm, the same isn't true for u128 numbers:
fn digits_sum0(mut n: u16) -> u16 {
let mut total = 0;
while n != 0 {
total += n % 10;
n /= 10;
}
total
}
fn digits_sum1(mut n: u32) -> u32 {
let mut total = 0;
while n != 0 {
total += n % 10;
n /= 10;
}
total
}
fn digits_sum2(mut n: u64) -> u64 {
let mut total = 0;
while n != 0 {
total += n % 10;
n /= 10;
}
total
}
fn digits_sum3(mut n: u128) -> u128 {
let mut total = 0;
while n != 0 {
total += n % 10;
n /= 10;
}
total
}
digits_sum0:
xor eax, eax
test di, di
je .LBB0_2
.LBB0_1:
movzx ecx, di
imul edx, ecx, 52429
shr edx, 19
lea esi, [rdx + rdx]
lea esi, [rsi + 4*rsi]
sub edi, esi
add eax, edi
mov edi, edx
cmp ecx, 10
jae .LBB0_1
.LBB0_2:
ret
digits_sum1:
xor eax, eax
test edi, edi
je .LBB0_3
mov r8d, 3435973837
.LBB0_2:
mov edx, edi
imul rdx, r8
shr rdx, 35
lea esi, [rdx + rdx]
lea esi, [rsi + 4*rsi]
mov ecx, edi
sub ecx, esi
add eax, ecx
cmp edi, 10
mov edi, edx
jae .LBB0_2
.LBB0_3:
ret
digits_sum2:
xor ecx, ecx
test rdi, rdi
je .LBB1_3
movabs r8, -3689348814741910323
.LBB1_2:
mov rax, rdi
mul r8
shr rdx, 3
lea rax, [rdx + rdx]
lea rax, [rax + 4*rax]
mov rsi, rdi
sub rsi, rax
add rcx, rsi
cmp rdi, 10
mov rdi, rdx
jae .LBB1_2
.LBB1_3:
mov rax, rcx
ret
digits_sum3:
push r15
push r14
push r13
push r12
push rbx
mov rax, rdi
or rax, rsi
je .LBB2_1
mov rbx, rsi
mov r15, rdi
xor r14d, r14d
mov r13d, 10
xor r12d, r12d
.LBB2_4:
mov edx, 10
xor ecx, ecx
mov rdi, r15
mov rsi, rbx
call __udivti3@PLT
mov rcx, rax
mov rsi, rdx
mul r13
lea rdi, [rsi + 4*rsi]
lea rdx, [rdx + 2*rdi]
mov rdi, r15
sub rdi, rax
mov rax, rbx
sbb rax, rdx
add r14, rdi
adc r12, rax
cmp r15, 10
sbb rbx, 0
mov r15, rcx
mov rbx, rsi
jae .LBB2_4
jmp .LBB2_2
.LBB2_1:
xor r14d, r14d
xor r12d, r12d
.LBB2_2:
mov rax, r14
mov rdx, r12
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
This is an enhancement request. While u16, u32 and u64 numbers get divided by a small constant divisor using a fast algorithm, the same isn't true for u128 numbers:
Generate asm (with -O):
The faster algorithm is short enough and it could be added to rustc:
http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
http://ridiculousfish.com/blog/posts/labor-of-division-episode-ii.html
http://libdivide.com/
http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html