Hi to everyone who wants a fast multiply right now,
Even without adding a hardware multiplier, it is possible to gain some performance on the multiply. I disassembled the multiply routine (__mulsi3):
# __mulsi3 as disassembled from the library: a shift-and-add multiply loop.
# r4 and r5 are read as the two operands; the product is left in r2.
__mulsi3:
addi sp,sp,-8 <- useless, can be removed
stw fp,4(sp) <- useless, can be removed
mov r3,zero # r3 = running sum, starts at 0
mov fp,sp <- useless, can be removed
beq r4,zero,mul_30 # r4 == 0: skip the loop, the sum stays 0
mul_14:
andi r2,r4,1 # r2 = lowest bit of r4
cmpeq r2,r2,zero # r2 = 1 when that bit was clear, else 0
srli r4,r4,1 # shift r4 right so the next bit is tested next time round
bne r2,zero,mul_28 # bit was clear: skip the add
add r3,r3,r5 # bit was set: add the current (shifted) r5 into the sum
mul_28:
slli r5,r5,1 # r5 doubles for the next bit position
bne r4,zero,mul_14 # keep looping while r4 still has bits set
mul_30:
mov r2,r3 # copy the sum into r2 before returning
ldw fp,4(sp) <- useless, can be removed
addi sp,sp,8 <- useless, can be removed
ret
This routine can be optimized a lot. The most important optimization that can be done is removing the stack frame stuff. This removes 5 instructions without any problem. This together with two minor optimizations results in the following function:
# ----------------------------------------------------------------------
# __mulsi3 - software 32-bit multiply (shift-and-add), frameless leaf.
#
# Syntax: GNU as, Nios II
# In:     r4, r5   the two operands
# Out:    r2       r4 * r5, truncated to the 32-bit register width
# Clobb:  r3, r4, r5
# Stack:  none (no frame is set up, nothing is spilled)
# ----------------------------------------------------------------------
        .text
        .align  2
        .global __mulsi3                # must be global, otherwise the linker still pulls the library copy
        .type   __mulsi3, @function
__mulsi3:
        mov     r2, zero                # r2 = running sum
        beq     r4, zero, mul_30        # r4 == 0: product is 0, nothing to do
mul_14:
        andi    r3, r4, 1               # r3 = lowest remaining bit of r4
        srli    r4, r4, 1               # consume that bit
        beq     r3, zero, mul_28        # bit clear: no contribution
        add     r2, r2, r5              # bit set: add the shifted r5 into the sum
mul_28:
        slli    r5, r5, 1               # r5 doubles for the next bit position
        bne     r4, zero, mul_14        # loop ends as soon as r4 has no set bits left
mul_30:
        ret
        .size   __mulsi3, . - __mulsi3
Just put this function in an assembler source file and it should override the default implementation from the library.
For us this resulted in a performance gain of about 10%.
If this is still not fast enough, make a custom instruction and implement the function like this:
# __mulsi3 variant that hands the whole multiply to custom instruction 0.
# NOTE(review): presumably index 0 is wired to the multiplier unit in the VHDL listing that follows - confirm against the system configuration.
__mulsi3:
custom 0,r2,r4,r5 # custom instruction 0: operands r4 and r5, result written to r2
ret
I used an opencore multiplier and changed it a bit to do a 32x32 multiply in 4 clock cycles (see below). Currently the upper 32-bits of the result are thrown away, so if you also want fast multiplies with 'long long' results, some work still has to be done to make it possible to get the upper 32-bits and correctly implement the __muldi3 function.
The multiplier takes 708 LEs in my Cyclone EP1C3 design. I think this is not the best implementation of a multiplier, but it works fast and it only took a few hours to make.
Have fun with it,
Tim Brugman.
-------------------------------------------------------------------------------
-- File: mult_unit.vhd --
-- --
-- Copyright © Deversys, 2003 --
-- --
-- function : entity and architecture for multiplication algorithms testing --
-- --
-- Author: Vladimir V. Erokhin, PhD, --
-- e-mails: vladvas@deversys.com; vladvas@verilog.ru; --
-- --
--------------- Revision History ----------------------------------------
-- --
-- Date Engineer Description --
-- --
-- 041108 T. Brugman Customized to make a fast multiply --
-- for Altera NiosII --
-------------------------------------------------------------------------------
library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;
use IEEE.std_logic_unsigned.all;
-- Port interface of the multi-cycle 32x32 multiplier block.
-- The RTL architecture sequences four 16x16 partial products and
-- drives the low 32 bits of the product on `result`.
entity MULT_UNIT is
    port (
        clk    : in  std_logic;                      -- clock; state updates on the rising edge
        clk_en : in  std_logic;                      -- clock enable; state is frozen while '0'
        reset  : in  std_logic;                      -- asynchronous, active-high reset
        start  : in  std_logic;                      -- sampled in the idle state to begin a multiply
        dataa  : in  std_logic_vector(31 downto 0);  -- first operand
        datab  : in  std_logic_vector(31 downto 0);  -- second operand
        done   : out std_logic;                      -- set when the final partial product has been accumulated
        result : out std_logic_vector(31 downto 0)   -- low 32 bits of dataa * datab
    );
end MULT_UNIT;
architecture RTL of MULT_UNIT is

    -- Recursive unsigned multiplier built from 2x2 cells.
    --
    -- MULTIPLICAND, MULTIPLIER : operands of identical width; the width must
    --                            be 2 or halve evenly down to 2 (2, 4, 8, 16, ...).
    -- Returns                  : the full product, twice the operand width.
    --
    -- The operands are viewed through "WIDTH-1 downto 0" aliases so that the
    -- slicing below is correct whatever index range the caller's actuals have.
    function MULT_HIER(MULTIPLICAND: STD_LOGIC_VECTOR; MULTIPLIER: STD_LOGIC_VECTOR) return STD_LOGIC_VECTOR is
        constant WIDTH : natural := MULTIPLICAND'length;
        constant HALF  : natural := WIDTH / 2;
        alias MCD : STD_LOGIC_VECTOR(WIDTH - 1 downto 0) is MULTIPLICAND;
        alias MER : STD_LOGIC_VECTOR(MULTIPLIER'length - 1 downto 0) is MULTIPLIER;
        variable MUL_RESULT_1, MUL_RESULT_2, MUL_RESULT_3, MUL_RESULT_4: STD_LOGIC_VECTOR(WIDTH - 1 downto 0);
        variable RESULT: STD_LOGIC_VECTOR(WIDTH * 2 - 1 downto 0);
        variable HIGH_HALF_OF_MCD, LOW_HALF_OF_MCD, HIGH_HALF_OF_MER, LOW_HALF_OF_MER: STD_LOGIC_VECTOR(HALF - 1 downto 0);
        variable MUL_RESULT_1_4: STD_LOGIC_VECTOR(HALF * 3 - 1 downto 0);
        variable TEMP1, TEMP2: STD_LOGIC_VECTOR(1 downto 0);
        variable TEMP3: STD_LOGIC_VECTOR(2 downto 0);
    begin
        -- Fail loudly instead of producing a width mismatch deep in the recursion.
        assert (MULTIPLIER'length = WIDTH) and (WIDTH >= 2) and (WIDTH mod 2 = 0)
            report "MULT_HIER: operands must have the same even width (2, 4, 8, 16, ...)"
            severity failure;

        if WIDTH = 2 then
            -- 2x2 base cell: TEMP1 = MCD(0) * MER, TEMP2 = MCD(1) * MER.
            TEMP1 := (MER(1) and MCD(0)) & (MER(0) and MCD(0));
            TEMP2 := (MER(1) and MCD(1)) & (MER(0) and MCD(1));
            -- Upper three product bits = TEMP2 + TEMP1(1); the top bit is the
            -- carry out of that add, which only occurs when TEMP2 = "11".
            if TEMP1(1) = '1' then
                TEMP3 := ((MCD(1) and MER(1) and MER(0)) & (TEMP2 + 1));
            else
                TEMP3 := ('0' & TEMP2);
            end if;
            return TEMP3 & TEMP1(0);
        else
            HIGH_HALF_OF_MCD := MCD(WIDTH - 1 downto HALF);
            LOW_HALF_OF_MCD  := MCD(HALF - 1 downto 0);
            HIGH_HALF_OF_MER := MER(WIDTH - 1 downto HALF);
            LOW_HALF_OF_MER  := MER(HALF - 1 downto 0);

            -- Four half-width partial products.
            MUL_RESULT_1 := MULT_HIER(LOW_HALF_OF_MCD,  LOW_HALF_OF_MER);   -- low  * low
            MUL_RESULT_2 := MULT_HIER(LOW_HALF_OF_MCD,  HIGH_HALF_OF_MER);  -- low  * high
            MUL_RESULT_3 := MULT_HIER(HIGH_HALF_OF_MCD, LOW_HALF_OF_MER);   -- high * low
            MUL_RESULT_4 := MULT_HIER(HIGH_HALF_OF_MCD, HIGH_HALF_OF_MER);  -- high * high

            -- high*high sits above the upper half of low*low; the two cross
            -- terms are added at that alignment, and the lower half of
            -- low*low passes straight through.
            MUL_RESULT_1_4 := MUL_RESULT_4 & MUL_RESULT_1(WIDTH - 1 downto HALF);
            RESULT := (MUL_RESULT_1_4 + (('0' & MUL_RESULT_2) + MUL_RESULT_3)) & MUL_RESULT_1(HALF - 1 downto 0);
            return(RESULT);
        end if;
    end MULT_HIER;

    -- IDLE computes low*low (and waits for start); MUL1..MUL3 compute the
    -- remaining three 16x16 partial products, one per enabled clock.
    TYPE TState IS (IDLE, MUL1, MUL2, MUL3);
    SIGNAL state : TState;

    -- result_1..3 hold the partial products captured in IDLE, MUL1 and MUL2;
    -- mul16_result is the combinational output of the shared 16x16 multiplier.
    SIGNAL result_1, result_2, result_3, mul16_result : STD_LOGIC_VECTOR(31 downto 0);
    SIGNAL mul_result : STD_LOGIC_VECTOR(63 downto 0);

BEGIN

    -- Combinational: choose the operand halves for the current state and feed
    -- the single shared 16x16 multiplier. dataa/datab are not latched, so they
    -- have to stay stable from start until done.
    partial_product : process(state, dataa, datab)
        VARIABLE arg_a, arg_b : STD_LOGIC_VECTOR(15 downto 0);
    begin
        case state is
            when IDLE =>
                arg_a := dataa(15 downto 0);
                arg_b := datab(15 downto 0);
            when MUL1 =>
                arg_a := dataa(15 downto 0);
                arg_b := datab(31 downto 16);
            when MUL2 =>
                arg_a := dataa(31 downto 16);
                arg_b := datab(15 downto 0);
            when MUL3 =>
                arg_a := dataa(31 downto 16);
                arg_b := datab(31 downto 16);
        end case;
        mul16_result <= MULT_HIER(arg_a, arg_b);
    end process partial_product;

    -- Clocked: state sequencing and accumulation of the partial products.
    -- reset is asynchronous and only clears state and done.
    sequencer : process(reset, clk)
        VARIABLE result_1_4 : STD_LOGIC_VECTOR(47 downto 0);
    begin
        if reset = '1' then
            state <= IDLE;
            done  <= '0';
        elsif rising_edge(clk) then
            if clk_en = '1' then
                case state is
                    when IDLE =>
                        done     <= '0';
                        result_1 <= mul16_result;       -- low(a) * low(b)
                        if start = '1' then
                            state <= MUL1;
                        end if;
                    when MUL1 =>
                        result_2 <= mul16_result;       -- low(a) * high(b)
                        state    <= MUL2;
                    when MUL2 =>
                        result_3 <= mul16_result;       -- high(a) * low(b)
                        state    <= MUL3;
                    when MUL3 =>
                        -- mul16_result is high(a) * high(b) here; combine all
                        -- four partial products into the 64-bit product.
                        result_1_4 := mul16_result & result_1(31 downto 16);
                        mul_result <= (result_1_4 + (('0' & result_2) + result_3)) & result_1(15 downto 0);
                        done       <= '1';
                        state      <= IDLE;
                end case;
            end if;
        end if;
    end process sequencer;

    -- Only the low word of the product is exported.
    result <= mul_result(31 downto 0);

END RTL;