HW Multiplier

Honored Contributor

21 years ago

Hi to everyone who wants a fast multiply now,

Even without adding a hardware multiplier, it is possible to gain some performance on the multiply. I disassembled the multiply routine (__mulsi3):

# Disassembly of the library shift-and-add multiply: r2 = r4 * r5.
# r3 accumulates the product, r2 is used as a scratch register inside the
# loop and only receives the result at the very end.
# The "<- useless" markers are the author's annotations, not assembler syntax.
__mulsi3:
        addi    sp,sp,-8                   <- useless, can be removed
        stw     fp,4(sp)                   <- useless, can be removed
# r3 = 0: running product
        mov     r3,zero 
        mov     fp,sp                       <- useless, can be removed
# r4 == 0: skip the loop, the product is 0
        beq     r4,zero,mul_30
mul_14:
# r2 = low bit of r4, then inverted by cmpeq (r2 = 1 when the bit is clear)
        andi    r2,r4,1
        cmpeq   r2,r2,zero
# consume the bit that was just tested
        srli    r4,r4,1
# bit was clear: do not add
        bne     r2,zero,mul_28
        add     r3,r3,r5
mul_28:
# r5 doubles every iteration so it lines up with the next bit of r4
        slli    r5,r5,1
# keep going while r4 still has bits set
        bne     r4,zero,mul_14
mul_30:
# move the accumulated product into r2 before returning
        mov     r2,r3
        ldw     fp,4(sp)                <- useless, can be removed
        addi    sp,sp,8                <- useless, can be removed
        ret

This routine can be optimized a lot. The most important optimization that can be done is removing the stack frame stuff. This removes 5 instructions without any problem. This together with two minor optimizations results in the following function:

# __mulsi3 -- shift-and-add multiply without a stack frame.
#   in:       r4, r5   operands
#   out:      r2       r4 * r5
#   clobbers: r3, r4, r5
#
# The symbol must be global, otherwise it stays local to this object file and
# the linker keeps using the library implementation.
# Internal labels use the .L prefix so they stay out of the symbol table and
# cannot clash with other labels in the same source file.
        .global __mulsi3
        .type   __mulsi3, @function
__mulsi3:
        mov     r2,zero                 # running product = 0
        beq     r4,zero,.Lmulsi3_done   # r4 == 0: product is 0
.Lmulsi3_loop:
        andi    r3,r4,1                 # r3 = current low bit of r4
        srli    r4,r4,1                 # consume that bit
        beq     r3,zero,.Lmulsi3_shift  # bit clear: nothing to add
        add     r2,r2,r5                # bit set: add the shifted r5
.Lmulsi3_shift:
        slli    r5,r5,1                 # line r5 up with the next bit of r4
        bne     r4,zero,.Lmulsi3_loop   # loop while r4 still has bits set
.Lmulsi3_done:
        ret

Just put this function in an assembler source file and it should override the default implementation from the library.

For us this resulted in a performance gain of about 10%.

If this is still not good enough, make a custom instruction and implement the function like this:

# __mulsi3 implemented with a custom instruction instead of a software loop.
# Operands arrive in r4 and r5; the result is written to r2.
# NOTE(review): index 0 must match the slot the multiplier was assigned when
# it was added to the CPU -- confirm against the system configuration.
__mulsi3:
         custom  0,r2,r4,r5
         ret

I used an opencore multiplier and changed it a bit to do a 32x32 multiply in 4 clock cycles (see below). Currently the upper 32-bits of the result are thrown away, so if you also want fast multiplies with 'long long' results, some work still has to be done to make it possible to get the upper 32-bits and correctly implement the __muldi3 function.

The multiplier takes 708 LEs in my Cyclone EP1C3 design. I think this is not the best implementation of a multiplier, but it is fast and it only took a few hours to make.

Have fun with it,

Tim Brugman.

-------------------------------------------------------------------------------
--  File: mult_unit.vhd                                                      --
--                                                                           --
--  Copyright © Deversys, 2003                                             --
--                                                                           --
-- function : entity and architecture for multiplication algorithms testing  -- 
--                                                                           --
--  Author: Vladimir V. Erokhin, PhD,                                        --
--         e-mails: vladvas@deversys.com; vladvas@verilog.ru;                --
--                                                                           --
---------------  Revision History      ----------------------------------------
--                                                                           --
--     Date  Engineer               Description                        --
--                                                                           --
--      041108   T. Brugman               Customized to make a fast multiply --
--                                        for Altera NiosII                  --
-------------------------------------------------------------------------------
library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;
use IEEE.std_logic_unsigned.all;
-- MULT_UNIT: multi-cycle 32x32 multiplier returning the low 32 bits of the
-- product. A multiply is started with 'start' and completion is signalled
-- with 'done'.
entity MULT_UNIT is
    port (
        clk    : in  std_logic;                      -- clock, rising edge active
        clk_en : in  std_logic;                      -- state only advances while '1'
        reset  : in  std_logic;                      -- asynchronous reset, active high
        start  : in  std_logic;                      -- sampled while idle; '1' starts a multiply
        dataa  : in  std_logic_vector(31 downto 0);  -- first operand
        datab  : in  std_logic_vector(31 downto 0);  -- second operand
        done   : out std_logic;                      -- '1' once the product has been registered
        result : out std_logic_vector(31 downto 0)   -- bits 31..0 of dataa * datab
    );
end MULT_UNIT;
architecture RTL of MULT_UNIT is

    -- MULT_HIER: combinational unsigned multiplier built by recursive halving.
    --
    -- Both operands are split into a high and a low half, the four half-width
    -- partial products are computed by recursion and then summed with the
    -- appropriate alignment. The recursion bottoms out in a hand-written
    -- 2x2-bit multiplier.
    --
    -- As written the function relies on:
    --   * both operands having the same length,
    --   * that length being a power of two and at least 2,
    --   * both operands being indexed (length-1 downto 0).
    --
    -- Returns a vector twice as long as MULTIPLICAND.
    function MULT_HIER(MULTIPLICAND: STD_LOGIC_VECTOR; MULTIPLIER: STD_LOGIC_VECTOR) return STD_LOGIC_VECTOR is
        -- Partial products: 1 = low*low, 2 = low*high, 3 = high*low, 4 = high*high.
        variable MUL_RESULT_1, MUL_RESULT_2, MUL_RESULT_3, MUL_RESULT_4: STD_LOGIC_VECTOR(MULTIPLICAND'length - 1 downto 0);
        variable RESULT: STD_LOGIC_VECTOR(MULTIPLICAND'length * 2 - 1 downto 0);
        variable HIGH_HALF_OF_MCD, LOW_HALF_OF_MCD, HIGH_HALF_OF_MER, LOW_HALF_OF_MER: STD_LOGIC_VECTOR((MULTIPLICAND'length + 1)/2 - 1 downto 0);
        -- high*high product concatenated with the upper half of low*low.
        variable MUL_RESULT_1_4: STD_LOGIC_VECTOR((MULTIPLICAND'length + 1)/2*3 - 1 downto 0);
        variable TEMP1, TEMP2: STD_LOGIC_VECTOR(1 downto 0);
        variable TEMP3: STD_LOGIC_VECTOR(2 downto 0);
    begin
        if MULTIPLICAND'length = 2 then
            -- 2x2 base case: product = TEMP1 + 2 * TEMP2.
            TEMP1 := (MULTIPLIER(1) and MULTIPLICAND(0)) & (MULTIPLIER(0) and MULTIPLICAND(0));
            TEMP2 := (MULTIPLIER(1) and MULTIPLICAND(1)) & (MULTIPLIER(0) and MULTIPLICAND(1));
            if TEMP1(1) = '1' then
                -- TEMP1(1) has to be added into TEMP2; the carry out of that
                -- addition is set exactly when TEMP2 is "11".
                TEMP3 := ((MULTIPLICAND(1) and MULTIPLIER(1) and MULTIPLIER(0)) & (TEMP2 + 1));
            else
                TEMP3 := ('0' & TEMP2);
            end if;
            return TEMP3 & TEMP1(0);
        else
            HIGH_HALF_OF_MCD := MULTIPLICAND(MULTIPLICAND'high downto (MULTIPLICAND'high + 1)/2);
            LOW_HALF_OF_MCD  := MULTIPLICAND((MULTIPLICAND'high + 1)/2 - 1 downto 0);
            HIGH_HALF_OF_MER := MULTIPLIER(MULTIPLIER'high downto (MULTIPLIER'high + 1)/2);
            LOW_HALF_OF_MER  := MULTIPLIER((MULTIPLIER'high + 1)/2 - 1 downto 0);

            MUL_RESULT_1 := MULT_HIER(LOW_HALF_OF_MCD, LOW_HALF_OF_MER);
            MUL_RESULT_2 := MULT_HIER(LOW_HALF_OF_MCD, HIGH_HALF_OF_MER);
            MUL_RESULT_3 := MULT_HIER(HIGH_HALF_OF_MCD, LOW_HALF_OF_MER);
            MUL_RESULT_4 := MULT_HIER(HIGH_HALF_OF_MCD, HIGH_HALF_OF_MER);

            -- The low half of low*low passes straight to the result; everything
            -- above it is the sum of the two cross products and MUL_RESULT_1_4.
            MUL_RESULT_1_4 := MUL_RESULT_4 & MUL_RESULT_1(MULTIPLICAND'high downto (MULTIPLICAND'high + 1)/2);
            RESULT := (MUL_RESULT_1_4 + (('0' & MUL_RESULT_2) + MUL_RESULT_3)) & MUL_RESULT_1((MULTIPLICAND'high + 1)/2 - 1 downto 0);
            return RESULT;
        end if;
    end MULT_HIER;

    -- One state per 16x16 partial product; IDLE doubles as the first one.
    type TState is (IDLE, MUL1, MUL2, MUL3);
    signal state : TState;

    -- Registered partial products (low*low, low*high, high*low) and the
    -- combinational output of the shared 16x16 multiplier.
    signal result_1, result_2, result_3, mul16_result : STD_LOGIC_VECTOR(31 downto 0);
    signal mul_result : STD_LOGIC_VECTOR(63 downto 0);

begin

    -- Operand selection (combinational): routes the 16-bit halves that belong
    -- to the current state into the single shared 16x16 multiplier.
    -- dataa and datab are read directly in every state, so they must be held
    -- stable by the caller for the whole operation.
    operand_select : process(state, dataa, datab)
        variable arg_a, arg_b : STD_LOGIC_VECTOR(15 downto 0);
    begin
        case state is
            when IDLE =>
                arg_a := dataa(15 downto 0);
                arg_b := datab(15 downto 0);
            when MUL1 =>
                arg_a := dataa(15 downto 0);
                arg_b := datab(31 downto 16);
            when MUL2 =>
                arg_a := dataa(31 downto 16);
                arg_b := datab(15 downto 0);
            when MUL3 =>
                arg_a := dataa(31 downto 16);
                arg_b := datab(31 downto 16);
        end case;
        mul16_result <= MULT_HIER(arg_a, arg_b);
    end process operand_select;

    -- Sequencer (clocked): captures one partial product per enabled clock edge
    -- and assembles the 64-bit product on the fourth one.
    sequencer : process(reset, clk)
        variable result_1_4 : STD_LOGIC_VECTOR(47 downto 0);
    begin
        if reset = '1' then
            state <= IDLE;
            done  <= '0';
        elsif clk'event and clk = '1' then
            if clk_en = '1' then
                case state is
                    when IDLE =>
                        done <= '0';
                        -- low*low is captured on every idle cycle, so it is
                        -- already valid on the edge that sees start = '1'.
                        result_1 <= mul16_result;
                        if start = '1' then
                            state <= MUL1;
                        end if;
                    when MUL1 =>
                        result_2 <= mul16_result;
                        state <= MUL2;
                    when MUL2 =>
                        result_3 <= mul16_result;
                        state <= MUL3;
                    when MUL3 =>
                        -- mul16_result is high*high here. It only affects
                        -- bits 63..32 of mul_result, which are not output.
                        result_1_4 := mul16_result & result_1(31 downto 16);
                        mul_result <= (result_1_4 + (('0' & result_2) + result_3)) & result_1(15 downto 0);
                        done <= '1';
                        state <= IDLE;
                end case;
            end if;
        end if;
    end process sequencer;

    -- Only the low word of the product is exposed.
    result <= mul_result(31 downto 0);

end RTL;

Forum Discussion

Recent Discussions

NiosV and juart-terminal

Nios V license

NIOS does not start after SW download (timing issue?)

DK-DEV-AGI027-RA: JTAG chain broken after Nios V Hello, FPGA recovery fails

Ashling RISC Free IDE fails to download ELF file