-- Exemplar flavor VHDL source code for a 5X5 image convolving filter
-- with 8 bit coefficients and 8 bit unsigned inputs.  The architecture
-- accepts 5 bytes at a time, and converts the 5 bytes into 5 serial bit
-- streams at 80 Mbits/second, yielding a throughput of 10 million
-- convolutions/second with an ORCA 2C04-3S208 part.  Total logic count
-- is 70 PFU's out of a total of 100, thus giving a nominal gate count of
-- 3000 gates.  The speed is limited by the propagation delay of the carry
-- bit in the 8 bit shifting accumulators, and the distribution delay of the
-- 8 select signals (sel1, sel2, ...) which control the filter loading and
-- readout on the global tristate bus that collects the filter results.
--
-- Because these filters use a distributed arithmetic approach, the filter
-- coefficients are entirely contained within the various lookup tables.
-- In other filter design methods that implement multiplier-free filters,
-- the number of adders depends on the filter coefficients.
-- Copyright 1995,  John McCluskey
-- email:  J.McCluskey@ieee.org

-- This was compiled with CORE 2.1.10, using Xilinx 4000 as the target
-- technology, and the resulting XNF file was retargeted to ORCA using
-- ORCA Foundry 7.0  (formerly known as FPGA Foundry from Neocad).
 
LIBRARY ieee;
use ieee.std_logic_1164.all;
LIBRARY exemplar ;
use exemplar.exemplar_1164.all;

-- the "my_stuff" package contains the definitions of the synchronous
-- write-enable flip-flop procedures.
LIBRARY my_stuff; 
use my_stuff.my_stuff.all;

-- Top-level interface of the 5x5 convolver.  Five 8-bit input words are
-- presented in parallel; one 8-bit result is produced on RESULT.
entity convolver is
    port (
        -- the five parallel input bytes (8 bit unsigned per the file header)
        X1     : in    std_logic_vector(7 downto 0);
        X2     : in    std_logic_vector(7 downto 0);
        X3     : in    std_logic_vector(7 downto 0);
        X4     : in    std_logic_vector(7 downto 0);
        X5     : in    std_logic_vector(7 downto 0);
        -- result byte (noted as "signed_char" by the original author)
        RESULT : inout std_logic_vector(7 downto 0);
        -- true pulse when ready for the next words
        RDY    : out   std_logic;
        -- clock input
        CLK    : in    std_logic
    );
end convolver;

-- Architecture "Orca": declarative part.
-- Declares the coefficient types, the three externally supplied components
-- (their bodies are not in this file) and the signals shared by the control
-- logic, the serializing mux and the filter blocks.
architecture Orca of convolver is

-- signed 8 bit integer range; used for the filter coefficients / table entries
subtype byte is integer range -128 to 127;
-- unconstrained table of byte values; each filter block declares a 16 entry one
type lookup is array (natural range <>) of byte;


-- this is a 32 bit shift register with 4 taps
-- (Q8/Q16/Q24/Q32 are presumably the serial input delayed by 8, 16, 24 and
-- 32 clocks -- confirm against the sreg4 implementation)
component sreg4 port (
	CLK,SDIN: IN std_logic;
	Q8, Q16, Q24, Q32: OUT std_logic
	);
end component;

-- this is a loadable 8 bit shifting accumulator
-- (LD selects load instead of accumulate; exact shift/add behaviour is
-- defined in the acc8 implementation, which is not visible here)
component acc8 port (
	CLK, LD, D7, D6, D5, D4, D3, D2, D1, D0: IN std_logic;
	Q7, Q6, Q5, Q4, Q3, Q2, Q1, Q0: OUT std_logic
	);
end component;

-- this is a loadable 12 bit accumulator that doesn't shift
-- it has an input that is a 2's complement 8 bit number
component acc12l port(
	CLK, LD, A7, A6, A5, A4, A3, A2, A1, A0: IN std_logic;
	Q11, Q10, Q9, Q8, Q7, Q6, Q5, Q4, Q3, Q2, Q1, Q0: OUT std_logic
	);
end component;

-- 5 wires to carry the serialized input bytes
signal serial: std_logic_vector(4 downto 0);

-- CNT is the registered 3 bit frame counter, TCNT its next value (CNT + 1)
signal CNT, TCNT : std_logic_vector(2 downto 0);
-- Exemplar-specific synthesis attribute; presumably stops the tool from
-- using a module generator for the TCNT incrementer -- confirm in tool docs
attribute use_modgen: boolean;
attribute use_modgen of tcnt:signal is false;
-- registered copies of the inputs X1..X5, captured when grab is true
signal Y1, Y2, Y3, Y4, Y5: std_logic_vector(7 downto 0);
-- int_bus is the shared tri-state bus the filter blocks drive in turn;
-- lat_bus is its registered copy feeding the final accumulator
signal int_bus, lat_bus: std_logic_vector(7 downto 0); -- internal bus
-- first is the registered output of the serializing mux, second is first
-- delayed by one more register stage
signal first, second: std_logic_vector(0 to 4);  -- first serial bits appear here
signal answer: std_logic_vector(11 downto 0);  -- final 12 bit answer
-- sel is the once-per-frame strobe; sel0..sel7 and grab are successively
-- registered copies of it
signal grab, sel, sel0,sel1,sel2,sel3,sel4,sel5,sel6,sel7: std_logic; 

begin

-- Frame counter: CNT is registered from TCNT = CNT + 1, so the 3 bit value
-- wraps every 8 clocks.  The "+" on std_logic_vector is presumably supplied
-- by the exemplar_1164 package -- confirm.
-- NOTE(review): no reset or initial value for CNT is visible here; check how
-- dff_v / the target technology initialises it before trusting simulation.
TCNT <= CNT + "001";  -- basic 8 cycle counter drives everything
dff_v(TCNT,CLK,CNT);

-- sel is high for the one count in eight where CNT = "111".  The chain of
-- dff calls below passes that pulse from stage to stage, giving the select
-- strobes sel0..sel7.  Assuming dff(d, clk, q) is a plain D flip-flop (it is
-- defined in my_stuff, not shown), each strobe is one clock later than the
-- previous one.  There is no inverted feedback, so despite the original
-- wording this is a delayed one-hot strobe chain rather than a true Johnson
-- counter.
sel <= '1' when CNT="111" else '0';  -- one pulse per frame, seeds the select chain
dff(sel,clk,sel0);  dff(sel0,clk,sel1);
dff(sel1,clk,sel2);  dff(sel2,clk,sel3);
dff(sel3,clk,sel4); dff(sel4,clk,sel5);
dff(sel5,clk,sel6); dff(sel6,clk,sel7);

-- grab is sel7 registered once more; it gates the capture of the five input
-- words into Y1..Y5.
dff(sel7,clk,grab);
dff_enable(X1,grab,CLK,Y1);  -- enable latching the new words when grab is true
dff_enable(X2,grab,CLK,Y2);
dff_enable(X3,grab,CLK,Y3);
dff_enable(X4,grab,CLK,Y4);
dff_enable(X5,grab,CLK,Y5);  -- this uses up 40 flip flops
-- RDY is grab registered once more
dff(grab,clk,RDY);  -- signal we have latched the 5 words and are ready for more.

-- 5 wide, 8 input multiplexer built from tri-state drivers, followed by the
-- staggered delay lines that feed the five serial wires.
mux1: block
    -- select strobes gathered in bit order: strobe(k) enables bit k
    signal strobe  : std_logic_vector(0 to 7);
    -- shared tri-state bus; element 0 carries Y5, element 4 carries Y1
    signal mux_bus : std_logic_vector(0 to 4);
    -- extra delay stages for serial(1) .. serial(4)
    signal lag1    : std_logic;
    signal lag2    : std_logic_vector(0 to 1);
    signal lag3    : std_logic_vector(0 to 2);
    signal lag4    : std_logic_vector(0 to 3);
begin
    -- bit 0 goes out on sel1, bit 1 on sel2, ... bit 6 on sel7, bit 7 on sel0
    strobe <= sel1 & sel2 & sel3 & sel4 & sel5 & sel6 & sel7 & sel0;

    -- one tri-state driver group per bit position; a group drives the bus
    -- only while its strobe is high and is high impedance otherwise
    bit_drivers: for k in 0 to 7 generate
        mux_bus <= Y5(k) & Y4(k) & Y3(k) & Y2(k) & Y1(k)
                   when strobe(k) = '1' else (others => 'Z');
    end generate;

    -- register the tri-state bus into "first", then into "second"
    dff_v(mux_bus, clk, first);
    dff_v(first, clk, second);

    -- serial(0): no additional delay
    dff(second(0), clk, serial(0));

    -- serial(1): delayed by 1 bit
    dff(second(1), clk, lag1);
    dff(lag1, clk, serial(1));

    -- serial(2): delayed by 2 bits
    dff(second(2), clk, lag2(0));
    dff(lag2(0), clk, lag2(1));
    dff(lag2(1), clk, serial(2));

    -- serial(3): delayed by 3 bits
    dff(second(3), clk, lag3(0));
    dff_v(lag3(0 to 1), clk, lag3(1 to 2));
    dff(lag3(2), clk, serial(3));

    -- serial(4): delayed by 4 bits
    dff(second(4), clk, lag4(0));
    dff_v(lag4(0 to 2), clk, lag4(1 to 3));
    dff(lag4(3), clk, serial(4));

    -- this block uses up 40 tri-state buffers and 20 flip flops
end block;


-- 4 tap FIR filter section.  The input data is assumed to be non-negative
-- and the filter coefficients live entirely in the constant lookup table.
-- Unlike the other 4 tap sections this one needs no 32 bit shift register:
-- its table is addressed directly by first(0 to 3).
filt0: block
    constant coef_rom : lookup(0 to 15) := (
        -17,   39,  -3,  34,
         97,  -34,   8, -72,
         81, -102,  32,  99,
         51,  -43,   7,  19
    );

    signal rom_out : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- the LSBs reach the lookup table while sel2='1'
    rom_out <= int2evec(coef_rom(evec2int(first(0 to 3))), 8);
    dff_v(rom_out, clk, rom_reg);

    -- the looked-up value for the LSBs reaches the accumulator while sel3='1',
    -- which is why sel3 is the load strobe
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel3,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus; this is the very
    -- cycle in which the next frame's LSB value is loaded above
    int_bus <= acc_out when sel3 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- this filter coefficient (tap) uses the output from second(4)
-- with the resulting answer appearing at sel4
filt6: block  -- this block implements a 1 tap FIR filter
-- with the assumption that the input data is non-negative
-- there is only 1 filter coefficient

-- del carries the serial input bit for this tap
signal del: std_logic;
-- tbus: coefficient-or-zero term, dbus: registered term, zbus: accumulator output
signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
constant coef: byte := -34;
begin

-- FIX: del was declared and tested below but had no driver, so the term was
-- always zero and this tap never contributed to the result.  Drive it from
-- second(4): that bit holds the LSB while sel3='1', so the registered term
-- reaches the accumulator while sel4='1', matching the load strobe below.
del <= second(4);

-- a 1 tap "lookup": the coefficient when the serial bit is set, zero otherwise
tbus <= int2evec( coef, 8) when del='1' else (others => '0') ;
dff_v(tbus,clk,dbus);

-- loadable 8 bit shifting accumulator, loaded while sel4='1'
a1: acc8 port map( CLK=> CLK, LD => sel4, D7 => dbus(7), D6 => dbus(6),
	D5 => dbus(5), D4 => dbus(4), D3 => dbus(3), D2 => dbus(2),
	D1 => dbus(1), D0 => dbus(0), Q7 => zbus(7), Q6 => zbus(6), 
	Q5 => zbus(5), Q4 => zbus(4), Q3 => zbus(3), Q2 => zbus(2),
	Q1 => zbus(1), Q0 => zbus(0) );
-- now drive the internal tri-state bus when the answer is ready
int_bus <= zbus when sel4='1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers



-- 4 tap FIR filter section fed from serial(0).  The input data is assumed to
-- be non-negative and the filter coefficients live entirely in the constant
-- lookup table.
filt1: block
    constant coef_rom : lookup(0 to 15) := (
        -14,  60, -43,   32,
         17,  -4,  88, -106,
        111, -33,  14,    0,
         55, -66,  77,   89
    );

    signal tap_bits : std_logic_vector(3 downto 0);  -- the four shift register taps
    signal rom_out  : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg  : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out  : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- 32 bit shift register; the Q8 tap is the most significant table address bit
    s1: sreg4 port map(
        CLK  => CLK,
        SDIN => serial(0),
        Q8   => tap_bits(3),
        Q16  => tap_bits(2),
        Q24  => tap_bits(1),
        Q32  => tap_bits(0)
    );

    rom_out <= int2evec(coef_rom(evec2int(tap_bits)), 8);
    dff_v(rom_out, clk, rom_reg);

    -- this section loads on sel5 (filt0 loads on sel3)
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel5,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus
    int_bus <= acc_out when sel5 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- 4 tap FIR filter section fed from serial(1).  The input data is assumed to
-- be non-negative and the filter coefficients live entirely in the constant
-- lookup table.
filt2: block
    constant coef_rom : lookup(0 to 15) := (
        -33,  10, -43,    2,
         17, -40,   8, -106,
         10, -63,  14,   33,
         55, -66,   7,    9
    );

    signal tap_bits : std_logic_vector(3 downto 0);  -- the four shift register taps
    signal rom_out  : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg  : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out  : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- 32 bit shift register; the Q8 tap is the most significant table address bit
    s1: sreg4 port map(
        CLK  => CLK,
        SDIN => serial(1),
        Q8   => tap_bits(3),
        Q16  => tap_bits(2),
        Q24  => tap_bits(1),
        Q32  => tap_bits(0)
    );

    rom_out <= int2evec(coef_rom(evec2int(tap_bits)), 8);
    dff_v(rom_out, clk, rom_reg);

    -- this section loads on sel6
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel6,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus
    int_bus <= acc_out when sel6 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- 4 tap FIR filter section fed from serial(2).  The input data is assumed to
-- be non-negative and the filter coefficients live entirely in the constant
-- lookup table.
filt3: block
    constant coef_rom : lookup(0 to 15) := (
          -1,   0,  43,  15,
           7, -47,  80, -16,
          11, -70,   1,  83,
        -102,  72, 111,   3
    );

    signal tap_bits : std_logic_vector(3 downto 0);  -- the four shift register taps
    signal rom_out  : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg  : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out  : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- 32 bit shift register; the Q8 tap is the most significant table address bit
    s1: sreg4 port map(
        CLK  => CLK,
        SDIN => serial(2),
        Q8   => tap_bits(3),
        Q16  => tap_bits(2),
        Q24  => tap_bits(1),
        Q32  => tap_bits(0)
    );

    rom_out <= int2evec(coef_rom(evec2int(tap_bits)), 8);
    dff_v(rom_out, clk, rom_reg);

    -- this section loads on sel7
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel7,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus
    int_bus <= acc_out when sel7 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- 4 tap FIR filter section fed from serial(3).  The input data is assumed to
-- be non-negative and the filter coefficients live entirely in the constant
-- lookup table.
filt4: block
    constant coef_rom : lookup(0 to 15) := (
        -23,  32,  -1, 123,
          2,  84,   8, 106,
         15, -88,  44,  30,
          5, -66,  17,  16
    );

    signal tap_bits : std_logic_vector(3 downto 0);  -- the four shift register taps
    signal rom_out  : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg  : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out  : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- 32 bit shift register; the Q8 tap is the most significant table address bit
    s1: sreg4 port map(
        CLK  => CLK,
        SDIN => serial(3),
        Q8   => tap_bits(3),
        Q16  => tap_bits(2),
        Q24  => tap_bits(1),
        Q32  => tap_bits(0)
    );

    rom_out <= int2evec(coef_rom(evec2int(tap_bits)), 8);
    dff_v(rom_out, clk, rom_reg);

    -- this section loads on sel0
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel0,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus
    int_bus <= acc_out when sel0 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- 4 tap FIR filter section fed from serial(4).  The input data is assumed to
-- be non-negative and the filter coefficients live entirely in the constant
-- lookup table.
filt5: block
    constant coef_rom : lookup(0 to 15) := (
         14, 101,  -3,   15,
         17,  -4,  88, -106,
         10, -33,   1,    0,
          5, -66,   7,  -99
    );

    signal tap_bits : std_logic_vector(3 downto 0);  -- the four shift register taps
    signal rom_out  : std_logic_vector(7 downto 0);  -- table output
    signal rom_reg  : std_logic_vector(7 downto 0);  -- registered table output
    signal acc_out  : std_logic_vector(7 downto 0);  -- accumulator output
begin
    -- 32 bit shift register; the Q8 tap is the most significant table address bit
    s1: sreg4 port map(
        CLK  => CLK,
        SDIN => serial(4),
        Q8   => tap_bits(3),
        Q16  => tap_bits(2),
        Q24  => tap_bits(1),
        Q32  => tap_bits(0)
    );

    rom_out <= int2evec(coef_rom(evec2int(tap_bits)), 8);
    dff_v(rom_out, clk, rom_reg);

    -- this section loads on sel1
    a1: acc8 port map(
        CLK => CLK,
        LD  => sel1,
        D7 => rom_reg(7), D6 => rom_reg(6), D5 => rom_reg(5), D4 => rom_reg(4),
        D3 => rom_reg(3), D2 => rom_reg(2), D1 => rom_reg(1), D0 => rom_reg(0),
        Q7 => acc_out(7), Q6 => acc_out(6), Q5 => acc_out(5), Q4 => acc_out(4),
        Q3 => acc_out(3), Q2 => acc_out(2), Q1 => acc_out(1), Q0 => acc_out(0)
    );

    -- put the finished result on the internal tri-state bus
    int_bus <= acc_out when sel1 = '1' else (others => 'Z');
end block;  -- the filter uses 6 PLC's and 8 tri-state buffers

-- Register the shared tri-state bus before the final summation.
dff_v(int_bus, clk, lat_bus);
-- The first valid partial result is available while sel4='1'.
-- NOTE(review): in this file int_bus has drivers for sel3, sel4, sel5, sel6,
-- sel7, sel0 and sel1 but none for sel2, so the value registered into
-- lat_bus for the following cycle is a floating bus sample -- confirm that
-- the acc12l / result capture timing never uses it.

-- Final summation in the 12 bit non-shifting accumulator (loaded on sel4);
-- only the top 8 bits of the 12 bit answer are kept.
acc1: acc12l port map (
    CLK => CLK,
    LD  => sel4,
    A7  => lat_bus(7), A6  => lat_bus(6), A5 => lat_bus(5), A4 => lat_bus(4),
    A3  => lat_bus(3), A2  => lat_bus(2), A1 => lat_bus(1), A0 => lat_bus(0),
    Q11 => answer(11), Q10 => answer(10), Q9 => answer(9),  Q8 => answer(8),
    Q7  => answer(7),  Q6  => answer(6),  Q5 => answer(5),  Q4 => answer(4),
    Q3  => answer(3),  Q2  => answer(2),  Q1 => answer(1),  Q0 => answer(0)
);

-- The result byte is captured in two halves on consecutive select phases.
dff_enable(answer(7 downto 4),  sel2, clk, result(3 downto 0));  -- output low nibble
dff_enable(answer(11 downto 8), sel3, clk, result(7 downto 4));  -- output high nibble

end Orca;

-- NOTE: the HTML/JavaScript advertisement markup that the hosting web page
-- appended after the end of the architecture has been removed; it was not
-- part of the VHDL source and prevented the file from compiling.