%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPLETE.BIB
%
% Brigham Young University Reconfigurable Logic Lab Bibliography
%
% This bibliography, in BibTeX format, represents the current research
% in the field of 'Reconfigurable Computing'. The bibliography is
% continually expanding and additions or modifications are welcome.
% Please direct all correspondence regarding this bibliography to
% wirthlim(at)fpga.ee.byu.edu
%
% Last Modified: $Date: 1999/08/17 16:22:15 $
% Last Edited By: $Author: grahamp $
%
% The 'key' field is used with each entry in the bibliography to help
% sort through the entries. The keywords used to date are the
% following:
%
% system: includes articles that describe a reconfigurable computing
%         system or discusses architectural features of reconfigurable
%         computing machines,
% tool:   includes articles that discuss any aspect of reconfigurable
%         computing tools such as high-level programming environments,
%         hardware synthesis tools, and novel design approaches,
% app:    includes articles that discuss an application proposed, tested
%         or implemented on a reconfigurable computing platform,
% device: includes articles that discuss devices used in
%         reconfigurable computing (FPGAs and other programmable logic).
% rtr:    includes articles that describe a reconfigurable computing
%         system, tool, device or application that uses the feature of
%         run-time reconfiguration.
%
% Each entry is given an index number, which is used as a means of
% reference in BYU's reconfigurable logic laboratory library.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%                             Index Number: 1

@article{athanas-silverman:93,
  author   = {P. M. Athanas and H. F. Silverman},
  title    = {Processor Reconfiguration through Instruction-set
              Metamorphosis},
  journal  = {IEEE Computer},
  volume   = {26},
  number   = {3},
  pages    = {11--18},
  month    = mar,
  year     = {1993},
  key      = {system,tool},
  abstract = {The processor reconfiguration through instruction-set
              metamorphosis (PRISM) general-purpose architecture, which
              speeds up computationally intensive tasks by augmenting the
              core processor's functionality with new operations, is
              described. The PRISM approach adapts the configuration and
              fundamental operations of a core processing system to the
              computationally intensive portions of a targeted
              application. PRISM-1, an initial prototype system, is
              described, and experimental results that demonstrate the
              benefits of the PRISM concept are presented.},
}

%                             Index Number: 2


@InProceedings{ebeling-boriello:91,
  author =       "C. Ebeling and G. Borriello and S. A. Hauck and D.
                 Song and E. A. Walkup",
  title =        "{TRIPTYCH}: a New {FPGA} Architecture",
  key =          "device",
  booktitle =    "{FPGAs}. International Workshop on Field Programmable
                 Logic and Applications",
  year =         "1991",
  month =        sep,
  pages =        "75--90",
  abstract =     "Existing FPGA architectures can be classified along
                 two dimensions: reprogrammable vs. one-time
                 programmable and general-purpose vs. domain specific.
                 The most challenging class of FPGA architectures to
                 design is the reprogrammable, general-purpose FPGA, of
                 which Xilinx is the most well-known example. In this
                 paper we describe Triptych, a new FPGA architecture
                 that addresses two problems of current reprogrammable
                 FPGAs: the large delays incurred in composing large
                 functions and the strict division between routing and
                 logic resources. Our studies indicate that Triptych is
                 more area-efficient than current architectures and has
                 comparable delay characteristics for a large range of
                 circuits that include both data-path elements and
                 control logic.",
}

%                             Index Number: 3

@Article{monaghan-noakes:92,
  author =       "S. Monaghan and P. D. Noakes",
  title =        "Reconfigurable Special Purpose Hardware for Scientific
                 Computation and Simulation",
  key =          "app",
  journal =      "Computing \& Control Engineering Journal",
  year =         "1992",
  month =        sep,
  pages =        "225--234",
  abstract =     "Xilinx Field Programmable Gate Arrays (FPGAs) are used
                 to implement reconfigurable special purpose computing
                 hardware for computationally intensive many-body
                 problems in physics and mathematics. The inexpensive
                 PC-based design environment used for this work is
                 described, and the performance for several different
                 problems of the resulting reconfigurable hardware is
                 compared with that of some general purpose computers.
                 The merits of using FPGAs in special purpose
                 computational hardware are outlined.",
}

%                             Index Number: 4


@Article{kean-gray:90,
  author =       "T. Kean and J. Gray",
  title =        "Configurable Hardware: Two Case Studies of Micro-Grain
                 Computation",
  key =          "device, app",
  journal =      "Journal of VLSI Signal Processing",
  volume =       "2",
  number =       "1",
  year =         "1990",
  month =        sep,
  pages =        "9--16",
  abstract =     "This paper describes a new VLSI
                 architecture---Configurable Array Logic (CAL) which, at
                 its lowest level, can be programmed electrically to
                 implement any circuit composed of logic gates. At
                 higher levels the technology provides a medium for the
                 direct implementation of algorithms. It particularly
                 addresses systolic and cellular automaton algorithms
                 where the basic computational elements perform
                 computations unsuited to conventional processors.",
}

%                             Index Number: 5

@InProceedings{chow-rose:93,
  author =       "P. Chow and S. O. Seo and K. Chung and G. Paez and J.
                 Rose",
  title =        "A High-Speed {FPGA} Using Programmable Mini-tiles",
  key =          "device",
  booktitle =    "Research on Integrated Systems: Proceedings of the
                 1993 Symposium",
  editor =       "G. Borriello and C. Ebeling",
  year =         "1993",
  pages =        "103--122",
  abstract =     "Field-Programmable Gate Arrays (FPGAs) are now a
                 recognized technology for the implementation of digital
                 systems, but they suffer from reduced speed and logic
                 density compared to Mask-Programmed Gate Arrays. Many
                 studies have been performed concerning the effect of an
                 FPGA's architecture on its speed and density. In this
                 paper we describe how these studies are used in an
                 actual implementation of a high-performance FPGA. The
                 architecture of the FPGA logic block was determined
                 through an experimental process using custom-built CAD
                 tools. The result is a logic block that is an
                 asymmetric tree of four-input lookup tables that are
                 hardwired together. A segmented routing architecture,
                 also tuned using experiments, is employed to improve
                 the speed of the interconnect. To address the problems
                 of full-custom design, a novel layout style for FPGAs
                 is proposed. It can be likened to the technique used in
                 PLAs, in which a 'mini-tile' contains a portion of most
                 components in the logic tile. The mini-tile is
                 optimized for layout density and speed, and placed into
                 a 4x4 array, where it is then customized by adding vias
                 to obtain the desired hardwired connections. As well as
                 providing ease of layout, this technique gives the
                 capability to easily change the hardwired connections
                 in the logic block architecture, and the segmentation
                 length distribution in the routing architecture.",
}

%                             Index Number: 6

@inproceedings{bertin-roncin:93,
  author    = {P. Bertin and D. Roncin and J. Vuillemin},
  title     = {Programmable Active Memories: a Performance Assessment},
  booktitle = {Research on Integrated Systems: Proceedings of the 1993
               Symposium},
  editor    = {G. Borriello and C. Ebeling},
  pages     = {88--102},
  year      = {1993},
  key       = {system, app},
  abstract  = {We present some quantitative performance measurements for
               the computing power of Programmable Active Memories (PAM),
               as introduced by [2]. Based on Field Programmable Gate
               Array (FPGA) technology, the PAM is a universal hardware
               co-processor closely coupled to a standard host computer.
               The PAM can speed up many critical software applications
               running on the host, by executing part of the computations
               through a specific hardware design. The performance
               measurements presented are based on two PAM architectures
               and ten specific applications, drawn from arithmetics,
               algebra, geometry, physics, biology, audio and video. Each
               of these PAM designs proves as fast as any reported
               hardware or super-computer for the corresponding
               application. In cases where we could bring some genuine
               algorithmic innovation into the design process, the PAM
               has proved an order of magnitude faster than any
               previously existing system (see [19] and [18]).},
}

%                             Index Number: 7

@InProceedings{linde-nordstrom:92,
  author =       "A. Linde and T. Nordstrom and M. Taveniku",
  title =        "Using {FPGA}s to Implement a Reconfigurable Highly
                 Parallel Computer",
  key =          "system",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "199--210",
  abstract =     "With the arrival of large Field Programmable Gate
                 Arrays (FPGAs) it is possible to build an entire
                 computer using only FPGA and memory. In this paper we
                 share some experience from building a highly parallel
                 computer using this concept. Even if today's FPGAs are
                 of considerable size, each processor must be relatively
                 simple if a highly parallel computer is to be
                 constructed from them. Based on our experience of other
                 parallel computers and thorough studies of the intended
                 applications, we think it is possible to build very
                 powerful and efficient computers using bit-serial
                 processing elements with SIMD (Single Instruction
                 stream, Multiple Data streams) control. A major benefit
                 of using FPGAs is the fact that different architectural
                 variations can easily be tested and evaluated on real
                 applications. In the primary application area, which is
                 artificial neural networks, the gains of extensions
                 like bit-serial multipliers or counters can quickly be
                 found. A concrete implementation of a processor array,
                 using Xilinx FPGAs is described in this paper. To get
                 efficient usage and high performance with the FPGA
                 circuits signal flow plays an important role. As the
                 current implementation of the Xilinx EDA software does
                 not support that design issue, the signal flow design
                 has to be made by hand. The processing elements are
                 simple and regular which makes it easy to implement
                 them with the XACT Editor. This gives high performance,
                 up to 40--50 MHz.",
}

% Notes on the entry above (linde-nordstrom:92):
%   - the year given is the year of the conference; year of publication: 1993
%   - the address given is the place of the conference; place of
%     publication: Berlin, Germany
%   - date of conference: 31 Aug.--2 Sept. 1992

%                             Index Number: 8

@inproceedings{furtek:93,
  author    = {F. Furtek},
  title     = {A Field-Programmable Gate Array for Systolic Computing},
  booktitle = {Research on Integrated Systems: Proceedings of the 1993
               Symposium},
  editor    = {G. Borriello and C. Ebeling},
  pages     = {183--199},
  year      = {1993},
  key       = {device, app},
  abstract  = {There is a growing awareness that reconfigurable logic, in
               the form of SRAM-based field-programmable gate arrays
               (FPGA's), is an ideal vehicle for implementing a wide
               range of compute-intensive algorithms. The CLi6000 series
               of SRAM FPGA's from Concurrent Logic is especially well
               suited to the special needs of this area. We describe the
               architecture of the CLi6000 series of SRAM-based FPGA's,
               emphasizing those features that support efficient
               implementation of pipelined arithmetic circuits. These
               features are illustrated through a massively parallel,
               highly pipelined algorithm for motion estimation, an
               especially compute-intensive algorithm used in digital
               video compression.},
}

%                             Index Number: 9

@inproceedings{gray-kean:89,
  author    = {J. P. Gray and T. A. Kean},
  title     = {Configurable Hardware: {A} New Paradigm for Computation},
  booktitle = {Decennial CalTech Conference on VLSI},
  pages     = {277--293},
  address   = {Pasadena, CA},
  month     = mar,
  year      = {1989},
  key       = {device, app},
  abstract  = {At present there are two main methods of implementing
               algorithms: interpretation of a data stream representing a
               program by an active processing unit (software) and
               interconnection of active logic elements (hardware). In
               one case the computation performed is dependent on data
               stored in memory and in the other on the interconnection
               between a set of physical devices (transistors). Both
               paradigms can be shown, given reasonable definitions, to
               be essentially equivalent in terms of the functions they
               can compute (see, for example, [Savage76]). In this paper
               we will make the case for a third paradigm: Configurable
               Hardware in which the interconnection between active logic
               elements, and hence the function computed, is dependent on
               a control store.},
}

%                            Index Number: 10

@inproceedings{arnold:93,
  author    = {J. M. Arnold},
  title     = {The {Splash} 2 Software Environment},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {88--93},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system, app},
  abstract  = {Splash 2 is an attached special purpose parallel processor
               in which the computing elements are user programmable FPGA
               devices. The architecture of Splash 2 is designed to
               accelerate the solution of problems which exhibit at least
               modest amounts of temporal or data parallelism.
               Applications are developed by writing behavioral
               descriptions of algorithms in VHDL, which are then
               iteratively refined and debugged within the Splash 2
               simulator. Once an application is determined to be
               functionally correct in simulation, it is compiled to a
               gate list and optimized by logic synthesis. The gate list
               is then mapped onto the FPGA architecture by automatic
               placement and routing tools to form a loadable FPGA object
               module. A C language library and a symbolic debugger
               comprise the execution environment. The Splash 2 system
               has been shown to be effective on a variety of
               applications, including text searching, sequence analysis,
               and image processing.},
}

%                            Index Number: 11

@inproceedings{pryor-thistle:93,
  author    = {D. V. Pryor and M. R. Thistle and N. Shirazi},
  title     = {Text Searching On {Splash} 2},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {172--177},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {app},
  abstract  = {This paper proposes a flexible, reprogrammable hardware
               solution to the acceleration of text-based keyword search
               problems. In these problems, a stream of input text is
               checked against a known list of keywords (a dictionary)
               for occurrences of those keywords in the text. Our
               solution employs an attached processor called Splash 2,
               which exploits the speed and reconfigurability of Field
               Programmable Gate Array technology. The Splash 2 system
               was designed and built at the SRC for a wide variety of
               applications. A Splash 2 system is comprised of an
               interface board to a Sun Sparc-2 host and up to 16 Splash
               boards, each of which contains 16 Xilinx 4010 FPGAs
               interconnected in a linear array and also through a 16-way
               full crossbar switch. Each Xilinx chip is coupled with a 4
               Mbit static RAM through a dedicated interface. The text
               searching program implemented on a one-board Splash 2
               system is capable of processing text at an estimated rate
               of 50 million characters per second.},
}

%                            Index Number: 12

@InProceedings{arnold-buell:92,
  author =       "J. M. Arnold and D. A. Buell and E. G. Davis",
  title =        "{Splash} 2",
  key =          "system",
  booktitle =    "Proceedings of the 4th Annual ACM Symposium on
                 Parallel Algorithms and Architectures",
  year =         "1992",
  month =        jun,
  pages =        "316--324",
  abstract =     "The Splash attached processor board (referred to as
                 Splash 1) was designed and built at the SRC to provide
                 very high performance on a range of bit-processing
                 problems. It proved to be highly successful;
                 notwithstanding the known dangers of Second System
                 Syndrome, a follow-on system, Splash 2, is being
                 designed and built. This paper describes Splash 2,
                 compares it with Splash 1 and discusses both its
                 programming and two algorithmic applications.",
}

%                            Index Number: 13

@inproceedings{arnold-buell:93,
  author    = {J. M. Arnold and D. A. Buell and E. G. Davis},
  title     = {{VHDL} Programming on {Splash} 2},
  booktitle = {More {FPGAs}: Proceedings of the 1993 International
               Workshop on Field-Programmable Logic and Applications},
  pages     = {182--191},
  address   = {Oxford, England},
  month     = sep,
  year      = {1993},
  key       = {system},
  abstract  = {Splash 2 is an attached special purpose parallel processor
               in which the computing elements are user programmable FPGA
               devices. The programming environment for Splash 2 is based
               upon the VHSIC Hardware Description Language (VHDL),
               simulation and logic synthesis. Application programs for
               Splash 2 are developed by writing behavioral descriptions
               of algorithms in VHDL which are then iteratively refined
               and debugged within the Splash 2 simulator. Logic
               synthesis and automatic placement and routing techniques
               are used to compile the VHDL applications into loadable
               FPGA object modules.},
}

%                            Index Number: 14

@Article{hartenstein-hirschbiel:91,
  author =       "R. W. Hartenstein and A. G. Hirschbiel and M.
                 Riedmuller and K. Schmidt and M. Weber",
  title =        "A Novel {ASIC} Design Approach Based on a New Machine
                 Paradigm",
  key =          "system",
  journal =      "IEEE Journal of Solid-State Circuits",
  volume =       "26",
  number =       "7",
  year =         "1991",
  month =        jul,
  pages =        "975--989",
  abstract =     "This paper introduces a new design methodology for
                 rapid implementation of cheap high-performance ASIC's.
                 The method described here derives from high-level
                 algorithm specifications or from high-level source
                 programs not only the target hardware, but (in contrast
                 to silicon compilers) at the same time also the machine
                 code to run it. The new method is based on a novel
                 sequential machine paradigm where execution is used
                 (being orders of magnitude more efficient) instead of
                 simulation and where programmers may do the design job,
                 rather than real hardware designers. The paper
                 illustrates that for a very large class of commercially
                 important algorithms (DSP, graphics, image processing
                 and many others) this paradigm is orders of magnitude
                 more efficient than the von Neumann paradigm. Compared
                 to von-Neumann-based implementations, acceleration
                 factors of up to more than 2000 have been obtained
                 experimentally. The performance of ASIC's obtained by
                 this new methodology is mostly competitive with ASIC
                 designs obtained in the much slower and much more
                 expensive {"}traditional{"} way. As a byproduct the new
                 methodology also supports the automatic generation of
                 universal accelerators for coprocessor use in
                 workstations, etc., such as, e.g., to accelerate EDA
                 tools. It is the goal of this paper to explain the
                 highly efficient application of the xputer paradigm,
                 rather than to introduce its hardware implementation.
                 It is the goal of this paper to illustrate the
                 innovative power of this paradigm, and its potential as
                 a major step in progress toward systematically deriving
                 ASIC designs from algorithm specifications.",
}

%                            Index Number: 15

@InProceedings{raimbault-lavenier:93,
  author =       "F. Raimbault and D. Lavenier and S. Rubini and B.
                 Pottier",
  title =        "Fine Grain Parallelism on a {MIMD} Machine Using
                 {FPGA}s",
  key =          "system",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1993",
  month =        apr,
  address =      "Napa, CA",
  pages =        "2--8",
  abstract =     "Current MIMD machines are used for coarse
                 grain-parallelism and also offer message passing
                 mechanisms to deal with inter-processor communications.
                 But these mechanisms lack efficiency in fine-grain
                 parallel applications such as systolic computation.
                 This article presents the use of an FPGA chip to set up
                 a fast systolic communication agent on a linear
                 asynchronous network of TRANSPUTER processors; the
                 machine is called ARMEN.",
}

%                            Index Number: 16

@inproceedings{iseli-sanchez:93,
  author    = {C. Iseli and E. Sanchez},
  title     = {Spyder: {A} Reconfigurable {VLIW} Processor using {FPGA}s},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {17--24},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system},
  abstract  = {A processor with multiple reconfigurable execution units
               has been designed and implemented. The reconfigurable
               execution units are implemented using reprogrammable field
               programmable gate array (FPGA) chips. The architecture and
               implementation of this processor are described in detail
               in this paper. An example shows that this reconfigurable
               processor is able to compute the new state of 100'000'000
               cells of Conway's game of life per second with a clock
               speed of 6.25 MHz.},
}

%                            Index Number: 17

@inproceedings{milne-cockshott:93,
  author    = {G. Milne and P. Cockshott and G. McCaskill and P. Barrie},
  title     = {Realising Massively Concurrent Systems on the {SPACE}
               Machine},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {26--32},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system},
  abstract  = {Highly concurrent systems occur frequently in the physical
               world. They include weather systems, traffic systems,
               electrocardiac systems and integrated circuits. To better
               understand such systems requires that they be rigorously
               described and then simulated. How do we best perform this
               description? Since such systems are inherently concurrent
               and do not fit well onto sequential von Neumann
               architectures, what type of machine should be used to
               simulate them? This paper focuses on a class of systems
               characterised as being highly concurrent and which are
               composed out of many simple parts which interact with
               other parts in their locality. It discusses how to
               describe these systems and introduces a cellular automata
               type of architecture which is used to simulate these
               systems directly in hardware, with physical concurrency
               being realised by true hardware concurrency. The
               architecture of the SPACE machine (Scalable Parallel
               Architecture for Concurrency Experiments), which is
               constructed from reconfigurable FPGA logic is introduced
               and it is demonstrated how to simulate road traffic
               systems using it.},
}

%                            Index Number: 18

@inproceedings{ling-amano:93,
  author    = {X. P. Ling and H. Amano},
  title     = {{WASMII}: a Data Driven Computer on a Virtual Hardware},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {33--42},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system,app,rtr},
  abstract  = {Virtual hardware is a technique to realize a large digital
               circuit with a small real hardware by using an extended
               Field Programmable Gate Array (FPGA) technology. Several
               configuration RAM modules are provided inside the FPGA
               chip, and the configuration of the gate array can be
               rapidly changed by replacing the active module. Data for
               configuration are transferred from an off-chip backup RAM
               to an un-used configuration RAM module. A novel
               computation mechanism called the WASMII, which executes a
               target dataflow graph directly, is proposed on the basis
               of the virtual hardware. A WASMII chip consists of the
               FPGA for virtual hardware and the additional mechanism to
               replace configuration RAM modules in the data driven
               manner. Configuration data are preloaded by the order
               which is assigned in advance with a static scheduling
               preprocessor. By connecting a number of WASMII chips, a
               highly parallel system can be easily constructed.},
}

%                            Index Number: 19

@inproceedings{casselman:93,
  author    = {S. Casselman},
  title     = {Virtual Computing and The Virtual Computer},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {43--48},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system},
  abstract  = {Virtual Computing is an entirely new form of
               supercomputing that allows an algorithm to be implemented
               in hardware. Based on the Xilinx FPGA[1] and ICubes
               FPID[2] the Virtual Computer is completely reconfigurable
               in every respect. Computing machines based on
               reconfigurable logic are hyper-scalable meaning they scale
               up better than 1-1.},
}

%                            Index Number: 20

@InProceedings{french-taylor:93,
  author =       {P. C. French and R. W. Taylor},
  title =        {A Self-Reconfiguring Processor},
  key =          {system},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {50--59},
  abstract =     {Recent developments in the design and fabrication of
                 field programmable logic devices (FPGA's) may well
                 change the way in which we design and fabricate
                 conventional microprocessors. The use of uncommitted
                 logic whose function may be modified at run time makes
                 the prospect of dynamic application specific integrated
                 circuits closer to reality than ever before. Much of
                 the work to date on reconfigurable logic has focussed
                 on its application in co-processor and {"}glue{"}
                 roles. This paper discusses how complete processors
                 might be fabricated with a minimum of {"}fixed{"} or
                 static logic. It is shown that in order to exploit
                 FPGAs, a processor that is radically different from
                 conventional architectures is required. The paper
                 concludes by considering what evolutions of current
                 logic families would favour this type of application.},
}

%                            Index Number: 21

@InProceedings{lewis-vanierssel:93,
  author =       {D. M. Lewis and M. H. van Ierssel and D. H. Wong},
  title =        {A Field Programmable Accelerator for Compiled-Code
                 Applications},
  key =          {system},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {60--67},
  abstract =     {This paper describes a special purpose application
                 accelerator using field programmable gate arrays to
                 accelerate a range of applications. The accelerator is
                 designed to support applications by allowing the user
                 to implement a processor with an instruction set
                 designed for the specific application being
                 accelerated, using specialized instructions to
                 implement critical fragments of the application. A
                 compiled-code software organization is used to reduce
                 overhead operations. A prototype has been built, and
                 the first application to be ported to it, logic
                 simulation, is underway.},
}

%                            Index Number: 22

@InProceedings{guccione-gonzales:93,
  author =       {S. A. Guccione and M. J. Gonzalez},
  title =        {A Data-Parallel Programming Model for Reconfigurable
                 Architectures},
  key =          {tool},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {79--87},
  abstract =     {Recently, several machines have been built using Field
                 Programmable Gate Array (FPGA) technology. These
                 reconfigurable architectures have demonstrated very
                 high performance for a variety of problems. The
                 configuration of these machines typically rely on some
                 form of hardware specification. In this paper we
                 demonstrate that a more traditional software approach
                 may be used. A vector based data-parallel model and its
                 mapping to a reconfigurable architecture are
                 introduced. Included in the model are parallel prefix
                 or scan operators. The language supporting this model
                 is a subset of the C programming language.},
}

%                            Index Number: 23

@InProceedings{monaghan-cowen:93,
  author =       "S. Monaghan and C. P. Cowen",
  title =        "Reconfigurable Multi-Bit Processor for {DSP}
                 Applications in Statistical Physics",
  key =          "system, app",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1993",
  month =        apr,
  address =      "Napa, CA",
  pages =        "103--110",
  abstract =     "A PC-AT hosted DSP processor architecture implemented
                 in SRAM-based field programmable gate arrays (FPGA)
                 and static memories is described. Despite its
                 simplicity, the processor circuits can be reconfigured
                 under software control to tackle a class of multi-bit
                 'pixel' processing problems of current interest in the
                 statistical physics of disordered materials, thereby
                 offering some of the problem flexibility of a general
                 purpose processor and the performance of custom
                 hardware. The flexibility offered by the FPGA
                 implementation is discussed in detail as is a
                 particular application of the processor (to disordered
                 superconductors). The performance of the processor is
                 shown to compare well with similarly costing commercial
                 DSP hardware. The low cost of the processor means it
                 can be replicated to obtain dedicated supercomputer
                 performance.",
}

%                            Index Number: 24

@InProceedings{cuccaro-reese:93,
  author =       {S. A. Cuccaro and C. F. Reese},
  title =        {The {CM-2X}: {A} Hybrid {CM-2/Xilinx} Prototype},
  key =          {system, app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {121--130},
  abstract =     {This paper describes the CM-2X prototype. This
                 one-of-a-kind machine is the result of a Supercomputing
                 Research Center/Thinking Machines Corporation joint
                 effort to examine the suitability of a hybrid
                 combination of CM-2 architecture and Xilinx
                 programmable gate array technology. In addition to a
                 description of the CM-2X and Xilinx architecture, a
                 simple applications example is provided that
                 illustrates many of the issues involved in programming
                 the machine.},
}

%                            Index Number: 25

@InProceedings{wood:93,
  author =       {L. F. Wood},
  title =        {High Performance Analysis and Control of Complex
                 Systems Using Dynamically Reconfigurable Silicon and
                 Optical Fiber Memory},
  key =          {system},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {132--141},
  abstract =     {M is a highly parallel asynchronous computer for the
                 analysis and control of complex systems. A complex
                 system is a system with many interacting components.
                 Examples of complex systems include applications in
                 molecular biology, economics, and signal processing. M
                 asynchronous computations reproduce the structural
                 dynamics of a system using high fidelity behavioral
                 modeling. Programs are composed of an application
                 model, an environment model, and a distributed
                 subsumption operation system. Processes are implemented
                 using position independent instructions that operate in
                 parallel on strings of binary data. All M FPGA fine
                 grained parallel processing nodes are double buffered,
                 asynchronous, and highly pipelined. The fiber system
                 memory is optically multiplexed, and asynchronous. The
                 technology will extend new gigabit ATM optical networks
                 with integrated high performance computing services.},
}

%                            Index Number: 26

@InProceedings{babb-tessier:93,
  author =       "J. Babb and R. Tessier and A. Agarwal",
  title =        "Virtual Wires: Overcoming Pin Limitations in
                 {FPGA}-based Logic Emulators",
  key =          "tool",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1993",
  month =        apr,
  address =      "Napa, CA",
  pages =        "142--151",
  abstract =     "Existing FPGA-based logic emulators suffer from
                 limited inter-chip communication bandwidth, resulting
                 in low gate utilization (10-20 percent). This resource
                 imbalance increases the number of chips needed to
                 emulate a particular logic design and thereby decreases
                 emulation speed, since signals must cross more chip
                 boundaries. Current emulators only use a fraction of
                 potential communication bandwidth because they dedicate
                 each FPGA pin (physical wire) to a single emulated
                 signal (logical wire). These logical wires are not
                 active simultaneously and are only switched at
                 emulation clock speeds. Virtual wires overcome pin
                 limitations by intelligently multiplexing each physical
                 wire among multiple logical wires and pipelining these
                 connections at the maximum clocking frequency of the
                 FPGA. A virtual wire represents a connection from a
                 logical output on one FPGA to a logical input on
                 another FPGA. Virtual wires not only increase usable
                 bandwidth, but also relax the absolute limits imposed
                 on gate utilization. The resulting improvement in
                 bandwidth reduces the need for global interconnect,
                 allowing effective use of low dimension inter-chip
                 topologies, coupled with the ability of virtual wires
                 to overlap communication with computation, can even
                 improve emulation speeds. We present the concept of
                 virtual wires and describe our first implementation, a
                 {"}softwire{"} compiler which utilizes static routing
                 and relies on minimal hardware support. Results from
                 compiling netlists for the 18K gate Sparcle
                 microprocessor and the 86K gate Alewife Communications
                 and Cache Controller indicate that virtual wires can
                 increase FPGA gate utilization beyond 80 percent
                 without a significant slowdown in emulation speed.",
}

%                            Index Number: 27

@InProceedings{foulk:93,
  author =       {P. W. Foulk},
  title =        {Data-folding in {SRAM} configurable {FPGA}s},
  key =          {tool, app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {163--171},
  abstract =     {FPGAs which are configured by static RAM can be
                 rapidly changed from one logic configuration to
                 another. This raises the possibility of configuring the
                 logic to implement a function for a specific set of
                 values, i.e. folding the inputs into the logic design.
                 The paper discusses data folding with respect to
                 Algotronix FPGAs, presenting a text searching circuit
                 as an example. This folded circuit saves at least half
                 the logic over a conventional circuit, and very much
                 more if data folding is taken as far as possible. It
                 also presents performance figures for the folded
                 circuit, and discusses other applications, and suggests
                 features which are desirable if data folding is to be
                 practicable, most of which are possessed by the
                 Algotronix CAL array.},
}

%                            Index Number: 28

@InProceedings{hoang:93,
  author =       {D. T. Hoang},
  title =        {Searching Genetic Databases on {Splash} 2},
  key =          {app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {185--191},
  abstract =     {In this paper, we describe two systolic arrays for
                 computing the edit distance between two genetic
                 sequences using a well-known dynamic programming
                 algorithm. The systolic arrays have been implemented
                 for the Splash 2 programmable logic array, and are
                 intended to be used for database searching. Simulations
                 indicate that the faster Splash 2 implementation can
                 search a database at a rate of 12 million characters
                 per second, several orders of magnitude faster than
                 implementations of the dynamic programming algorithm on
                 conventional computers.},
}

%                            Index Number: 29

@InProceedings{luk-lok:93,
  author =       {W. Luk and V. Lok and I. Page},
  title =        {Hardware Acceleration of Divide-and-Conquer Paradigms:
                 a Case Study},
  key =          {tool, app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {192--201},
  abstract =     {We describe a method for speeding up
                 divide-and-conquer algorithms with a hardware
                 coprocessor, using sorting as an example. The method
                 employs a conventional processor for the {"}divide{"}
                 and {"}merge{"} phases, while the {"}conquer{"} phase
                 is handled by a purpose-built coprocessor. It is shown
                 how transformation techniques from the Ruby language
                 can be adopted in developing a family of systolic
                 sorters, and how one of the resulting designs is
                 prototyped in eight FPGAs on a PC coprocessor board
                 known as CHS2x4 from Algotronix. The execution of the
                 hardware unit is embedded in a sorting program, with
                 the PC host merging the sorted sequences from the
                 hardware sorter. The performance of this implementation
                 is compared against various sorting algorithms on a
                 number of PC systems.},
}

%                            Index Number: 30

@InProceedings{daalen-jeavons:93,
  author =       {M. van Daalen and P. Jeavons and J. Shawe-Taylor},
  title =        {A Stochastic Neural Architecture that Exploits
                 Dynamically Reconfigurable {FPGA}s},
  key =          {app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1993},
  month =        apr,
  address =      {Napa, CA},
  pages =        {202--211},
  abstract =     {In this paper we present an expandable digital
                 architecture that provides an efficient real time
                 implementation platform for large neural networks. The
                 architecture makes heavy use of the techniques of bit
                 serial stochastic computing to carry out the large
                 number of required parallel synaptic calculations. In
                 this design all real valued quantities are encoded on
                 to stochastic bit streams in which the '1' density is
                 proportional to the given quantity. The actual digital
                 circuitry is simple and highly regular thus allowing
                 very efficient space usage of fine grained FPGAs.
                 Another feature of the design is that the large number
                 of weights required by a neural network are generated
                 by circuitry tailored to each of their specific values,
                 thus saving valuable cells. Whenever one of these
                 values is required to change, the appropriate circuitry
                 must be dynamically reconfigured. This may always be
                 achieved in a fixed and minimum number of cells for a
                 given bit stream resolution.},
}

%                            Index Number: 31

@InProceedings{chan-schlag:93,
  author =       "P. K. Chan and M. D. F. Schlag",
  title =        "Architectural Tradeoffs in
                 Field-Programmable-Device-Based Computing Systems",
  key =          "tool",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1993",
  month =        apr,
  address =      "Napa, CA",
  pages =        "152--161",
  abstract =     "Reprogrammable Field-Programmable Gate Arrays (FPGAs)
                 have enabled the realization of high performance and
                 affordable reconfigurable computing engines. We examine
                 the architectural tradeoffs involved in designing
                 general purpose FPGA-based computing systems with
                 field-programmable gate arrays and field-programmable
                 interconnects. The fact that FPGAs provide both
                 programmable logic and programmable interconnects
                 raises numerous design issues that need to be
                 considered with care. Factors that influence the
                 tradeoffs are routability, rearrangeability, and
                 speed.",
}

%                            Index Number: 32

@InProceedings{page-luk:91,
  author =       "I. Page and W. Luk",
  title =        "Compiling occam into {FPGA}s",
  key =          "tool, app",
  booktitle =    "FPGAs. International Workshop on Field Programmable
                 Logic and Applications",
  year =         "1991",
  month =        sep,
  address =      "Oxford, UK",
  editor =       "W. R. Moore and W. Luk",
  pages =        "271--283",
  abstract =     "We describe a compiler which maps programs expressed
                 in a subset of Occam into netlist descriptions of
                 parallel hardware. Using Field-Programmable Gate Arrays
                 to implement such netlists, problem-specific hardware
                 can be generated entirely by a software process. Inner
                 loops of time-consuming programs can be implemented as
                 hardware and the less intensively-used parts of the
                 program can be mapped into machine code by a
                 conventional compiler. Software investment is protected
                 since the same program can run entirely in software,
                 entirely in hardware, or in a mixture of both. A single
                 program can thus result in many implementations across
                 a potentially wide cost-performance range. The
                 compilation system has been used to generate
                 inner-loops, hardware interfaces to real-world devices,
                 systolic arrays, and complete microprocessors. In the
                 near future we hope to have a proven version of the
                 compiler, enabling us automatically to generate
                 provably correct hardware implementations, including
                 microprocessors, from higher-level specifications.",
}

%                            Index Number: 33

@InProceedings{champeau-pape:94,
  author =       "J. Champeau and L. Le Pape and B. Pottier and S.
                 Rubini and E. Gautrin and L. Perraudeau",
  title =        "Flexible Parallel {FPGA}-Based Architectures with
                 {ArMen}",
  key =          "system, app",
  booktitle =    "Proceedings of the 27th Hawaii International
                 Conference on System Sciences",
  year =         "1994",
  month =        jan,
  address =      "Wailea, HI",
  editor =       "T. N. Mudge and B. D. Shriver",
  pages =        "105--113",
  abstract =     "ArMen is a parallel machine in which each node is
                 coupled to an FPGA ring. The underlying idea is to
                 complement an MIMD architecture with global
                 coprocessors providing extra control and processing
                 properties. The use of regular hardware patterns such
                 as cellular automata or pipelines allows high level
                 definitions of the coprocessors. The results are fast
                 prototyping possibilities for specific applications
                 such as image processing or industrial control. Basic
                 realizations are described. Changing from an FPGA
                 technology to a VLSI one provides benefits with respect
                 to cost and performance, without any effort at the
                 specification level. The MADMACS pattern generator can
                 be used to fold several FPGA configurations into the
                 same VLSI circuit.",
}

%                            Index Number: 34

@InProceedings{herpel-held:94,
  author =       {H. J. Herpel and M. Held and M. Glesner},
  title =        {A Design Methodology for the Conceptual Design of
                 Application Specific Digital Processors in Mechatronic
                 Systems},
  key =          {tool},
  booktitle =    {Proceedings of the 27th Hawaii International
                 Conference on System Sciences},
  year =         {1994},
  month =        jan,
  address =      {Wailea, HI},
  editor =       {T. N. Mudge and B. D. Shriver},
  pages =        {78--86},
  abstract =     {This paper presents a methodology and a design
                 environment to support validation and design space
                 exploration for embedded systems including application
                 specific digital signal processors prototyping. Our
                 approach to heterogeneous system design is based on
                 rapid prototyping integrated with a set of graphical
                 design entry, synthesis, and analysis tools. System
                 partitioning into a set of software and hardware
                 modules is done at system description level. User
                 guided and automated synthesis tools generate a fully
                 functional prototype that can be connected to real
                 world processes to verify system design and to estimate
                 system performance.},
}

%                            Index Number: 35

@InProceedings{gebotys-gebotys:94,
  author =       "C. H. Gebotys and R. J. Gebotys",
  title =        "Application-Specific Architectures for
                 Field-Programmable {VLSI} Technologies",
  key =          "tool",
  booktitle =    "Proceedings of the 27th Hawaii International
                 Conference on System Sciences",
  year =         "1994",
  month =        jan,
  address =      "Wailea, HI",
  editor =       "T. N. Mudge and B. D. Shriver",
  pages =        "124--130",
  abstract =     "New field-programmable gate array (FPGA) technologies
                 have increased the industrial interest in tools which
                 map a DSP application and a set of performance
                 constraints to a specific VLSI architecture. This paper
                 presents an optimization methodology for mapping a DSP
                 application and a set of performance constraints into
                 an architecture targeted for FPGA technologies with
                 user-programmable RAM blocks on chip. The target
                 architecture supports multiple register files, multiple
                 busses, complex types of functional units, and
                 multichip implementation. The optimization methodology
                 presented in this paper maps DSP applications to
                 optimized register file architectures suitable for
                 FPGAs using a number of different integer programming
                 models. A new integer programming model is presented
                 and used to minimize the number of busses required in
                 the application-specific architectures. Results show
                 that the optimization methodology provides
                 architectures with 22\% fewer bus connections than
                 previous research in practical cpu times. For the first
                 time this research provides industry with 1) a high
                 level design optimization methodology that synthesizes
                 application-specific DSP architectures for
                 implementation in new field programmable VLSI
                 technologies, and 2) a methodology to support fast
                 prototyping of DSP applications using multiple FPGA
                 chips.",
}

%                            Index Number: 36

@InProceedings{eldredge-hutchings:94,
  author =       "J. G. Eldredge and B. L. Hutchings",
  title =        "Density Enhancement of a Neural Network Using {FPGA}s
                 and Run-Time Reconfiguration",
  key =          "app,rtr",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1994",
  month =        apr,
  address =      "Napa, CA",
  pages =        "180--188",
  abstract =     "Run-time reconfiguration is a way of more fully
                 exploiting the flexibility of reconfigurable FPGAs. The
                 Run-Time Reconfiguration Artificial Neural Network
                 (RRANN) uses run-time reconfiguration to increase the
                 hardware density of FPGAs. The RRANN architecture also
                 allows large amounts of parallelism to be used and is
                 very scalable. RRANN divides the back-propagation
                 algorithm into three sequentially executed stages and
                 configures the FPGAs to execute only one stage at a
                 time. The FPGAs are reconfigured as part of normal
                 execution in order to change stages. Using
                 reconfigurability in this way increases the number of
                 hardware neurons a single Xilinx XC3090 can implement
                 by 500\%. Performance is affected by reconfiguration
                 overhead, but this overhead becomes insignificant in
                 large networks. This overhead is made even more
                 insignificant with improved configuration methods.
                 Run-time reconfiguration is a flexible realization of
                 the time/space trade-off. The RRANN architecture has
                 been designed and built using commercially available
                 hardware, and its performance has been measured.",
}

%                            Index Number: 37

@InProceedings{bade-hutchings:94,
  author =       {S. Bade and B. L. Hutchings},
  title =        {{FPGA}-Based Stochastic Neural Networks:
                 Implementation},
  key =          {app},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  year =         {1994},
  month =        apr,
  address =      {Napa, CA},
  pages =        {189--198},
  abstract =     {Reconfigurable Field-Programmable Gate Arrays (FPGAs)
                 provide an effective programmable resource for
                 implementing hardware-based Artificial Neural Networks
                 (ANNs). They are low cost, readily available and
                 reconfigurable--all important advantages for ANN
                 applications. However, FPGAs lack the circuit density
                 necessary to implement large parallel ANNs with many
                 thousands of synapses. This paper presents an
                 architecture that makes it feasible to implement large
                 ANNs with FPGAs. The architecture combines stochastic
                 computation techniques with a novel lookup-table-based
                 architecture that fully exploits the lookup-table
                 structure of many FPGAs. This lookup-table-based
                 architecture is extremely efficient: it is capable of
                 supporting up to two synapses per Configurable Logic
                 Block (CLB). In addition, the architecture is simple to
                 implement, self-contained (weights are stored directly
                 in the synapse), and scales easily across multiple
                 chips.},
}

%                            Index Number: 38

@InProceedings{wirthlin-gilson:94,
  author =       "M. J. Wirthlin and B. L. Hutchings and K. L. Gilson",
  title =        "The {Nano Processor}: {A} Low Resource Reconfigurable
                 Processor",
  key =          "system, app",
  booktitle =    "Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines",
  editor =       "D. A. Buell and K. L. Pocek",
  year =         "1994",
  month =        apr,
  address =      "Napa, CA",
  pages =        "23--30",
  abstract =     "Reconfigurable logic systems approach the performance
                 of Application-Specific Integrated Circuits (ASICs)
                 while retaining much of the generality of conventional
                 computing systems through reconfiguration.
                 Unfortunately, the development of these systems, unlike
                 conventional software systems, is hardware intensive,
                 requiring significant hardware development time. One
                 way to introduce a more flexible development approach
                 is to implement a customizable stored-program
                 processor. For a given application, the designer can
                 develop customized hardware to increase performance and
                 then control the sequencing and operation of this
                 hardware with software. Development time can be
                 significantly reduced because conventional software
                 development tools, e.g., assemblers and compilers, can
                 be used to quickly develop new applications on the
                 customized processor. This paper presents the Nano
                 Processor (nP), a fully customizable reconfigurable
                 processor, together with its integrated assembler, that
                 has been successfully implemented on the Xilinx 3000
                 series Field Programmable Gate Arrays (FPGA).",
}

%                            Index Number: 39

% RRANN conference paper (CICC 1994). The percent sign in the abstract is
% escaped so the field can be typeset without commenting out text.
@InProceedings{eldredge-hutchings:94b,
  author    = {J. G. Eldredge and B. L. Hutchings},
  title     = {{RRANN}: The Run-Time Reconfiguration Artificial
               Neural Network},
  booktitle = {Custom Integrated Circuits Conference},
  pages     = {77--80},
  address   = {San Diego, CA},
  month     = may,
  year      = {1994},
  key       = {app,rtr},
  abstract  = {Run-time reconfiguration is a way of more fully
               exploiting the flexibility of reconfigurable FPGAs. The
               Run-Time Reconfiguration Artificial Neural Network
               (RRANN) uses run-time reconfiguration to increase the
               hardware density of FPGAs. This is done by dividing the
               backpropagation algorithm into three sequentially
               executed stages and configuring the FPGAs to execute
               only one stage at a time. The FPGAs are reconfigured as
               part of normal execution in order to change stages.
               Using reconfigurability in this way increases the
               number of hardware neurons a single FPGA can implement
               by 500\%. The RRANN architecture has been designed and
               built using commercially available hardware, and its
               performance has been measured.},
}

%                            Index Number: 40

% RRANN backpropagation paper; no page numbers are recorded for this entry.
@InProceedings{eldredge-hutchings:94c,
  author    = {J. G. Eldredge and B. L. Hutchings},
  title     = {{RRANN}: {A} Hardware Implementation of the
               Backpropagation Algorithm Using Reconfigurable
               {FPGA}s},
  booktitle = {IEEE World Conference on Computational Intelligence},
  address   = {Orlando, FL},
  month     = jun,
  year      = {1994},
  key       = {app,rtr},
}

%                            Index Number: 41

% Labyrinth RAM-based reconfigurable logic array (CICC 1990).
@InProceedings{furtek-stone:90,
  author    = {F. Furtek and G. Stone and I. Jones},
  title     = {Labyrinth: {A} Homogeneous Computational Medium},
  booktitle = {Proceedings of the IEEE Custom Integrated Circuits
               Conference},
  pages     = {31.1.1--31.1.4},
  month     = may,
  year      = {1990},
  key       = {device},
  abstract  = {As a RAM-based reconfigurable logic array, Labyrinth
               provides the flexibility and malleability of software
               with the performance of a dedicated circuit. With a
               single bit register, and a half adder per cell, the
               architecture is optimized for register intensive,
               massively parallel algorithms. The fine-grained,
               highly-symmetric architecture scales very naturally and
               facilitates compact circuit layouts. A 64-cell test
               chip has been successfully built and tested, and a
               4,096-cell chip is in the final stages of preparation
               for fabrication.},
}

%                            Index Number: 42

% Book chapter reference (chapter 1 of the Kluwer FPGA text).
@InBook{brown-francis:92,
  author    = {S. D. Brown and R. J. Francis and J. Rose and Z.
               Vranesic},
  title     = {Field-Programmable Gate Arrays},
  chapter   = {1},
  publisher = {Kluwer Academic Publishers},
  year      = {1992},
}

%                            Index Number: 43

% GANGLION connectionist classifier built from FPGAs (JSSC, March 1992).
@Article{cox-blanz:92,
  author    = {C. E. Cox and W. E. Blanz},
  title     = {{GANGLION} --- a fast field-programmable gate array
               implementation of a connectionist classifier},
  journal   = {IEEE Journal of Solid-State Circuits},
  volume    = {27},
  number    = {3},
  pages     = {288--299},
  month     = mar,
  year      = {1992},
  key       = {system},
  abstract  = {The architecture, implementation, and application of
               GANGLION, a totally digital connectionist classifier,
               are described. This fully interconnected feedforward
               net with one hidden layer is capable of generating 4.48
               billion interconnection/s. The architecture is realized
               on a single 9U VME card and is built entirely from
               off-the-shelf components. The very high throughput of
               20 million decision/s is achieved by making efficient
               use of field-programmable gate arrays. Specifically,
               the authors take advantage of the reprogrammability of
               the devices to automatically generate new custom
               hardware for each application of the classifier.},
}

%                            Index Number: 44

% Study of FPGA logic-block functionality versus area (JSSC, October 1990).
% Journal name is spelled as in the other JSSC entry of this file.
@Article{rose-francis:1990,
  author    = {J. Rose and R. Francis and D. Lewis and P. Chow},
  title     = {Architecture of Field-Programmable Gate Arrays: The
               Effect of Logic Block Functionality on Area
               Efficiency},
  journal   = {IEEE Journal of Solid-State Circuits},
  volume    = {25},
  number    = {5},
  pages     = {1217--1225},
  month     = oct,
  year      = {1990},
  key       = {device},
  abstract  = {This paper examines the relationship between the
               functionality of a field-programmable gate array (FPGA)
               logic block and the area required to implement digital
               circuits using that logic block. This investigation is
               done experimentally by implementing a set of industrial
               circuits as FPGA's using CAD tools for technology
               mapping, placement, and routing. Using a simple model
               of the interconnection and logic block area, a range of
               programming technologies (the method of FPGA
               customization) is explored. The experiments are based
               on logic blocks that use lookup tables for implementing
               combinational logic. Results indicate that the best
               number of inputs to use (a measure of the block's
               functionality) is between three and four, and that a D
               flip-flop should be included in the logic block. These
               results are largely independent of the programming
               technology. More generally, it was observed that the
               area efficiency of a logic block depends not only on
               its functionality but on the average number of pins
               connected per logic block. It is shown that as the
               number of connected pins per block increases, the
               number of wiring tracks required to route those blocks
               also increases. Since adding functionality to a block
               will lead to an increase in the number of connected
               pins, it follows that an increase in functionality of
               the block is only beneficial if the total number of
               blocks is reduced to more than compensate for the
               increased wiring area. This notion leads to the
               conclusion that the most area-efficient logic blocks
               are those with a high amount of functionality per
               pin.},
}

%                            Index Number: 45

% Neural network implementation on FPGA-based machines using a vector based
% data parallel model. The entry marker is required for BibTeX to read it.
@InProceedings{guccione-gonzales:93b,
  author    = {S. A. Guccione and M. J. Gonzalez},
  title     = {A neural network implementation using reconfigurable
               architectures},
  booktitle = {More {FPGAs}: Proceedings of the 1993 International
               workshop on field-programmable logic and applications},
  editor    = {W. Moore and W. Luk},
  pages     = {443--451},
  address   = {Oxford, England},
  month     = sep,
  year      = {1993},
  key       = {app},
  abstract  = {Several architectures based on Field Programmable
               Gate Arrays (FPGAs) have recently been
               introduced. These machines have demonstrated a high
               level of performance for a variety of
               problems. Despite this success, software development
               on these systems is generally limited to hardware
               description languages. One programming model that
               has been proposed for use with reconfigurable
               architectures is the vector based data parallel
               model. This paper describes the implementation of a
               multi-layer feed-forward neural network using a
               vector based data parallel approach. The algorithm
               is described using a subset of the C programming
               language. This description is translated into a
               circuit which may be programmed into the FPGA based
               processor.},
}

% Note on index 45 above: date of publication: 1994

%                            Index Number: 46

% Parallel back-propagation on the FPGA-based Reconfigurable Machine (RM).
% The volume number is kept in its own field rather than inside pages.
@InProceedings{erdogan-hong:93,
  author    = {S. S. Erdogan and T. H. Hong},
  title     = {Massively Parallel back-propagation algorithm using
               the reconfigurable machine},
  booktitle = {World Congress on Neural Networks '93},
  volume    = {4},
  pages     = {861--864},
  address   = {Portland, Oregon},
  year      = {1993},
  key       = {app},
  abstract  = {The potential of Artificial Neural Networks (ANNs) can
               be realized with successful mapping of these algorithms
               to massively parallel architectures which can optimize
               their intensive computational requirements. The
               Reconfigurable Machine (RM) is a parallel architecture
               which is built using Xilinx's 4005 Field Programmable
               Gate Array (FPGA) chips. Various popular neural models
               are currently being described using VHSIC Hardware
               Description Language (VHDL) to be mapped onto the RM.
               The logic synthesis and optimization tools for VHDL
               allow automatic generation of the target architecture
               for RM. In this paper, a fully parallel implementation
               of a fully connected three-layer Back-Propagation (BP)
               is studied. The mapping encompasses both the forward
               and backward passes. A novel approach based on weight
               duplication during learning allows a fully parallel
               implementation.},
}

%                            Index Number: 47

%                            Index Number: 48

% PRISM-II architecture and compiler (FCCM 1993).
@InProceedings{wazlowski-agarwal:93,
  author    = {M. Wazlowski and L. Agarwal and T. Lee and A. Smith
               and E. Lam and P. Athanas and H. Silverman and S.
               Ghosh},
  title     = {{PRISM-II} Compiler and Architecture},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {9--16},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
  key       = {system, tool},
  abstract  = {This paper discusses the architecture and compiler for
               a general-purpose metamorphic computing platform called
               PRISM-II. PRISM-II improves the performance of many
               computationally-intensive tasks by augmenting the
               functionality of the core processor with new
               instructions that match the characteristics of targeted
               applications. In essence, PRISM is a general purpose
               hardware platform that behaves like an
               application-specific platform. Two methods for hardware
               synthesis, one using the VHDL Designer and the other
               using X-BLOX, are presented and synthesis results are
               compared.},
}

%                            Index Number: 49

% PRISM-II asynchronous execution model (FCCM 1994).
@InProceedings{agarwal-wazlowski:94,
  author    = {L. Agarwal and M. Wazlowski and S. Ghosh},
  title     = {An asynchronous approach to efficient execution of
               programs on adaptive architectures utilizing {FPGA}s},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {101--110},
  address   = {Napa, CA},
  month     = apr,
  year      = {1994},
  key       = {system, tool},
  abstract  = {PRISM, a computer architecture consisting of a
               general-purpose core processor and a reconfigurable
               FPGA platform, was designed to bridge the gap between
               general-purpose and specialized computers. The
               proof-of-concept system, PRISM-I, suffers from several
               limitations, principal among them being: single
               bus-cycle restriction on the evaluation time of the
               function synthesized on an FPGA, inability to execute
               loops with dynamic loop-counts, and inefficient
               execution of control constructs such as
               ``if-then-else''. This paper presents a novel execution
               model in PRISM-II, that addresses the above limitations
               in a general manner. Also presented is a new framework
               for translating a C function into an FPGA-based custom
               architecture.},
}

%                            Index Number: 50

% Genetic sequence comparison on the SPLASH programmable logic array.
@InProceedings{lopresti:91,
  author    = {D. P. Lopresti},
  title     = {Rapid implementation of a genetic sequence comparator
               using field-programmable gate arrays},
  booktitle = {Advanced Research in {VLSI}: Proceedings of the 1991
               University of California/Santa Cruz Conference},
  editor    = {C. Sequin},
  pages     = {138--152},
  address   = {Santa Cruz, CA},
  month     = mar,
  year      = {1991},
  key       = {app},
  abstract  = {This paper describes the implementation of a parallel
               algorithm for sequence comparison on the SPLASH
               programmable logic array. The algorithm, originally
               developed for a custom VLSI chip, has applications in
               molecular genetics and runs faster on SPLASH than it
               does on supercomputers. I discuss details of the
               problem and its systolic solution, the SPLASH
               architecture and design environment, and the
               implementations currently running on SPLASH.},
}

%                            Index Number: 51

% Survey of applications exploiting in-system FPGA reconfigurability.
@InProceedings{fawcett:93,
  author    = {B. K. Fawcett},
  title     = {Applications of Reconfigurable Logic},
  booktitle = {More {FPGAs}: Proceedings of the 1993 International
               workshop on field-programmable logic and applications},
  editor    = {W. Moore and W. Luk},
  pages     = {57--69},
  address   = {Oxford, England},
  month     = sep,
  year      = {1993},
  key       = {app},
  abstract  = {Logic implemented in an SRAM-based FPGA is
               reconfigurable; that is, changes can be made to the
               system's logic functions by reprogramming the FPGA(s)
               in the system. Examples are cited of systems that make
               use of this in-system reconfigurability. These
               applications can be divided into three main categories
               based on how the FPGA's reconfigurability is applied:
               systems with built-in diagnostics, adaptable system
               designs, and systems with multi-purpose hardware.},
}

%                            Index Number: 52

% Whole-proceedings entry for the 1991 FPL workshop.
@Proceedings{fpgas:91,
  title     = {{FPGAs}: Proceedings of the 1991 International
               workshop on field-programmable logic and applications},
  editor    = {W. Moore and W. Luk},
  publisher = {Abingdon EE and CS Books},
  address   = {Oxford, England},
  month     = sep,
  year      = {1991},
}

%                            Index Number: 53

% Whole-proceedings entry for the 1993 FPL workshop ("More FPGAs").
@Proceedings{fpgas:93,
  title     = {More {FPGAs}: Proceedings of the 1993 International
               workshop on field-programmable logic and applications},
  editor    = {W. Moore and W. Luk},
  publisher = {Abingdon EE and CS Books},
  address   = {Oxford, England},
  month     = sep,
  year      = {1993},
}

% Note on index 53 above: year of publication: 1994

%                            Index Number: 54

% Whole-proceedings entry for the 1992 FPL workshop (Vienna).
@Proceedings{fpgas:92,
  title     = {{FPGAs}: Proceedings of the 1992 International
               workshop on field-programmable logic and applications},
  editor    = {H. Gr{\"u}nbacher and R. Hartenstein},
  publisher = {Springer-Verlag},
  address   = {Vienna, Austria},
  month     = sep,
  year      = {1992},
}

%                            Index Number: 55

% Whole-proceedings entry for the 1993 FCCM workshop.
@Proceedings{fccm:93,
  title     = {Proceedings of {IEEE} Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  address   = {Napa, CA},
  month     = apr,
  year      = {1993},
}

%                            Index Number: 56

% Whole-proceedings entry for the 1994 FCCM workshop.
@Proceedings{fccm:94,
  title     = {Proceedings of {IEEE} Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  address   = {Napa, CA},
  month     = apr,
  year      = {1994},
}

%                            Index Number: 57

% Dynamic (partial, while-active) reconfiguration of SRAM-based FPGAs.
@InProceedings{lysaght-dunlop:93,
  author    = {P. Lysaght and J. Dunlop},
  title     = {Dynamic Reconfiguration of {FPGA}s},
  booktitle = {More {FPGAs}: Proceedings of the 1993 International
               workshop on field-programmable logic and applications},
  editor    = {W. Moore and W. Luk},
  pages     = {82--94},
  address   = {Oxford, England},
  month     = sep,
  year      = {1993},
  key       = {system, app, rtr},
  abstract  = {This paper considers the dynamic reconfiguration of
               those cellular Field Programmable Gate Arrays (FPGAs)
               that employ static memory to store their device
               configuration data. A FPGA is classified as
               dynamically reconfigurable if it can be partially
               reconfigured while active. The circuits on the device
               that are not included in the selective reconfiguration
               must continue to operate without interruption.
               Dynamically reconfigurable FPGAs form a new class of
               logic which suggests new methods of digital system
               synthesis and realisation with the potential for
               significant advantages relative to current systems.
               This paper investigates the importance of dynamic
               reconfiguration and introduces self-controlling,
               dynamically reconfigurable systems and the concept of
               Logic Caching.},
}

%                            Index Number: 58

% Position paper on microprocessors with integrated reconfigurable (DPGA)
% logic arrays (FCCM 1994).
@InProceedings{dehon:94,
  author    = {A. DeHon},
  title     = {{DPGA}-Coupled Microprocessors: Commodity {IC}s for
               the Early 21st Century},
  booktitle = {Proceedings of IEEE Workshop on {FPGA}s for Custom
               Computing Machines},
  editor    = {D. A. Buell and K. L. Pocek},
  pages     = {31--39},
  address   = {Napa, CA},
  month     = apr,
  year      = {1994},
  key       = {device, system},
  abstract  = {During the past decade the microprocessor has become a
               key commodity component for building all kinds of
               computational systems. During this time frame large,
               reconfigurable logic arrays have exploited the same
               advances in IC fabrication technology to emerge as
               viable system building blocks. Looking at both the
               technology prospects and application requirements,
               there is compelling evidence that microprocessors with
               integrated reconfigurable logic arrays will be a
               primary building block for future computing systems. In
               this paper, we look at the role such components can
               play in building high-performance and economical
               systems, as well as the ripe technological outlook. We
               note how the tight integration of reconfigurable logic
               into the processor can overcome some of the major
               limitations of contemporary, attached reconfigurable
               compute engines. We specifically consider the use of
               integrated Dynamically Programmable Gate Array
               structures for the configurable logic and examine the
               advantages rapid reconfiguration provides in this
               application.},
}

%                            Index Number: 59

% SRAM-programmed (reprogrammable) FPGA description and applications.
@Article{trimberger:93,
  author    = {S. Trimberger},
  title     = {A Reprogrammable Gate Array and Applications},
  journal   = {Proceedings of the IEEE},
  volume    = {81},
  number    = {7},
  pages     = {1030--1041},
  month     = jul,
  year      = {1993},
  key       = {device},
  abstract  = {A field programmable gate array (FPGA) can implement
               thousands of gates of logic, has no up-front fixed
               costs, and can be programmed in a few minutes by users
               at their site. This paper describes an FPGA that is
               programmed by writing into on-chip static memory. This
               kind of FPGA can be reprogrammed any number of times,
               providing a versatile platform for rapid hardware
               implementation. Reprogrammable technology allows
               software-like design methodologies to be applied to
               logic design. This paper describes the construction of
               this kind of FPGA, design tradeoffs, and examples of
               applications that take advantage of reprogrammability.},
}

%                            Index Number: 60

% Optically interconnected 3-D FPGA proposal (FPL 1994).
% Names use "Last, First" form so that the capitalised "Van" particles stay
% part of the surname when BibTeX parses them.
@InProceedings{depreitere-neefs:94,
  author    = {Depreitere, J. and Neefs, H. and Van Marck, H. and
               Van Campenhout, J. and Baets, R. and Dhoedt, B. and
               Thienpont, H. and Veretennicoff, I.},
  title     = {An Optoelectronic {3-D} Field Programmable Gate
               Array},
  booktitle = {Field-Programmable Logic: Architectures, Synthesis and
               Applications. 4th International Workshop on
               Field-Programmable Logic and Applications},
  editor    = {R. Hartenstein and M. Z. Servit},
  pages     = {352--360},
  publisher = {Springer-Verlag},
  address   = {Prague, Czech Republic},
  month     = sep,
  year      = {1994},
  key       = {device},
  abstract  = {Traditional Field-Programmable Gate Arrays suffer from
               a lack of routing resources when implementing complex
               logic designs. This paper proposes two possible
               improvements to the FPGA structure that could alleviate
               these problems. We suggest extending the FPGA class to
               3-D architectures. The 3-D architectures could be
               constructed of a stack of optically interconnected 2-D
               planes. Furthermore, we suggest a hierarchical
               distribution of routing resources that closely matches
               the wire length distributions of the intended class of
               applications.},
}

%                            Index Number: 61

% Study of FPGA interconnect flexibility versus routability (JSSC, 1991).
% Journal name is spelled as in the other JSSC entry of this file.
@Article{rose-brown:1991,
  author    = {J. Rose and S. Brown},
  title     = {Flexibility of Interconnection Structures for
               Field-Programmable Gate Arrays},
  journal   = {IEEE Journal of Solid-State Circuits},
  volume    = {26},
  number    = {3},
  pages     = {277--282},
  month     = mar,
  year      = {1991},
  key       = {device},
  abstract  = {This paper explores the relationship between the
               routability of a field-programmable gate array (FPGA)
               and the flexibility of its interconnection structures.
               The flexibility of an FPGA is determined by the number
               and distribution of switches used in the
               interconnection. While good routability can be obtained
               with a high flexibility, a large number of switches
               will result in poor performance and logic density
               because each switch has significant delay and area. The
               minimum number of switches required to achieve good
               routability is determined by implementing several
               industrial circuits in a variety of interconnection
               architectures. These experiments indicate that high
               flexibility is essential for the connection block that
               joins the logic blocks to the routing channel, but a
               relatively low flexibility is sufficient for switch
               blocks at the junction of horizontal and vertical
               channels. Furthermore, it is necessary to use only a
               few more routing tracks than the absolute minimum
               possible with structures of surprisingly low
               flexibility.},
}

%                            Index Number: 62

% BYU master's thesis on run-time reconfiguration for a neural network.
@MastersThesis{eldredge:93,
  author  = {J. G. Eldredge},
  title   = {{FPGA} Density enhancement of a neural network through
             run-time reconfiguration},
  school  = {Brigham Young University},
  address = {Provo, UT},
  month   = dec,
  year    = {1993},
  key     = {app,rtr},
}

%                            Index Number: 63

% FPGA-based reconfigurable preprocessor for avionic signal processing
% (NAECON 1993).
@InProceedings{lazarus-meyer:93,
  author    = {R. B. Lazarus and F. M. Meyer},
  title     = {Realization of a Dynamically Reconfigurable
               Preprocessor},
  booktitle = {Proceedings of the IEEE 1993 National Aerospace and
               Electronics Conference. {NAECON} 1993},
  pages     = {74--80},
  address   = {Dayton, OH},
  month     = aug,
  year      = {1993},
  key       = {system, app},
  abstract  = {Recent advances in configurable logic technology
               provide sufficient processing density and bandwidth to
               directly implement image and signal processing
               algorithms in digital hardware. Our research
               demonstrates the feasibility of employing field
               programmable gate arrays (FPGAs) to realize high-speed
               algorithm-specific processing architectures for
               avionic signal processing applications. Architectures
               composed of FPGAs provide a low-cost and flexible
               alternative to custom hard-wired preprocessors and a
               lower-cost, physically smaller alternative to massively
               parallel processors (both SIMD and MIMD Machines).
               Algorithm segments which require processing hundreds of
               millions of operations per second have been mapped into
               a single FPGA device. This technology may ultimately
               fill a range of processing requirements in the areas of
               radar and communication processing as well as image
               enhancement applications. The application of
               configurable logic devices allows realization of
               processing architectures to efficiently compute
               low-level algorithmic functions, or segments.
               Reconfiguration of FPGAs to implement several algorithm
               segments is analogous to selecting subroutines to form
               a software algorithm suite in a conventional processor,
               since it can be accomplished without hardware
               modification. Specific architecture configurations
               corresponding to algorithm segments can be chosen from
               a library and immediately configured in hardware to
               realize the same algorithm suite that could be realized
               in software, but with greatly enhanced processing
               performance (typically two orders of magnitude). For
               example, the processing architecture can be
               reconfigured to realize an algorithm segment with a 5X5
               filter window instead of 3X3 window, or replace a
               median filter segment with a morphological filter
               segment.},
}

% TODO (index 63): is month = aug the right date?  CONF LOCATION:  Dayton, OH, USA; 24-28 May 1993
% TODO (index 63): is "Dayton, OH" the right address?  PUBLISHER:  IEEE; New York, NY, USA

%                            Index Number: 64

% Journal article on reconfigurable lookup-table sizes in FPGA logic blocks.
% Typed as an article because the venue is an IEEE Transactions journal.
@Article{hill-woo:93,
  author    = {D. Hill and N.-S. Woo},
  title     = {The Benefits of Flexibility in Lookup Table-Based
               {FPGA}s},
  journal   = {IEEE Transactions on Computer-Aided Design of
               Integrated Circuits and Systems},
  volume    = {12},
  number    = {2},
  pages     = {349--353},
  month     = feb,
  year      = {1993},
  key       = {device},
  abstract  = {FPGAs need not be limited to a single fixed-size truth
               table in each block. This paper discusses the utility
               of allowing each block's single large table (e.g., one
               5-input, 32-bit table) to be reconfigured into smaller
               tables (e.g., eight 4-bit tables). Results describing
               the efficiency of packing some standard benchmark
               circuits into various configurations are presented and
               the cost/benefits discussed. We show that a logic block
               containing four lookup tables, each of which is 8-bit
               RAM, is the best choice if only the area efficiency is
               considered. We also show that if circuit speed is
               considered, a logic block, containing two lookup
               tables, each of which contains 16 bits of RAM, is the
               best choice.},
}

%                            Index Number: 65

% Dynamically reconfigurable data path processor (IEEE ASIC Conference 1991).
% NOTE(review): pages are read as paper P18-4, pages 1 to 4; confirm against
% the proceedings.
@InProceedings{maki-whitaker:91,
  author    = {G. Maki and S. Whitaker and G. Ganesh},
  title     = {A Reconfigurable Data Path Processor},
  booktitle = {Proceedings of the Fourth Annual IEEE International
               ASIC Conference and Exhibit},
  pages     = {P18-4.1--4.4},
  address   = {Rochester, NY},
  month     = sep,
  year      = {1991},
  key       = {system, app},
  abstract  = {A configurable data path processor is presented which
               can be modified to optimize performance. FPGA, PLA and
               PAL devices provide a great amount of flexibility to
               realize arbitrary control functions. The new processor
               is specifically designed for arbitrary data path
               operations and can be dynamically reconfigured.},
}

%                            Index Number: 66

% TUTCA configurable logic cell array (IEEE ASIC Conference 1991).
% NOTE(review): pages are read as paper P3-3, pages 1 to 4; confirm against
% the proceedings.
@InProceedings{korpiharju-viitanen:91,
  author    = {T. Korpiharju and J. Viitanen and H. Kiminkinen and J.
               Takala and K. Kaski},
  title     = {{TUTCA} Configurable Logic Cell Array Architecture},
  booktitle = {Proceedings of the Fourth Annual IEEE International
               ASIC Conference and Exhibit},
  pages     = {P3-3.1--3.4},
  address   = {Rochester, NY},
  month     = sep,
  year      = {1991},
  key       = {device},
  abstract  = {A processor array architecture based on dynamically
               configurable logic cell array is designed to contain an
               8X8 array of processing units. This array is expandable
               to construct larger arrays by combining chips together
               in a matrix. The configuration data for the processing
               units is loaded parallel into an internal configuration
               RAM to enable quick reconfiguration for a new task.},
}

%                            Index Number: 67

@inproceedings{luk:94,
  author =       {W. Luk and T. Wu and I. Page},
  title =        {Hardware-Software Codesign of Multidimensional
                 Programs},
  booktitle =    {Proceedings of IEEE Workshop on {FPGA}s for Custom
                 Computing Machines},
  editor =       {D. A. Buell and K. L. Pocek},
  pages =        {82--90},
  month =        apr,
  year =         {1994},
  address =      {Napa, CA},
  key =          {tool, app},
  abstract =     {We present a method for parametrised partitioning of
                 multidimensional programs for acceleration using a
                 hardware coprocessor. The method involves a
                 divide-and-conquer structure, with the `divide' and
                 `merge' phases carried out by a general-purpose
                 processor while the `conquer' phase is handled by
                 application-specific-hardware. The partitioning
                 strategy has been captured in a simple functional
                 language, and we have automated the production of
                 partitioned programs in this language. Our approach has
                 been tested on an FPGA-based system using a number of
                 computer vision algorithms, including the Canny edge
                 detector, and the performance is compared against
                 executing the programs on the PC host.},
}

%                            Index Number: 68

@InProceedings{bolotski-dehon:94,
  author =       "M. Bolotski and A. DeHon and Knight, Jr., T. F.",
  title =        "Unifying {FPGA}s and {SIMD} Arrays",
  key =          "device, system",
  booktitle =    "FPGA '94 -- 2nd International ACM/SIGDA Workshop on
                 FPGAs",
  year =         "1994",
  address =      "Berkeley, CA",
  month =        mar,
  pages =        "1--10",
  abstract =     "Field-Programmable Gate Arrays (FPGAs) and
                 Single-Instruction Multiple-Data (SIMD) processing
                 arrays share many architectural features. In both
                 architectures, an array of simple, fine-grained logic
                 elements is employed to provide high-speed
                 customizable, bit-wise computation. In this paper, we
                 present a unified computational array model which
                 encompasses both FPGAs and SIMD arrays. Within this
                 framework, we examine the differences and similarities
                 between these array structures and touch upon
                 techniques and lessons which can be transferred between
                 the architectures. The unified model also exposes
                 promising prospects for hybrid array architectures. We
                 introduce the Dynamically Programmable Gate Array which
                 combines the best features from FPGAs and SIMD arrays
                 into a single array architecture.",
}

%                            Index Number: 69

@inproceedings{barros-akil:92,
  author =       {M. Alves De Barros and M. Akil},
  title =        {Study and implementation of a real time 3*3
                 programmable convolver with reconfigurable technology},
  booktitle =    {Euro ASIC '92},
  pages =        {392--395},
  year =         {1992},
  publisher =    {IEEE Computer Society Press},
  key =          {app},
  abstract =     {The authors describe problems concerning the
                 implementation of 2D convolution algorithms using
                 reconfigurable technology. An approach for the
                 automatic design of specific architectures in this
                 technology is discussed. The Xilinx programmable gate
                 array (PGA) resources are presented. The authors
                 consider specially their time and area limits. They
                 present an implementation of a real time 3*3
                 programmable convolver with Xilinx XC 3090 PGA.},
}

%                            Index Number: 70

@InProceedings{hauck-borriello:92,
  author =       "S. Hauck and G. Borriello and S. Burns and C. Ebeling",
  title =        "{MONTAGE}: An {FPGA} for Synchronous and Asynchronous
                 Circuits",
  key =          "device",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "44--51",
  abstract =     "Field-programmable gate arrays are frequently used to
                 implement system interfaces and glue logic. However,
                 there has been little attention given to the special
                 problems of these types of circuits in FPGA
                 architectures. In this paper we describe Montage, a
                 Triptych-based FPGA designed for implementing
                 asynchronous logic and interfacing separately-clocked
                 synchronous circuits. Asynchronous circuits have
                 different requirements than synchronous circuits, which
                 make standard FPGAs unusable for asynchronous
                 applications. At the same time, many asynchronous
                 design methodologies allow components with greatly
                 different performance to be substituted for one
                 another, making a design environment which migrates
                 between FPGA, MPGA, and semi-custom implementations
                 very attractive. Similar problems also exist for
                 interfacing separately-clocked synchronous circuits. We
                 discuss these problems, and demonstrate how the Montage
                 FPGA satisfies the demands of these classes of
                 circuits.",
}

% The year above is the conference year; the proceedings were published in 1993.
% Conference location: Vienna, Austria; publisher location: Berlin, Germany.
% Conference dates: 31 Aug.-2 Sept. 1992.

%                            Index Number: 71

@InProceedings{wu-perkowski:92,
  author =       "L.-F. Wu and M. A. Perkowski",
  title =        "Minimization of Permuted Reed-Muller Trees for
                 Cellular Logic Programmable Gate Arrays",
  key =          "tool",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "78--87",
  abstract =     "The new family of Field Programmable Gate Arrays
                 CLI6000 from Concurrent Logic Inc realizes the truly
                 Cellular Logic. It has been mainly designed for the
                 realization of data path architectures. However,
                 introduced by it new universal logic cell calls also
                 for new logic synthesis methods based on regularity of
                 connections. In this paper we present two programs,
                 exact and approximate, for the minimization of Permuted
                 Reed-Muller Trees that are obtained by repetitive
                 application of Davio expansions (Shannon expansions for
                 EXOR gates) in all possible orders of variable in
                 subtrees. Such trees are particularly well matched to
                 both the realization of logic cell and connection
                 structure of the CLI6000 device. It is shown on several
                 standard benchmarks that the heuristic algorithm gives
                 good quality results in much less time than the exact
                 algorithm.",
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 72

@InProceedings{hill-britton:92,
  author =       "D. Hill and B. Britton and B. Oswald and N.-S. Woo and
                 S. Singh and C.-T. Chen and B. Krambeck",
  title =        "{ORCA}: {A} New Architecture for High-Performance
                 {FPGA}s",
  key =          "device",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "52--60",
  abstract =     "AT\&T's ORCA (Optimized Reconfigurable Cell Array)
                 architecture extends FPGA applicability into a larger
                 domain than is possible with today's parts, including
                 datapath intensive designs such as memory controllers,
                 signal processing parts, and telecommunication
                 interfaces. Key to the suitability of the ORCA for
                 these jobs is the fact that each of its basic blocks is
                 capable of processing four bits. So, for example, a 16
                 bit adder requires exactly 4 blocks, not 9 or 16 as in
                 other architectures. Yet the total complexity of each
                 block is comparable to other current parts, thus
                 yielding a significant improvement in functional
                 density.",
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 73

@InProceedings{kempa-jung:92,
  author =       "G. Kempa and P. Jung",
  title =        "{FPGA} Based Logic Synthesis of Squarers Using
                 {VHDL}",
  key =          "tool, app",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "112--123",
  abstract =     "In this paper, the design of VHDL coded squarers by
                 using logic synthesis is considered. The square
                 function is important for the digital processing of
                 signals using e.g. matched filters and Viterbi
                 equalizers in receivers for communication systems.
                 However, many arithmetical functions like the square
                 function are not supported by VHDL. Hence, two major
                 drawbacks arise in the logic synthesis of VHDL code.
                 Firstly, the designers are forced to implement the
                 needed arithmetical functions in VHDL by themselves.
                 Secondly, when implementing arithmetical functions such
                 as the square function in VHDL, special care must be
                 taken in order to circumvent massive hardware overhead
                 of the synthesis results compared with manually
                 designed architectures. In the case of the square
                 function, this massive hardware overhead mainly stems
                 from the fact that the synthesis results of squarers
                 are as hardware expensive as the synthesis results of
                 multipliers. In the course of the present paper the
                 authors shall demonstrate how this hardware overhead of
                 squarers can be reduced by using a modified square
                 algorithm (MSA) which was developed by the authors. The
                 MSA was derived based on the Dadda algorithm which will
                 be discussed briefly.",
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 74

@inproceedings{heeb-pfister:92,
  author =       {B. Heeb and C. Pfister},
  title =        {Chameleon: {A} Workstation of a Different Colour},
  booktitle =    {Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications},
  pages =        {152--161},
  month =        aug,
  year =         {1992},
  address =      {Vienna, Austria},
  key =          {system, app},
  abstract =     {Chameleon is an experimental workstation based on a
                 RISC processor. It provides unprecedented flexibility
                 and speed for certain applications due to the use of
                 RAM-configurable Field Programmable Gate Arrays
                 (FPGAs). FPGAs are used to replace glue logic as well
                 as to provide a non-dedicated computation resource.
                 This resource can be regarded as a general purpose
                 coprocessor which can be reconfigured and thus
                 transformed into a special purpose coprocessor in
                 milliseconds at run-time. The coprocessor can be used
                 both for handling complex input/output functions as
                 well as to replace time critical inner loops of user
                 programs running on the central processing unit.
                 Chameleon radically relies on FPGAs for all
                 input/output functions. It serves as a means to probe
                 the limits of FPGA usage while at the same time being
                 the development system for its own FPGA circuits.},
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 75

@InProceedings{brunvand:92,
  author =       "E. Brunvand",
  title =        "Using {FPGA}s to Prototype a Self-Timed Computer",
  key =          "app",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "192--198",
  abstract =     "The NSR (non-synchronous RISC) architecture is an
                 architecture for a general purpose processor structured
                 as a collection of self-timed blocks that operate
                 concurrently and communicate over bundled data channels
                 in the style of micropipelines. A 16-bit version of
                 the NSR architecture has been implemented using Actel
                 field programmable gate arrays (FPGAs). Each of the
                 major components of the NSR is implemented using one or
                 two Actel FPGA chips using a library of self-timed
                 circuit modules. This prototype implementation is being
                 used to gain experience with the NSR architecture and
                 to gather statistics about the architectural choices.
                 The Actel FPGAs have proven to be extremely useful in
                 quickly prototyping this novel computer architecture.",
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992


%                            Index Number: 76

@inproceedings{shaw-milne:92,
  author =       {P. Shaw and G. Milne},
  title =        {A Highly Parallel {FPGA}-Based Machine and its Formal
                 Verification},
  booktitle =    {Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications},
  pages =        {162--173},
  month =        aug,
  year =         {1992},
  address =      {Vienna, Austria},
  key =          {system, app},
  abstract =     {The SPACE machine is introduced as a new type of
                 computer architecture, capable of very fast simulation
                 of highly concurrent systems. The machine is designed
                 to be scalable, constructed from a vast array of
                 boards. The decisions made in the design of the board
                 are discussed, and the actual hardware (based on an
                 array of Field Programmable Gate Array chips) is
                 described. It is shown that this machine can be
                 programmed by translating a subset of the Occam
                 language into asynchronous modules. Using the Circal
                 process algebra, a new method of formally verifying
                 asynchronous modules for these circuits is presented.
                 This method allows bounded gate delays to be included
                 in a two-level modelling mechanism.},
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 77

@article{howard-taylor:92,
  author =       {N. Howard and R. W. Taylor},
  title =        {Reconfigurable logic: technology and applications},
  journal =      {Computing \& Control Engineering Journal},
  volume =       {3},
  number =       {5},
  pages =        {235--240},
  month =        sep,
  year =         {1992},
  key =          {app},
  abstract =     {Field programmable gate arrays (FPGAs) have the
                 potential to revolutionise the design of modern
                 computer systems. With the current generation of
                 reconfigurable arrays, the distinction between
                 hardware, software and firmware blurs, permitting the
                 designer to mix and match according to application. The
                 promise of logic systems that can be configured in
                 fractions of a second holds out the very real
                 possibility of designing adaptive hardware-machines
                 that can optimise themselves for their environment. The
                 article introduces the internal design of an ultra-fine
                 grain FPGA family, the Plessey/Pilkington ERA, (e.g.
                 ERA60100) and briefly compares this with other
                 families. The integrated design synthesis for mixed
                 hardware/software systems is discussed and the article
                 concludes with two examples of FPGA systems. One is a
                 distributed instrumentation system for process
                 tomography (the use of non-invasive sensors to measure
                 and reconstruct the internal state of an industrial
                 system), the other is a fast cellular automata
                 machine.},
}

%                            Index Number: 78

@article{cockshott-shaw:92,
  author =       {P. Cockshott and P. Shaw and P. Barrie and G. J.
                 Milne},
  title =        {Scalable cellular array architecture},
  journal =      {Computing \& Control Engineering Journal},
  volume =       {3},
  number =       {5},
  pages =        {219--224},
  month =        sep,
  year =         {1992},
  key =          {system, tool, app},
  abstract =     {A new class of universal computers, cellular array
                 machines, is introduced. The development of
                 reprogrammable logic leading up to the development of
                 field programmable gate arrays is reviewed. An
                 architecture for constructing modular and scalable
                 general-purpose computers from reprogrammable logic is
                 presented. It is shown that such a machine can be
                 programmed by translating Occam into asynchronous
                 digital logic circuits. Examples are given of circuit
                 components which implement Occam operators.},
}

%                            Index Number: 79

@inproceedings{sueyoshi-apduhan:92,
  author =       {T. Sueyoshi and B. O. Apduhan and S. Funakoshi and I.
                 Arita},
  title =        {A new approach towards realization of reconfigurable
                 interconnection networks},
  booktitle =    {Eleventh Annual International Phoenix Conference on
                 Computers and Communications},
  pages =        {456--463},
  year =         {1992},
  key =          {system},
  abstract =     {A new approach to realize a reconfigurable
                 interconnection network which is the key factor in
                 constructing reconfigurable parallel computers,
                 utilizing the reconfigurability features of a field
                 programmable gate array (FPGA), is presented. The
                 organization of the reconfigurable interconnection
                 network and the mapping strategy for static and dynamic
                 networks are discussed. Mapping examples are included.
                 The control mechanism and interface that make it
                 possible to implement the optimum interconnection
                 topologies for interprocessor communication patterns on
                 the interconnection network for the efficient execution
                 of application programs on a multiprocessor system are
                 outlined. The system organization of a reconfigurable
                 interconnection network for a massively parallel
                 multiprocessor is described.},
}

%                            Index Number: 80

@InProceedings{albaharna-cheung:94,
  author =       "O. T. Albaharna and P. Y. K. Cheung and T. J. Clarke",
  title =        "Area \& Time Limitations of {FPGA}-based Virtual
                 Hardware",
  key =          "device",
  booktitle =    "Proceedings of the IEEE International Conference on
                 Computer Design",
  pages =        "184--189",
  address =      "Cambridge, Massachusetts",
  month =        oct,
  year =         "1994",
  abstract =     "This paper examines the limitations of integrating
                 programmable logic with a powerful core processor on
                 the same die. An abstract model to investigate the area
                 and delay of Field Programmable Gate Array
                 architectures is presented. The model is used to show
                 that a system implemented on FPGAs will require as much
                 as 100 times more die area than its custom VLSI
                 implementation and would be about 10 times slower. Our
                 analysis shows that this high cost, inherent to the
                 current FPGA-based architectures, is a severe
                 limitation to virtual hardware development. A new
                 approach is needed to deliver high computational
                 speed-ups comparable to multiple processor systems with
                 the same total die area.",
}

%                            Index Number: 81

@inproceedings{jhitta:93,
  author =       {M. S. Jhitta},
  title =        {Introduction of a New {FPGA} Architecture},
  booktitle =    {More FPGAs. Oxford International Workshop on
                 Field-Programmable Logic and Applications},
  editor =       {W. Moore and W. Luk},
  pages =        {13--23},
  month =        aug,
  year =         {1993},
  address =      {Oxford, England},
  publisher =    {Abingdon EE\&CS Books},
  key =          {device},
  abstract =     {This paper outlines the architecture of the fourth
                 generation Dynamically Programmable Logic Device
                 (DPLD), a high performance, fine grain, SRAM field
                 programmable architecture developed by Pilkington
                 Micro-electronics. The changes and enhancements made
                 are based on extensive market research and
                 evaluation/benchmarking. The resulting architecture
                 specification differs significantly from previous
                 generations, with an enhanced cell structure and
                 improved routing structures.},
}

%                            Index Number: 82

@InProceedings{dehon:91,
  author =       "Andr{\'e} DeHon",
  title =        "Practical Schemes for Fat-Tree Network Construction",
  key =          "device",
  booktitle =    "Advanced Research in {VLSI}: Proceedings of the 1991
                 University of California/Santa Cruz Conference",
  year =         "1991",
  month =        mar,
  address =      "Santa Cruz, CA",
  editor =       "C. Sequin",
  pages =        "307--322",
  abstract =     "As multiprocessor computer networks are scaled to
                 support thousands and millions of processors, we must
                 exploit locality in order to avoid uniform degradation
                 in network performance. Fat-tree networks offer a
                 topology that theoretically scales arbitrarily while
                 allowing the exploitation of considerable locality. In
                 this paper, I present a scheme for constructing
                 practical fat-tree networks. Integrating expanders for
                 redundant multipath switching networks, I incorporate
                 fault-tolerance into the fat-tree network. I present
                 primitive building blocks for the construction of these
                 networks and describe how these building blocks can be
                 synthesized using current technology. I also present
                 organizational structures for composing these
                 primitives into arbitrarily large networks. This
                 synthesis results in a practical scheme for building
                 large-scale, high-performance multiprocessor computer
                 networks. With suitable locality and technology, a
                 786,432 processor network can route a message on the
                 first attempt with over 70\% probability when the
                 network is fully loaded. The latency through the
                 network from one endpoint to another is at most 320 ns.
                 For more local connections, the network latency can be
                 as small as 40 ns.",
}

%                            Index Number: 83

@InProceedings{grunbacher-jaud:92,
  author =       "H. Gr{\"u}nbacher and A. Jaud",
  title =        "{JAPROC} - An 8 bit Micro Controller Design and its
                 Test Environment",
  key =          "system",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "146--151",
  abstract =     "This paper describes the design of JAPROC, an 8-bit
                 micro controller. JAPROC is a processor-core which is
                 being developed within the EUREKA project JAMIE. The
                 design consists of approximately 5000 gates and has
                 been implemented in a FPGA Xilinx X4005. For testing
                 purposes a PC board has been developed which allows to
                 configure the FPGA, download and execute micro
                 controller code and compare the results to an
                 emulator.",
}

%                            Index Number: 84

@InProceedings{surmann-ungering:92,
  author =       "H. Surmann and A. Ungering and K. Goser",
  title =        "Optimized Fuzzy Controller Architecture for Field
                 Programmable Gate Arrays",
  key =          "app",
  booktitle =    "Field-Programmable Gate Arrays: Architectures and
                 Tools for Rapid Prototyping. Second International
                 Workshop on Field Programmable Logic and Applications",
  year =         "1992",
  address =      "Vienna, Austria",
  month =        aug,
  pages =        "124--133",
  abstract =     "This paper describes an optimized fuzzy controller
                 (FC) architecture and its realization with field
                 programmable gate arrays (FPGAs). In consideration of
                 data dependencies and minor user restrictions within
                 the definition of fuzzy rules (FRs), it is possible to
                 develop a high speed FPGA architecture. A prototype of
                 the FC operates at 5 MHz and needs 50 $\mu$s operation
                 time (8 bit resolution) independent of the number of
                 inputs/outputs with 256 fuzzy rules. A pipeline
                 architecture is used to achieve a high processing
                 speed.",
}

%above is the year of the conference; year of publication:1993
%above is the place of the conf.; place of publication:Berlin, Germany
%date of conference: 31 Aug.-2 Sept. 1992

%                            Index Number: 85

@article{dillien-phillips:89,
  author =       {P. Dillien and I. Phillips},
  title =        {{ASIC} design flexibility with {ERA}s},
  journal =      {Electronic Product Design},
  volume =       {10},
  number =       {10},
  pages =        {29,31--32,34},
  month =        oct,
  year =         {1989},
  key =          {device},
  abstract =     {The ERA has a number of aspects; it can be viewed as a
                 proving vehicle for conventional masked gate array; as
                 a programmable logic device in its own right for
                 applications with lower volumes or where NREs cannot be
                 carried; but it also opens up new possibilities in
                 system design. For example, the implementation of
                 dynamically reconfigurable hardware for multi-tasking
                 applications, fault-tolerant systems and neural network
                 computers. Classified as a field programmable gate
                 array (FPGA), the ERA uses an embedded static RAM to
                 control the routeing of signals between its logic
                 elements. Even though its application areas overlap
                 considerably with those for high-performance
                 programmable logic devices (PLDs), the ERA should not
                 be confused with conventional PLDs. Unlike some
                 competing FPGAs which were designed upwards from PLD
                 architectures, the ERA has an architecture which is
                 much closer to a conventional gate array than to a
                 PLD.},
}

%                            Index Number: 86

@inproceedings{wolfe-shen:88,
  author =       {A. Wolfe and J. P. Shen},
  title =        {Flexible Processors: a Promising Application-Specific
                 Processor Design Approach},
  booktitle =    {Proceedings of the 21st Annual Workshop on
                 Microprogramming and Microarchitecture - MICRO '21},
  pages =        {30--39},
  month =        nov,
  year =         {1988},
  address =      {San Diego, CA},
  key =          {system, app},
  abstract =     {A new approach to application specific processor
                 design is presented in this paper. Existing application
                 specific processors are either based on existing
                 general purpose processors or custom designed special
                 purpose processors. The availability of a new
                 technology, the Xilinx Logic Cell Array, presents the
                 opportunity for a new alternative. The Flexible
                 Processor Cell is a prototype of an extremely
                 reconfigurable application specific processor. Flexible
                 processors can potentially provide the performance
                 advantages of special purpose processors. The flexible
                 processor concept opens many potential areas for future
                 research in processor architecture and implementation.
                 This paper presents the design, implementation, and
                 preliminary performance evaluation of an experimental
                 flexible processor.},
}

%                            Index Number: 87

@Article{fagin:93,
  author   = {B. S. Fagin},
  title    = {Quantitative Measurements of {FPGA} Utility in Special
              and General Purpose Processors},
  journal  = {Journal of VLSI Signal Processing},
  volume   = {6},
  number   = {2},
  pages    = {129--137},
  month    = aug,
  year     = {1993},
  address  = {Boston, Massachusetts},
  key      = {app, system},
  abstract = {We present experimental results on FPGA use in special
              and general purpose processors, using as case studies a
              computational accelerator for gene sequence analysis,
              an integer implementation of the DLX microprocessor and
              a real-time signal processor for rocket telemetry. All
              these devices have been successfully prototyped, and
              are now completely functional. We present detailed
              analysis of our experience with FPGAs in these
              machines, describing typically an order of magnitude
              improvement over discrete IC implementations.},
}

%                            Index Number: 88

@InProceedings{erdogan-wahab:92,
  author =       "S. S. Erdogan and A. Wahab",
  title =        "Design of {RM-nc}: a reconfigurable neurocomputer for
                 massively parallel-pipelined computations",
  key =          "system, app",
  booktitle =    "IJCNN International Joint Conference on Neural
                 Networks",
  volume =       "2",
  pages =        "33--38",
  year =         "1992",
  abstract =     "The design of RM-nc, a reconfigurable machine for
                 massively parallel-pipelined computations, is
                 considered with the objective of demonstrating that a
                 completely reconfigurable platform, not only in the
                 domain of communication and control but also in the
                 domain of processing elements (PEs), is feasible. The
                 implementation of a fast computational element and
                 control environment for neural network simulations is
                 presented in order to assess the cost of providing
                 reconfigurability at computational level. The
                 implementation of a fast floating-point sum-of-products
                 circuit using special carry-save multipliers and
                 extensive pipelining is outlined on a field
                 programmable gate array (FPGA) platform. It is shown
                 that the flexibility of FPGA