@ARTICLE{8433895,
  author={B. {Karanov} and M. {Chagnon} and F. {Thouin} and T. A. {Eriksson} and H. {Bülow} and D. {Lavery} and P. {Bayvel} and L. {Schmalen}},
  journal={Journal of Lightwave Technology},
  title={End-to-End Deep Learning of Optical Fiber Communications},
  year={2018},
  volume={36},
  number={20},
  pages={4843-4855},
  abstract={In this paper, we implement an optical fiber communication system as an end-to-end deep neural network, including the complete chain of transmitter, channel model, and receiver. This approach enables the optimization of the transceiver in a single end-to-end process. We illustrate the benefits of this method by applying it to intensity modulation/direct detection (IM/DD) systems and show that we can achieve bit error rates below the 6.7% hard-decision forward error correction (HD-FEC) threshold. We model all componentry of the transmitter and receiver, as well as the fiber channel, and apply deep learning to find transmitter and receiver configurations minimizing the symbol error rate. We propose and verify in simulations a training method that yields robust and flexible transceivers that allow, without reconfiguration, reliable transmission over a large range of link dispersions. The results from end-to-end deep learning are successfully verified for the first time in an experiment. In particular, we achieve information rates of 42 Gb/s below the HD-FEC threshold at distances beyond 40 km. We find that our results outperform conventional IM/DD solutions based on two- and four-level pulse amplitude modulation with feedforward equalization at the receiver. Our study is the first step toward end-to-end deep learning based optimization of optical fiber communication systems.},
  keywords={error statistics;forward error correction;intensity modulation;learning (artificial intelligence);neural nets;optical fibre communication;optical fibre dispersion;optical modulation;pulse amplitude modulation;intensity modulation/direct detection systems;bit error rates;fiber channel;symbol error rate;end-to-end deep learning based optimization;optical fiber communication system;optical fiber communications;forward error correction;single end-to-end process;end-to-end deep neural network;size 40.0 km;Training;Machine learning;Receivers;Optical transmitters;Transceivers;Optimization;Communication systems;Deep learning;detection;machine learning;modulation;neural networks;optical fiber communication},
  doi={10.1109/JLT.2018.2865109},
  ISSN={1558-2213},
  month={Oct}
}
@ARTICLE{8664650,
  author={B. {Zhu} and J. {Wang} and L. {He} and J. {Song}},
  journal={IEEE Journal on Selected Areas in Communications},
  title={Joint Transceiver Optimization for Wireless Communication PHY Using Neural Network},
  year={2019},
  volume={37},
  number={6},
  pages={1364-1373},
  abstract={Deep learning has a wide application in the area of natural language processing and image processing due to its strong ability of generalization. In this paper, we propose a novel neural network structure for jointly optimizing the transmitter and receiver in the communication physical layer under fading channels. We build up a convolutional autoencoder to simultaneously conduct the roles of modulation, equalization, and demodulation. The proposed system is able to design different mapping schemes from input bit sequences of arbitrary length to constellation symbols according to different channel environments. The simulation results show that the performance of the neural network-based system is superior to traditional modulation and equalization methods in terms of time complexity and bit error rate under fading channels. The proposed system can also be combined with other coding techniques to further improve the performance. Furthermore, the proposed system network is more robust to channel variation than traditional communication methods.},
  keywords={computational complexity;error statistics;fading channels;learning (artificial intelligence);natural language processing;neural nets;radio transceivers;wireless communication PHY;deep learning;natural language processing;image processing;communication physical layer;fading channels;convolutional autoencoder;arbitrary length;constellation symbols;neural network-based system;traditional modulation;time complexity;bit error rate;system network;traditional communication methods;transceiver optimization;neural network structure;equalization methods;mapping scheme;channel environments;coding techniques;Receivers;Convolutional codes;Communication systems;Deep learning;Neurons;Modulation;Transmitters;Deep learning;modulation;equalization;autoencoder;frequency selective fading},
  doi={10.1109/JSAC.2019.2904361},
  ISSN={1558-0008},
  month={June}
}
@ARTICLE{6975096,
  author={M. A. {Jarajreh} and E. {Giacoumidis} and I. {Aldaya} and S. T. {Le} and A. {Tsokanos} and Z. {Ghassemlooy} and N. J. {Doran}},
  journal={IEEE Photonics Technology Letters},
  title={Artificial Neural Network Nonlinear Equalizer for Coherent Optical OFDM},
  year={2015},
  volume={27},
  number={4},
  pages={387-390},
  abstract={We propose a novel low-complexity artificial neural network (ANN)-based nonlinear equalizer (NLE) for coherent optical orthogonal frequency-division multiplexing (CO-OFDM) and compare it with the recent inverse Volterra-series transfer function (IVSTF)-based NLE over up to 1000 km of uncompensated links. Demonstration of ANN-NLE at 80-Gb/s CO-OFDM using 16-quadrature amplitude modulation reveals a Q-factor improvement after 1000-km transmission of 3 and 1 dB with respect to the linear equalization and IVSTF-NLE, respectively.},
  keywords={neural nets;OFDM modulation;quadrature amplitude modulation;novel low-complexity artificial neural network-based nonlinear equalizer;novel low-complexity ANN-based NLE;CO-OFDM;coherent optical orthogonal frequency-division multiplexing;16-quadrature amplitude modulation;Q-factor improvement;IVSTF-NLE;OFDM;Artificial neural networks;Q-factor;Optical fibers;Nonlinear optics;Equalizers;Optical fiber networks;Optical communication;coherent optical fiber transmission;functional link artificial neural networks;nonlinear equalizer;OFDM},
  doi={10.1109/LPT.2014.2375960},
  ISSN={1941-0174},
  month={Feb}
}
@INPROCEEDINGS{7011454,
  author={P. {Dondon} and J. {Carvalho} and R. {Gardere} and P. {Lahalle} and G. {Tsenov} and V. {Mladenov}},
  booktitle={12th Symposium on Neural Network Applications in Electrical Engineering (NEUREL)},
  title={Implementation of a feed-forward Artificial Neural Network in VHDL on FPGA},
  year={2014},
  pages={37-40},
  abstract={Describing an Artificial Neural Network (ANN) using VHDL allows a further implementation of such a system on FPGA. Indeed, the principal point of using FPGA for ANNs is flexibility, which gives it an advantage over other systems like ASICs, which are entirely dedicated to one unique architecture, and its allowance for parallel programming, which is inherent to ANN calculation systems and one of their advantages. Usually FPGAs do not have unlimited logical resources integrated in a single package, and this limitation forces a requirement for optimizations of the design in order to have the best efficiency in terms of speed and resource consumption. This paper deals with the VHDL designing problems which can be encountered when trying to describe and implement such ANNs on FPGAs.},
  keywords={feedforward neural nets;field programmable gate arrays;hardware description languages;parallel programming;feedforward artificial neural network;FPGA;ASICS;parallel programming;ANN calculation system;logical resources;resource consumption;VHDL designing problems;Neurons;Field programmable gate arrays;Biological neural networks;Artificial neural networks;Random access memory;Read only memory;MATLAB;FPGA implementation;neural networks;nonlinear systems;VHDL},
  doi={10.1109/NEUREL.2014.7011454},
  month={Nov}
}
@INPROCEEDINGS{5328349,
  author={V. {Gupta} and K. {Khare} and R. P. {Singh}},
  booktitle={2009 International Conference on Advances in Recent Technologies in Communication and Computing},
  title={FPGA Design and Implementation Issues of Artificial Neural Network Based PID Controllers},
  year={2009},
  pages={860-862},
  abstract={This paper discusses implementation issues of FPGA and ANN based PID controllers. FPGA-based reconfigurable computing architectures are suitable for hardware implementation of neural networks. FPGA realization of ANNs with a large number of neurons is still a challenging task. This paper discusses the issues involved in implementation of a multi-input neuron with linear/nonlinear excitation functions using FPGA. It also suggests advantages of error self-recurrent neural networks over back propagation neural networks.},
  keywords={backpropagation;field programmable gate arrays;integrated circuit design;neural nets;reconfigurable architectures;three-term control;FPGA design;artificial neural network;PID controller;reconfigurable computing architecture;multiinput neuron;nonlinear excitation function;error self-recurrent neural network;backpropagation neural network;Field programmable gate arrays;Artificial neural networks;Three-term control;Neurons;Control systems;Multi-layer neural network;Communication system control;Hardware;Neural networks;Computer networks;FPGA;PID controller;artificial neural networks;error self-recurrent neural networks},
  doi={10.1109/ARTCom.2009.182},
  month={Oct}
}
@BOOK{omondi_rajapakse_2006,
  author={Omondi, Amos R. and Rajapakse, Jagath Chandana},
  title={FPGA Implementations of Neural Networks},
  publisher={Springer},
  address={Dordrecht, The Netherlands},
  year={2006},
  pages={56}
}
@INPROCEEDINGS{8702332,
  author={O. {Bilaniuk} and S. {Wagner} and Y. {Savaria} and J. {David}},
  booktitle={2019 IEEE International Symposium on Circuits and Systems (ISCAS)},
  title={Bit-Slicing FPGA Accelerator for Quantized Neural Networks},
  year={2019},
  pages={1-5},
  abstract={Deep Neural Networks (DNNs) have become the state-of-the-art in several domains such as computer vision or speech recognition. However, using DNNs for embedded applications is still strongly limited because of their complexity and the energy required to process large data sets. In this paper, we present the architecture of an accelerator for quantized neural networks and its implementation on a Nallatech 385-A7 board with an Altera Stratix V GX A7 FPGA. The accelerator's design centers around the matrix-vector product as the key primitive, and exploits bit-slicing to extract maximum performance using low-precision arithmetic.},
  keywords={bit-slice computers;electronic engineering computing;embedded systems;field programmable gate arrays;neural nets;bit-slicing FPGA accelerator;quantized neural networks;Deep Neural Networks;embedded applications;Nallatech 385-A7 board;Altera Stratix V GX A7 FPGA;DNN;low-precision arithmetic;matrix-vector product;Neural networks;Field programmable gate arrays;Computer architecture;Hardware;System-on-chip;Parallel processing;Neural Networks;Accelerators;BNN;CNN;RNN;QNN;FPGA},
  doi={10.1109/ISCAS.2019.8702332},
  ISSN={2158-1525},
  month={May}
}
@INPROCEEDINGS{7929192,
  author={E. {Nurvitadhi} and D. {Sheffield} and {Jaewoong Sim} and A. {Mishra} and G. {Venkatesh} and D. {Marr}},
  booktitle={2016 International Conference on Field-Programmable Technology (FPT)},
  title={Accelerating Binarized Neural Networks: Comparison of FPGA, CPU, GPU, and ASIC},
  year={2016},
  pages={77-84},
  abstract={Deep neural networks (DNNs) are widely used in data analytics, since they deliver state-of-the-art accuracies. Binarized neural networks (BNNs) are a recently proposed optimized variant of DNNs. BNNs constrain network weights and/or neuron values to either +1 or -1, which is representable in 1 bit. This leads to dramatic algorithm efficiency improvement, due to reduction in the memory and computational demands. This paper evaluates the opportunity to further improve the execution efficiency of BNNs through hardware acceleration. We first propose a BNN hardware accelerator design. Then, we implemented the proposed accelerator on an Arria 10 FPGA as well as a 14-nm ASIC, and compared them against optimized software on a Xeon server CPU, an Nvidia Titan X server GPU, and an Nvidia TX1 mobile GPU. Our evaluation shows that FPGA provides superior efficiency over CPU and GPU. Even though CPU and GPU offer high peak theoretical performance, they are not as efficiently utilized since BNNs rely on binarized bit-level operations that are better suited for custom hardware. Finally, even though ASIC is still more efficient, FPGA can provide orders of magnitude in efficiency improvements over software, without having to lock into a fixed ASIC solution.},
  keywords={application specific integrated circuits;field programmable gate arrays;graphics processing units;microprocessor chips;neural nets;binarized neural networks;BNN hardware accelerator design;Arria 10 FPGA;CPU;GPU;ASIC;deep neural network;DNN;hardware acceleration;Neurons;Random access memory;Biological neural networks;Field programmable gate arrays;Graphics processing units;Hardware;System-on-chip;Deep learning;binarized neural networks;FPGA;CPU;GPU;ASIC;data analytics;hardware accelerator},
  doi={10.1109/FPT.2016.7929192},
  month={Dec}
}
@ARTICLE{8954866,
  author={J. {Han} and Z. {Li} and W. {Zheng} and Y. {Zhang}},
  journal={Tsinghua Science and Technology},
  title={Hardware implementation of spiking neural networks on FPGA},
  year={2020},
  volume={25},
  number={4},
  pages={479-486},
  abstract={Inspired by real biological neural models, Spiking Neural Networks (SNNs) process information with discrete spikes and show great potential for building low-power neural network systems. This paper proposes a hardware implementation of SNNs based on Field-Programmable Gate Arrays (FPGA). It features a hybrid updating algorithm, which combines the advantages of existing algorithms to simplify hardware design and improve performance. The proposed design supports up to 16 384 neurons and 16.8 million synapses but requires minimal hardware resources and achieves a very low power consumption of 0.477 W. A test platform is built based on the proposed design using a Xilinx FPGA evaluation board, upon which we deploy a classification task on the MNIST dataset. The evaluation results show an accuracy of 97.06% and a frame rate of 161 frames per second.},
  keywords={field programmable gate arrays;low-power electronics;neural chips;MNIST dataset;classification task;test platform;SNN;spiking neural networks;neural network process information;Xilinx FPGA evaluation board;low power consumption;minimal hardware resources;hardware design;hybrid updating algorithm;Field-Programmable Gate Arrays;low-power neural network systems;discrete spikes;biological neural models;power 0.477 W;Spiking Neural Network (SNN);Field-Programmable Gate Arrays (FPGA);digital circuit;low-power;MNIST},
  doi={10.26599/TST.2019.9010019},
  ISSN={1007-0214},
  month={Aug}
}
@INPROCEEDINGS{9039366,
  author={M. I. {Iamaev} and S. P. {Shipitsin}},
  booktitle={2020 IEEE Conference of Russian Young Researchers in Electrical and Electronic Engineering (EIConRus)},
  title={Performance Comparison of FPGA-based Convolutional Neural Networks by Internal Representations},
  year={2020},
  pages={1814-1817},
  abstract={Reconfigurable Field-Programmable Gate Arrays (FPGAs) have prospects for application in mobile and wearable electronics. FPGA-based neural networks have a strong advantage in energy consumption compared to other solutions. To further improve their energy efficiency, it is appropriate to study the effect of individual network parameters on the entire system performance. For this reason, different internal representation variants of a convolutional neural network (CNN) were compared and investigated. The study involves an accuracy parameter analysis with restricted memory for weights and increasing network depth. Binary parameters were chosen for the FPGA implementation as more efficient. The binarized CNN was compared with an equal CNN by memory consumption of weights. In addition, the mathematical problem statement of realizing a binarized neural network is considered.},
  keywords={convolutional neural nets;energy conservation;energy consumption;field programmable gate arrays;neural chips;FPGA-based convolutional neural networks;Reconfigurable Field-Programmable Gate Arrays;mobile electronics;wearable electronics;energy consumption;energy efficiency;network depth;binary parameters;binarized neural network;accuracy parameter analysis;internal representation variants;system performance;individual network parameter effect;restricted memory;binarized CNN;mathematical problem statement;Field programmable gate arrays;Training;Convolutional neural networks;Memory management;Energy efficiency;FPGA;CNN;MNIST;Binarized neural network;Glorot Initializer},
  doi={10.1109/EIConRus49466.2020.9039366},
  ISSN={2376-6565},
  month={Jan}
}
@INPROCEEDINGS{903443,
  author={H. F. {Restrepo} and R. {Hoffmann} and A. {Perez-Uribe} and C. {Teuscher} and E. {Sanchez}},
  booktitle={Proceedings 2000 IEEE Symposium on Field-Programmable Custom Computing Machines (Cat. No.PR00871)},
  title={A networked FPGA-based hardware implementation of a neural network application},
  year={2000},
  pages={337-338},
  abstract={Describes a networked FPGA-based implementation of the FAST (Flexible Adaptable-Size Topology) architecture, an artificial neural network (ANN) that dynamically adapts its size. Most ANN models base their ability to adapt to problems on changing the strength of the interconnections between computational elements according to a given learning algorithm. However, constrained interconnection structures may limit such ability. Field programmable hardware devices are very well adapted for the implementation of ANNs with in-circuit structure adaptation. To realize this implementation, we used a network of Labomat-3 boards (a reconfigurable platform developed in our laboratory), which communicate with each other using TCP/IP or a faster direct hardware connection.},
  keywords={neural chips;field programmable gate arrays;network topology;interconnected systems;reconfigurable architectures;transport protocols;local area networks;networked FPGA-based hardware implementation;neural network application;FAST architecture;flexible adaptable-size topology;artificial neural network;dynamic size adaptation;interconnection strength;computational elements;learning algorithm;constrained interconnection structures;field programmable hardware devices;in-circuit structure adaptation;Labomat-3 boards;reconfigurable platform;TCP/IP;direct hardware connection;Neural network hardware;Neural networks;Artificial neural networks;Field programmable gate arrays;Neurons;Frequency;Laboratories;Network topology;Parallel processing;Education},
  doi={10.1109/FPGA.2000.903443},
  month={April}
}
@INPROCEEDINGS{8108073,
  author={T. V. {Huynh}},
  booktitle={2017 4th NAFOSTED Conference on Information and Computer Science},
  title={Deep neural network accelerator based on FPGA},
  year={2017},
  pages={254-257},
  abstract={In this work, we propose an efficient architecture for the hardware realization of deep neural networks on reconfigurable computing platforms like FPGA. The proposed neural network architecture employs only one single physical computing layer to perform the whole computational fabric of fully-connected feedforward deep neural networks with a customizable number of layers, number of neurons per layer and number of inputs. The inputs, weights and outputs of the network are represented in 16-bit half-precision floating-point number format. The network weights are hard-coded using on-chip memory of FPGA devices, allowing for very fast computation. For performance evaluation, the handwritten digit recognition application with the MNIST database is performed, which reported a recognition rate of 97.20% and a peak performance of 15.81 kFPS when using a deep neural network of size 784-40-40-10 on the Xilinx Virtex-5 XC5VLX-110T device. When implementing a deep neural network of size 784-126-126-10 for the MNIST database on the Xilinx Zynq-7000 XC7Z045 device, the recognition rate is 98.16% and the peak performance is 15.90 kFPS.},
  keywords={feedforward neural nets;field programmable gate arrays;floating point arithmetic;neural net architecture;neural network accelerator;FPGA;deep neural network;reconfigurable computing platforms;neural network architecture;single physical computing layer;computational fabric;fully-connected feedforward deep neural networks;network weights;peak performance;half-precision floating-point number format;Xilinx Zynq-7000 XC7Z045 device;Neurons;Hardware;Field programmable gate arrays;Computer architecture;Biological neural networks;Performance evaluation;Algorithm design and analysis;machine learning;deep neural network;MNIST;FPGA;floating-point},
  doi={10.1109/NAFOSTED.2017.8108073},
  month={Nov}
}
@INPROCEEDINGS{7799795,
  author={S. {Li} and K. {Choi} and Y. {Lee}},
  booktitle={2016 International SoC Design Conference (ISOCC)},
  title={Artificial neural network implementation in FPGA: A case study},
  year={2016},
  pages={297-298},
  abstract={Artificial Neural Networks (ANNs) are very powerful for dealing with signal processing, computer vision and many other recognition problems. In this work, we implement a basic ANN in FPGA. Compared with software, the FPGA implementation can utilize parallelism to speed up processing time. Additionally, hardware implementation can save more power compared with CPU/GPU. Our ANN in FPGA has a high learning ability: for the logical XOR problem, it reduced the error rate from $10^{-2}$ to $10^{-4}$.},
  keywords={field programmable gate arrays;neural chips;artificial neural network;FPGA;ANN;signal processing;computer vision;recognition problem;CPU;GPU;logical XOR problem;error rate;Artificial neural networks;Parallel processing;Field programmable gate arrays;artificial neural network;parallelism;back propagation;LReLU;FPGA},
  doi={10.1109/ISOCC.2016.7799795},
  month={Oct}
}
@INPROCEEDINGS{9012821,
  author={H. {Wang} and X. {Zhang} and D. {Kong} and G. {Lu} and D. {Zhen} and F. {Zhu} and K. {Xu}},
  booktitle={2019 IEEE International Conference on Integrated Circuits, Technologies and Applications (ICTA)},
  title={Convolutional Neural Network Accelerator on FPGA},
  year={2019},
  pages={61-62},
  abstract={This paper presents the design and FPGA implementation of a convolutional neural network accelerator (CNNA). Two kinds of sparsity, zero-valued weights and zero-valued input feature maps, are exploited to save power. The design features hierarchical memory organization to reduce external memory access. Bandwidth compression and decompression are also proposed to reduce external memory bandwidth. The unified scratch memory can be configured dynamically layer-by-layer to maximize memory utilization. The proposed CNNA is designed in Xilinx high level synthesis (HLS) language and implemented on a ZCU102 board. With a total of 2048 multiply-and-accumulation (MAC) units, the design is able to deliver 1 TOPS of computing power when running at 250 MHz.},
  keywords={convolutional neural nets;data compression;field programmable gate arrays;high level synthesis;integrated circuit design;random-access storage;convolutional neural network accelerator;FPGA implementation;CNNA;zero-valued weights;hierarchical memory organization;external memory access;external memory bandwidth;unified scratch memory;memory utilization;zero-valued input feature map;bandwidth compression;Xilinx high level synthesis language;Xilinx HLS language;ZCU102 board;multiply-and-accumulation unit;MAC unit;frequency 250.0 MHz;Field programmable gate arrays;Computer architecture;Bandwidth;Convolutional neural networks;Convolution;Hardware;Registers;convolutional neural network;FPGA},
  doi={10.1109/ICTA48799.2019.9012821},
  month={Nov}
}
@INPROCEEDINGS{9027479,
  author={P. W. {Zaki} and A. M. {Hashem} and E. A. {Fahim} and M. A. {Masnour} and S. M. {ElGenk} and M. {Mashaly} and S. M. {Ismail}},
  booktitle={2019 15th International Computer Engineering Conference (ICENCO)},
  title={A Novel Sigmoid Function Approximation Suitable for Neural Networks on FPGA},
  year={2019},
  pages={95-99},
  abstract={Artificial Neural Networks (ANNs) are entering a lot of practical applications in our life nowadays. One of the main blocks of an ANN is the activation function block, which is based on the sigmoid function. The hardware implementation of the sigmoid function is a challenging task; hence some approximation techniques were previously developed. In this paper, a novel sigmoid approximation technique is proposed and compared with previous techniques, on both the simulation and hardware design levels. They are applied in a neural network application, where the proposed technique showed high accuracy compared to the original sigmoid function. Moreover, the different techniques are implemented on a Virtex 7 FPGA using IEEE 754 Floating Point representation to achieve high precision, where the proposed approximation consumed the least hardware area utilization compared to previous works for a clock frequency of 358.166 MHz.},
  keywords={field programmable gate arrays;floating point arithmetic;function approximation;neural nets;Artificial Neural Networks;ANN;main blocks;activation function block;hardware implementation;novel sigmoid approximation technique;hardware design levels;neural network application;original sigmoid function;Virtex 7 FPGA;hardware area utilization;sigmoid function approximation;IEEE 754 floating point representation;frequency 358.166 MHz;Hardware;Field programmable gate arrays;Artificial neural networks;Matlab;Mathematical model;Image recognition;Neural Networks;Sigmoid;Floating Point;FPGA},
  doi={10.1109/ICENCO48310.2019.9027479},
  ISSN={2475-2320},
  month={Dec}
}
@INPROCEEDINGS{6614033,
  author={M. {Bohrn} and L. {Fujcik} and R. {Vrba}},
  booktitle={2013 36th International Conference on Telecommunications and Signal Processing (TSP)},
  title={Field Programmable Neural Array for feed-forward neural networks},
  year={2013},
  pages={727-731},
  abstract={This paper focuses on a novel circuit suitable for implementation of feed-forward artificial neural networks. The structure of the circuit is derived from the concept of the Field Programmable Neural Array - Field Programmable Neural Network. The aim of the paper is to describe an innovative, promising approach to fast hardware implementations of feed-forward artificial neural networks and introduce some special techniques that were used in the design of this circuit and its optimization. The proposed circuit facilitates implementations of all commonly used topologies of feed-forward neural networks and covers a wide spectrum of applications of neural networks. The circuit is optimized to be implemented into new FPGAs from Xilinx, primarily families 5 and 6, and it can also be implemented into an ASIC.},
  keywords={application specific integrated circuits;feedforward neural nets;field programmable gate arrays;neural chips;feedforward artificial neural networks;field programmable neural array;field programmable neural network;fast hardware implementations;FPGA;Xilinx;ASIC;Field programmable gate arrays;Biological neural networks;Hardware;Neurons;Control systems;Artificial neural networks;Artificial neural networks;field programmable neural network;FPGA;FPNA;FPNN},
  doi={10.1109/TSP.2013.6614033},
  month={July}
}
@INPROCEEDINGS{8280163,
  author={M. {Shimoda} and S. {Sato} and H. {Nakahara}},
  booktitle={2017 International Conference on Field Programmable Technology (ICFPT)},
  title={All binarized convolutional neural network and its implementation on an FPGA},
  year={2017},
  pages={291-294},
  abstract={A pre-trained convolutional neural network (CNN) is a feed-forward computation perspective, which is widely used in embedded systems requiring high power-and-area efficiency. This paper realizes a binarized CNN which treats only binarized values (+1/-1) for the weights and the activation values. In this case, the multiplier is replaced by an XNOR circuit instead of a dedicated DSP block. Binarization of both weights and activations is more suitable for hardware implementation. However, the first convolutional layer still calculates in integer precision, since the input value is an 8-bit RGB pixel and not binarized. In this paper, we decompose the input value into maps in which each pixel is in 1-bit precision. The proposed method enables a binarized CNN to use bitwise operations in all layers, and shares a binarized convolutional circuit among all convolutional layers. We call this the all binarized CNN. We compared our proposal with conventional ones. Since the all binarized CNN does not require a dedicated DSP block, our proposal is smaller and 1.2 times faster than typical CNNs, and almost maintains the baseline classification accuracy. In addition, the pipelined all binarized CNN achieved 1840 FPS, consumed 0.3 watts, and its accuracy was 82.8%.},
  keywords={Proposals;Field programmable gate arrays;Table lookup;Convolutional neural networks;Embedded systems;Hardware;Two dimensional displays;Deep Neural Network;Convolutional Neural Network;Binarized Convolutional Neural Network;FPGA},
  doi={10.1109/FPT.2017.8280163},
  month={Dec}
}
@INPROCEEDINGS{8469659,
  author={Q. {Yi}},
  booktitle={2018 2nd IEEE Advanced Information Management, Communicates, Electronic and Automation Control Conference (IMCEC)},
  title={FPGA Implementation of Neural Network Accelerator},
  year={2018},
  pages={1903-1906},
  abstract={A processor is presented to accelerate artificial neural networks. It implements the basic algorithms of the BP network and the Hopfield network (HNN). According to the characteristics of these two network algorithms, and based on the structure of Single Instruction Multiple Data (SIMD), we design a one-dimensional pulse array and a fully connected interconnection network. Data sharing between processing units can be implemented easily and flexibly. The FPGA simulation results show that the architecture effectively improves the speed of the neural network.},
  keywords={backpropagation;field programmable gate arrays;Hopfield neural nets;multiprocessor interconnection networks;parallel processing;neural network accelerator;artificial neural network;BP network;Hopfield network;one-dimensional pulse array;fully connected interconnection network;FPGA implementation;single instruction multiple data;SIMD;data sharing;Neural networks;Signal processing algorithms;Arrays;Training;Parallel processing;Registers;FPGA;neural network;SIMD},
  doi={10.1109/IMCEC.2018.8469659},
  month={May}
}
@INPROCEEDINGS{7824478,
  author={J. {Renteria-Cedano} and C. {Peréz-Wences} and L. M. {Aguilar-Lobo} and J. R. {Loo-Yau} and S. {Ortega-Cisneros} and P. {Moreno} and J. A. {Reynoso-Hernández}},
  booktitle={2016 46th European Microwave Conference (EuMC)},
  title={A novel configurable FPGA architecture for hardware implementation of multilayer feedforward neural networks suitable for digital pre-distortion technique},
  year={2016},
  pages={854-857},
  abstract={This paper presents a novel Field Programmable Gate Array (FPGA) architecture for hardware implementation of Multilayer Feedforward Neural Networks (MFNNs) suitable for the Digital Pre-Distortion (DPD) technique. This architecture consists of a single neuron, several storage units and multiplexors that allow the reconfiguration, via software, of the FPGA to handle different numbers of inputs, layers, neurons and threshold functions of the MFNN. This novel FPGA architecture offers the advantage of using FPGAs with fewer logical resources than those found in a Virtex-6. The usefulness of this novel FPGA architecture is demonstrated by the modeling of the AM-AM and AM-PM characteristics of a GaN class F PA and a Doherty PA in a Virtex-6 FPGA from Xilinx configured with two different MFNNs, using a 2.1 GHz LTE signal of 5 MHz of bandwidth. The results obtained with the FPGA implementation are compared with data computed with MATLAB, achieving a Normalized Mean Square Error (NMSE) better than -50 dB.},
  keywords={feedforward neural nets;field programmable gate arrays;Long Term Evolution;mean square error methods;multiplexing equipment;power amplifiers;power engineering computing;radio networks;radio transmitters;reconfigurable architectures;telecommunication computing;configurable FPGA architecture;hardware implementation;multilayer feedforward neural networks;MFNN;digital predistortion technique;DPD technique;field programmable gate array architecture;single neuron;storage units;multiplexors;AM-AM characteristic modeling;AM-PM characteristic modeling;GaN class F PA;Doherty PA;Virtex-6 FPGA;Xilinx;LTE signal;Matlab;normalized mean square error;NMSE;Field programmable gate arrays;Computer architecture;Neurons;Biological neural networks;Mathematical model;Hardware;Feedforward neural networks;Doherty power amplifier;artificial neural network;NARX;FPGA},
  doi={10.1109/EuMC.2016.7824478},
  month={Oct}
}
@INPROCEEDINGS{8892181,
  author={L. {Ioannou} and S. A. {Fahmy}},
  booktitle={2019 29th International Conference on Field Programmable Logic and Applications (FPL)},
  title={Neural Network Overlay Using FPGA DSP Blocks},
  year={2019},
  pages={252-253},
  abstract={With the increasingly wide application of neural networks, there has been significant focus on accelerating this class of computations. Larger, more complex networks are being proposed in a variety of domains, requiring more powerful computation platforms. The inherent parallelism and regularity of neural network structures means custom architectures can be adopted for this purpose. FPGAs have been widely used to implement such accelerators because of their flexibility, achievable performance, efficiency, and abundant peripherals. While platforms that utilize multicore CPUs and GPUs are also competitive, FPGAs offer superior energy efficiency, and a wider space of optimisations to enhance performance and efficiency. FPGAs are also more suitable for performing such computations at the edge, where multicore CPUs and GPUs are less likely to be used and energy efficiency is paramount.},
  keywords={digital signal processing chips;energy conservation;field programmable gate arrays;neural chips;power aware computing;FPGA DSP blocks;neural networks;parallelism;neural network structures;custom architectures;GPUs;energy efficiency;neural network overlay;Biological neural networks;Field programmable gate arrays;Computer architecture;Neurons;Parallel processing;Optimization;Edge Computing;Neural Networks;Overlays},
  doi={10.1109/FPL.2019.00048},
  ISSN={1946-1488},
  month={Sep}
}
@INPROCEEDINGS{8330546,
  author={S. {Hong} and I. {Lee} and Y. {Park}},
  booktitle={2018 International Conference on Electronics, Information, and Communication (ICEIC)},
  title={Optimizing a FPGA-based neural accelerator for small IoT devices},
  year={2018},
  pages={1-2},
  abstract={As neural networks have been widely used for machine-learning algorithms such as image recognition, designing efficient neural accelerators has recently become more important. However, designing neural accelerators is generally difficult because of their high memory storage requirement. In this paper, we propose an area- and power-efficient neural accelerator for small IoT devices, using 4-bit fixed-point weights through a quantization technique. The proposed neural accelerator is trained through the TensorFlow infrastructure and the weight data is optimized in order to reduce the overhead of the high weight memory requirement. Our FPGA-based design achieves 97.44% accuracy with the MNIST 10,000 test images.},
  keywords={field programmable gate arrays;Internet of Things;learning (artificial intelligence);neural nets;4-bit fixed-point weights;IoT devices;neural networks;machine-learning algorithms;high memory storage requirement;weight memory requirement;area-and-power efficient neural accelerator;quantization technique;FPGA-based neural accelerator training;TensorFlow infrastructure;Biological neural networks;Training;Field programmable gate arrays;Quantization (signal);Memory management;Neurons;Optimization;Neural networks;Accelerator;Quantization;FPGA},
  doi={10.23919/ELINFOCOM.2018.8330546},
  month={Jan}
}
@INPROCEEDINGS{7280031,
  author={Q. X. {Wu} and X. {Liao} and X. {Huang} and R. {Cai} and J. {Cai} and J. {Liu}},
  booktitle={2015 Fifth International Conference on Communication Systems and Network Technologies},
  title={Development of FPGA Toolbox for Implementation of Spiking Neural Networks},
  year={2015},
  pages={806-810},
  abstract={Since more and more new findings and principles of intelligence emerge from neuroscience, spiking neural networks have become important topics in the artificial intelligence domain. However, due to the high computational complexity of spiking neural networks, it is difficult to implement them efficiently using software simulation. In this paper a new hardware implementation method is proposed. In order to implement spiking neural networks more simply, efficiently and rapidly, a toolbox, which is composed of components of spiking neural networks, is developed for neuroscientists, computer scientists and electronic engineers to implement and simulate spiking neural networks in hardware. Using the toolbox, a spiking neural network is easy to implement on an FPGA (Field Programmable Gate Array) chip, because the toolbox takes advantage of Xilinx System Generator and works in the Matlab Simulink environment. The graphic user interface enables users to easily design and simulate spiking neural networks on FPGAs and speed up run-time. This paper presents the methodology in the development of the toolbox, and examples are used to show its promising application.},
  keywords={computational complexity;field programmable gate arrays;graphical user interfaces;neural nets;FPGA toolbox;spiking neural networks;computational complexity;artificial intelligence domain;neuroscience;software simulation;hardware implementation method;neuroscientists;computer scientists;electronic engineers;field programmable gate arrays chip;Xilinx System Generator;Matlab Simulink environment;graphic user interface;Neurons;Biological neural networks;Field programmable gate arrays;Computational modeling;Biological system modeling;Generators;Software packages;spiking neural networks;hardware implementation;neuron model;toolbox;FPGA},
  doi={10.1109/CSNT.2015.216},
  month={April}
}
@INPROCEEDINGS{8966187,
  author={R. {Fuchikami} and F. {Issiki}},
  booktitle={2019 IEEE 9th International Conference on Consumer Electronics (ICCE-Berlin)},
  title={Fast and Light-weight Binarized Neural Network Implemented in an FPGA using LUT-based Signal Processing and its Time-domain Extension for Multi-bit Processing},
  year={2019},
  pages={120-121},
  abstract={Fast and small-resourced implementation of a convolutional neural network (CNN) into a field-programmable gate array (FPGA) was realized using a binarized neural network (NN). We propose a set of neuron and network models optimized for fully binarized implementation of general NNs using the look-up-tables (LUTs) in modern FPGAs, which is herein referred to as the sparse-LUT model. Arrayed MNIST data images of more than 40 characters input from a camera were recognized with 92.8% accuracy and classified with colored marks on the characters on organic light-emitting diode (OLED) display images with 1-ms cycle time, < 1.0-ms delay in the LUT-based CNN recognition, and < 2-ms total time delay. In combination with stochastic time-divided signal processing, binarized signals in this model can be extended for processing multi-bit (analogue-like) signals in an oversampling manner with increased recognition accuracies up to 98.6% in MNIST and 58.3% in CIFAR-10 image data sets using CNN. The source codes for the binarized NN core were released and open-sourced.},
  keywords={convolutional neural nets;field programmable gate arrays;image processing;organic light emitting diodes;table lookup;LUT-based CNN recognition;time delay;stochastic time-divided signal processing;binarized signals;processing multibit signals;CIFAR-10 image data sets;binarized NN core;FPGA;LUT-based signal processing;time-domain extension;small-resourced implementation;convolutional neural network;field-programmable gate array;binarized neural network;network models;fully binarized implementation;sparse-LUT model;arrayed MNIST data images;organic light-emitting diode;1-ms cycle time;general NN},
  doi={10.1109/ICCE-Berlin47944.2019.8966187},
  ISSN={2166-6822},
  month={Sep}
}
@INPROCEEDINGS{7351805,
  author={A. {Youssef} and K. {Mohammed} and A. {Nasar}},
  booktitle={2014 4th International Conference on Artificial Intelligence with Applications in Engineering and Technology},
  title={Two Novel Generic, Reconfigurable Neural Network FPGA Architectures},
  year={2014},
  pages={3-7},
  abstract={Two novel generic, scalable, and reconfigurable neural network architectures implemented using field programmable gate arrays (FPGAs) are presented in this paper. Previous implementations of feed-forward neural networks face two major issues: 1) the limited resources available on the FPGA compared to the large number of multiplications required by neural networks, and 2) the limited reusability of the design when applied to neural network applications with different architectures. Our proposed implementations circumvent both issues. The designs' scalability allows the user to program and implement different applications with a variable number of neurons, from one neuron to the maximum number of neurons in any layer; this is performed with programming-like ease and flexibility. A GUI was implemented to allow automatic configuration of the processors for different applications. Finally, propositions for future work are outlined.},
  keywords={feedforward neural nets;field programmable gate arrays;graphical user interfaces;reconfigurable architectures;reconfigurable neural network FPGA architectures;field-programmable gate arrays;feedforward neural-networks;design scalability;neurons;GUI;automatic processor configuration;Neurons;Biological neural networks;Random access memory;Field programmable gate arrays;Clocks;Software;Time series analysis;Field-programmable gate array (FPGA);hard-ware implementation;layer multiplexing;neural networks (NNs)},
  doi={10.1109/ICAIET.2014.11},
  month={Dec}
}
@INPROCEEDINGS{8369336,
  author={Z. {Liu} and S. {Luo} and X. {Xu} and C. {Zhuo}},
  booktitle={2018 China Semiconductor Technology International Conference (CSTIC)},
  title={Cellular Neural Network (CeNN) FPGA implementation using multi-level optimization},
  year={2018},
  pages={1-3},
  abstract={The Cellular Neural Network (CeNN) has been widely adopted in image processing tasks and is considered a powerful paradigm for embedded devices. Recently, digital implementations of CeNNs on FPGA have attracted researchers from both academia and industry due to their high flexibility and short time-to-market. However, most existing implementations are not well optimized to fully utilize the advantages of the FPGA platform, with unnecessary design and computational redundancy that prevents speedup. We propose a multi-level optimization framework for energy-efficient CeNN implementations on FPGAs. In particular, the optimization framework features three levels of optimization: system-, module-, and design-space-level, with a focus on computational redundancy and attainable performance. Experimental results show that our framework can achieve an energy efficiency improvement of 3.54× and up to 3.88× speedup compared with existing implementations.},
  keywords={cellular neural nets;field programmable gate arrays;image processing;optimisation;design-space-level;image processing tasks;embedded devices;multilevel optimization framework;energy efficient CeNN implementations;cellular neural network FPGA implementation;Optimization;Field programmable gate arrays;Cellular neural networks;Computer architecture;Space exploration;Random access memory;Energy efficiency;FPGA;Cellular neural network;acceleration},
  doi={10.1109/CSTIC.2018.8369336},
  month={March}
}
@INPROCEEDINGS{6927383,
  author={M. {Pietras}},
  booktitle={2014 24th International Conference on Field Programmable Logic and Applications (FPL)},
  title={Hardware conversion of neural networks simulation models for neural processing accelerator implemented as FPGA-based SoC},
  year={2014},
  pages={1-4},
  abstract={The transition from a neural network simulation model to its hardware representation is a complex process, which touches computation precision, performance and effective architecture implementation issues. The presented neural processing accelerator involves neural network sectioning, precision reduction and weight coefficient parsing (arrangements) in order to increase efficiency and maximize FPGA hardware resource utilization. Particular attention has been devoted to ANN conversion methods designed for a system based on neural processing units, and to the redundant calculations and empty-neuron generation related to this process. In addition, this paper describes the FPGA-based Neural Processing Accelerator architecture benchmark for a real example implementation of a pattern recognition neural network.},
  keywords={field programmable gate arrays;neural nets;pattern recognition;system-on-chip;hardware conversion;neural network simulation model;neural processing accelerator;FPGA-based SoC;hardware representation;neural network sectioning;precision reduction;weight coefficient parsing;FPGA hardware resource utilization maximization;ANN conversion method;empty neuron generation;FPGA-based neural processing accelerator architecture benchmark;pattern recognition neural network;Biological neural networks;Neurons;Hardware;Computational modeling;Artificial neural networks;Field programmable gate arrays;Computer architecture;FPGA;NPU;Accelerator Architecture;Kintex 7;Hardware Neural Network;Reduced floating-point;Matlab ANN Toolbox;Redundancy},
  doi={10.1109/FPL.2014.6927383},
  ISSN={1946-1488},
  month={Sep}
}
@ARTICLE{8412552,
  author={L. {Gong} and C. {Wang} and X. {Li} and H. {Chen} and X. {Zhou}},
  journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  title={MALOC: A Fully Pipelined FPGA Accelerator for Convolutional Neural Networks With All Layers Mapped on Chip},
  year={2018},
  volume={37},
  number={11},
  pages={2601-2612},
  abstract={Recently, field-programmable gate arrays (FPGAs) have been widely used in the implementations of hardware accelerators for convolutional neural networks (CNNs). However, most of these existing accelerators are designed in the same idea as their ASIC counterparts, in which all operations from different layers are mapped to the same hardware units working in a multiplexed way. This manner does not take full advantage of the reconfigurability and customizability of FPGAs, resulting in a certain degree of computational efficiency degradation. In this paper, we propose a new architecture for FPGA-based CNN accelerators that maps all the layers to their own on-chip units working concurrently as a pipeline. A comprehensive mapping and optimizing methodology based on establishing a roofline-model-oriented optimization model is proposed, which can achieve maximum resource utilization as well as optimal computational efficiency. Besides, to ease the programming burden, we propose a design framework which can provide a one-stop function for developers to generate the accelerator with our optimizing methodology. We evaluate our proposal by implementing different modern CNN models on Xilinx Zynq-7020 and Virtex-7 690t FPGA platforms. Experimental results show that our implementations can achieve a peak performance of 910.2 GOPS on Virtex-7 690t, and 36.36 GOP/s/W energy efficiency on Zynq-7020, which are superior to the previous approaches.},
  keywords={application specific integrated circuits;convolution;feedforward neural nets;field programmable gate arrays;optimisation;fully pipelined FPGA accelerator;convolutional neural networks;field-programmable gate arrays;hardware accelerator;hardware units;computational efficiency degradation;FPGA-based CNN accelerator;on-chip units;comprehensive mapping;optimizing methodology;roofline model;optimization model;maximum resource utilization;optimal computational efficiency;energy efficiency;CNN models;MALOC;ASIC;Field programmable gate arrays;Hardware;Computational modeling;System-on-chip;Computer architecture;Pipelines;Optimization;Convolutional neural network (CNN);design space exploration (DSE);field-programmable gate array (FPGA)-based accelerator;pipeline;programming framework;redundancy elimination},
  doi={10.1109/TCAD.2018.2857078},
  ISSN={1937-4151},
  month={Nov}
}
@INPROCEEDINGS{7045812,
  author={Y. {Qi} and B. {Zhang} and T. M. {Taha} and H. {Chen} and R. {Hasan}},
  booktitle={NAECON 2014 - IEEE National Aerospace and Electronics Conference},
  title={FPGA design of a multicore neuromorphic processing system},
  year={2014},
  pages={255-258},
  abstract={Interest in specialized neuromorphic computing architectures has been increasing recently, and several applications have been shown to be capable of being accelerated on such a platform. This paper describes the implementation of a multicore digital neuromorphic processing system on an Altera Quartus II FPGA. Static routing was used to allow communication between the cores on the FPGA. Two applications were mapped to the system: image edge detection and ECG. Compared to an Intel processor implementation of these applications, the FPGA-based neural implementations provided about 3× and 127× speedup for the edge detection and ECG applications, respectively. Given that both applications were implemented with the same base Verilog code, with only a change in the synaptic weights and number of neurons utilized, the system has the capability to accelerate a broad range of applications.},
  keywords={field programmable gate arrays;multiprocessing systems;network routing;FPGA design;multicore digital neuromorphic processing;neuromorphic computing architecture;Altera Quartus II FPGA;static routing;image edge detection;ECG;Verilog code;synaptic weight;neurons;Multicore processing;Neurons;Routing;Field programmable gate arrays;Neuromorphics;Biological neural networks;Image edge detection;FPGA;neuromorphic;multicore},
  doi={10.1109/NAECON.2014.7045812},
  ISSN={2379-2027},
  month={June}
}
@ARTICLE{8501526,
  author={F. J. V. {Caballero} and D. J. {Ives} and C. {Laperle} and D. {Charlton} and Q. {Zhuge} and M. {O'Sullivan} and S. J. {Savory}},
  journal={IEEE/OSA Journal of Optical Communications and Networking},
  title={Machine learning based linear and nonlinear noise estimation},
  year={2018},
  volume={10},
  number={10},
  pages={D42-D51},
  abstract={Operators are pressured to maximize the achieved capacity over deployed links. This can be obtained by operating in the weakly nonlinear regime, requiring a precise understanding of the transmission conditions. Ideally, optical transponders should be capable of estimating the regime of operation from the received signal and feeding that information to the upper management layers to optimize the transmission characteristics; however, this estimation is challenging. This paper addresses this problem by estimating the linear and nonlinear signal-to-noise ratio (SNR) from the received signal. This estimation is performed by obtaining features of two distinct effects: nonlinear phase noise and second-order statistical moments. A small neural network is trained to estimate the SNRs from the extracted features. Over extensive simulations covering 19,800 sets of realistic fiber transmissions, we verified the accuracy of the proposed techniques. Employing both approaches simultaneously gave measured performances of 0.04 and 0.20 dB of standard error for the linear and nonlinear SNRs, respectively.},
  keywords={feature extraction;higher order statistics;learning (artificial intelligence);neural nets;optical fibre communication;optical information processing;optical links;optical noise;phase noise;transponders;transmission conditions;optical transponders;received signal;nonlinear signal-to-noise ratio;nonlinear phase noise;second-order statistical moments;machine learning;nonlinear noise estimation;fiber transmissions;nonlinear SNR;deployed link capacity;linear noise estimation;transmission characteristics optimization;neural network training;feature extraction;Optical fiber networks;Measurement;Signal to noise ratio;Estimation;Phase noise;Mathematical model;Coherent communications;Machine learning;Metrology;Optical performance monitoring},
  doi={10.1364/JOCN.10.000D42},
  ISSN={1943-0639},
  month={Oct}
}
@INPROCEEDINGS{8823487,
  author={D. E. {Ipatov} and A. V. {Zverev}},
  booktitle={2019 20th International Conference of Young Specialists on Micro/Nanotechnologies and Electron Devices (EDM)},
  title={Development of Neuromorphic Accelerator},
  year={2019},
  pages={720-725},
  abstract={The paper shows the design of a neuromorphic accelerator, as well as unified design solutions for creating a scalable modular system of neuromorphic accelerators. A neuromorphic accelerator based on an FPGA was developed. It allows simulating up to 131 thousand neurons with 67 million synaptic connections. The neuromorphic cross-connection board, which is the universal platform for working with the neuromorphic accelerators and the neural networks that they simulate, was developed. One cross-connection board allows placing up to 16 neuromorphic accelerators, which makes possible a simulation of up to 2 million neurons with a total number of synaptic connections of up to 1 billion. An energy consumption analysis was made for every developed device.},
  keywords={field programmable gate arrays;neural chips;neuromorphic accelerator;neuromorphic cross-connection board;scalable modular system;synaptic connections;FPGA;unified design solutions;Neuromorphics;Neurons;Field programmable gate arrays;Biological neural networks;Backplanes;Computer architecture;Neuromorphic;neuromorphic architecture;neural networks;accelerator;machine learning},
  doi={10.1109/EDM.2019.8823487},
  ISSN={2325-419X},
  month={June}
}
@ARTICLE{8693488,
  author={C. {Lammie} and A. {Olsen} and T. {Carrick} and M. {Rahimi Azghadi}},
  journal={IEEE Access},
  title={Low-Power and High-Speed Deep FPGA Inference Engines for Weed Classification at the Edge},
  year={2019},
  volume={7},
  pages={51171-51184},
  abstract={Deep neural networks (DNNs) have recently achieved remarkable performance in a myriad of applications, ranging from image recognition to language processing. Training such networks on graphics processing units (GPUs) currently offers unmatched levels of performance; however, GPUs are subject to large power requirements. With recent advancements in high-level synthesis (HLS) techniques, new methods for accelerating deep networks using field programmable gate arrays (FPGAs) are emerging. FPGA-based DNNs present substantial advantages in energy efficiency over conventional CPU- and GPU-accelerated networks. Using the Intel FPGA software development kit (SDK) for OpenCL development environment, networks described using the high-level OpenCL framework can be accelerated targeting heterogeneous platforms including CPUs, GPUs, and FPGAs. These networks, if properly customized on GPUs and FPGAs, can be ideal candidates for learning and inference in resource-constrained portable devices such as robots and the Internet of Things (IoT) edge devices, where power is limited and performance is critical. Here, we introduce GPU- and FPGA-accelerated deterministically binarized DNNs, tailored toward weed species classification for robotic weed control. Our developed networks are trained and benchmarked using a publicly available weed species dataset, named DeepWeeds, which includes close to 18 000 weed images. We demonstrate that our FPGA-accelerated binarized networks significantly outperform their GPU-accelerated counterparts, achieving a >7-fold decrease in power consumption, while performing inference on weed images 2.86 times faster compared to our best performing baseline full-precision GPU implementation. These significant benefits are gained whilst losing only 1.17% of validation accuracy. This is a significant step toward enabling deep inference and learning on IoT edge devices, and smart portable machines such as agricultural robots, which is the target application.},
  keywords={agriculture;field programmable gate arrays;graphics processing units;image classification;industrial robots;inference mechanisms;learning (artificial intelligence);low-power electronics;neural nets;GPU-accelerated networks;Intel FPGA software development kit;OpenCL development environment;high-level OpenCL framework;resource-constrained portable devices;weed species classification;robotic weed control;publicly available weed species dataset;weed images;FPGA-accelerated binarized networks;power consumption;deep inference;IoT edge devices;high-speed deep FPGA inference engines;weed classification;deep neural networks;image recognition;language processing;unmatched levels;large-power requirements;high-level synthesis techniques;deep networks;field programmable gate arrays;FPGA-based DNNs;energy efficiency;baseline full-precision GPU implementation;Field programmable gate arrays;Training;Graphics processing units;Acceleration;Robot kinematics;Engines;Machine learning (ML);deep neural networks (DNNs);convolutional neural networks (CNNs);binarized neural networks (BNNs);Internet of Things (IoT);field programmable gate arrays (FPGAs);high-level synthesis (HLS);weed classification},
  doi={10.1109/ACCESS.2019.2911709},
  ISSN={2169-3536}
}