@article {1442846, title = {Air Learning: An AI Research Platform for Algorithm-Hardware Benchmarking of Autonomous Aerial Robots}, journal = {Springer Machine Learning Journal}, number = {Special Issue on Reinforcement Learning for Real Life}, year = {Forthcoming}, abstract = {We introduce Air Learning, an AI research platform for benchmarking algorithm-hardware performance and energy efficiency trade-offs. We focus in particular on deep reinforcement learning (RL) interactions in autonomous unmanned aerial vehicles (UAVs). Equipped with a random environment generator, AirLearning exposes a UAV to a diverse set of challenging scenarios. Users can specify a task, train different RL policies and evaluate their performance and energy efficiency on a variety of hardware platforms. To show how Air Learning can be used, we seed it with Deep Q Networks (DQN) and Proximal Policy Optimization (PPO) to solve a point-to-point obstacle avoidance task in three different environments, generated using our configurable environment generator. We train the two algorithms using curriculum learning and non-curriculum-learning. Air Learning assesses the trained policies{\textquoteright} performance, under a variety of quality-of-flight (QoF) metrics, such as the energy consumed, endurance and the average trajectory length, on resource-constrained embedded platforms like a Ras-Pi. We find that the trajectories on an embedded Ras-Pi are vastly different from those predicted on a high-end desktop system, resulting in up to 79.43\% longer trajectories in one of the environments. To understand the source of such differences, we use Air Learning to artificially degrade desktop performance to mimic what happens on a low-end embedded system. QoF metrics with hardware-in-the-loop characterize those differences and expose how the choice of onboard compute affects the aerial robot{\textquoteright}s performance. We also conduct reliability studies to demonstrate how Air Learning can help understand how sensor failures affect the learned policies. All put together, Air Learning enables a broad class of RL studies on UAVs. More information and code for Air Learning can be found here.}, url = {https://arxiv.org/abs/1906.00421v4}, author = {Krishnan, Srivatsan and Boroujerdian, Behzad and William Fu and Faust, Aleksandra and Reddi, Vijay Janapa} } @article {1593444, title = {Accelerating Robot Dynamics Gradients on a CPU, GPU, and FPGA}, journal = {IEEE Robotics and Automation Letters}, volume = {6}, number = {2}, year = {2021}, pages = {2335-2342}, abstract = {Computing the gradient of rigid body dynamics is a central operation in many state-of-the-art planning and control algorithms in robotics. Parallel computing platforms such as GPUs and FPGAs can offer performance gains for algorithms with hardware-compatible computational structures. In this letter, we detail the designs of three faster than state-of-the-art implementations of the gradient of rigid body dynamics on a CPU, GPU, and FPGA. Our optimized FPGA and GPU implementations provide as much as a 3.0x end-to-end speedup over our optimized CPU implementation by refactoring the algorithm to exploit its computational features, e.g., parallelism at different granularities. We also find that the relative performance across hardware platforms depends on the number of parallel gradient evaluations required.}, url = {https://ieeexplore.ieee.org/abstract/document/9350173}, author = {Brian Plancher and Sabrina M. 
Neuman and Thomas Bourgeat and Scott Kuindersma and Srinivas Devadas and Reddi, Vijay Janapa} } @conference {1593451, title = {ActorQ: Quantization for Actor-Learner Distributed Reinforcement Learning}, booktitle = {ICLR}, year = {2021}, address = {Hardware Aware Efficient Training Workshop at ICLR 2021, Virtual, May 7}, abstract = {In this paper, we introduce a novel Reinforcement Learning (RL) training paradigm, ActorQ, for speeding up actor-learner distributed RL training. ActorQ leverages full precision optimization on the learner, and distributed data collection through lower-precision quantized actors. The quantized, 8-bit (or 16-bit) inference on actors speeds up data collection without affecting convergence. The quantized distributed RL training system, ActorQ, demonstrates end-to-end speedups of \> 1.5{\texttimes}-2.5{\texttimes} and faster convergence over full precision training on a range of tasks (Deepmind Control Suite) and different RL algorithms (D4PG, DQN). Finally, we break down the various runtime costs of distributed RL training (such as communication time, inference time, model load time, etc.) and evaluate the effects of quantization on these system attributes.}, author = {Maximilian Lam and Sharad Chitlangia and Krishnan, Srivatsan and Zishen Wan and Gabriel Barth-Maron and Faust, Aleksandra and Reddi, Vijay Janapa} } @conference {1593453, title = {AI Tax in Mobile SoCs: End-to-end Performance Analysis of Machine Learning in Smartphones}, booktitle = {2021 IEEE International Symposium on Performance Analysis of Systems and Software}, year = {2021}, publisher = {IEEE}, organization = {IEEE}, address = {ISPASS {\textquoteright}21, Virtual, Stony Brook, NY, March 28-30}, abstract = {Mobile software is becoming increasingly feature-rich, commonly being accessorized with the powerful decision-making capabilities of machine learning (ML). To keep up with the consequently higher power and performance demands, system and hardware architects add specialized hardware units onto their system-on-chips (SoCs) coupled with frameworks to delegate compute optimally. While these SoC innovations are rapidly improving ML model performance and power efficiency, auxiliary data processing and supporting infrastructure to enable ML model execution can substantially alter the performance profile of a system. This work posits the existence of an AI tax, the time spent on non-model execution tasks. We characterize the execution pipeline of open source ML benchmarks and Android applications in terms of AI tax and discuss where performance bottlenecks may unexpectedly arise.}, url = {https://ieeexplore.ieee.org/abstract/document/9408206}, author = {Michael Buch and Zahra Azad and Joshi, Ajay and Reddi, Vijay Janapa} } @article {richins2020ai, title = {AI Tax: The Hidden Cost of AI Data Center Applications}, journal = {ACM Transactions on Computer Systems (TOCS)}, volume = {37}, number = {1-4}, year = {2021}, pages = {1-32}, abstract = {Artificial intelligence and machine learning are experiencing widespread adoption in industry and academia. This has been driven by rapid advances in the applications and accuracy of AI through increasingly complex algorithms and models; this, in turn, has spurred research into specialized hardware AI accelerators. Given the rapid pace of advances, it is easy to forget that they are often developed and evaluated in a vacuum without considering the full application environment.
This article emphasizes the need for a holistic, end-to-end analysis of artificial intelligence (AI) workloads and reveals the {\textquotedblleft}AI tax.{\textquotedblright} We deploy and characterize Face Recognition in an edge data center. The application is an AI-centric edge video analytics application built using popular open source infrastructure and machine learning (ML) tools. Despite using state-of-the-art AI and ML algorithms, the application relies heavily on pre- and post-processing code. As AI-centric applications benefit from the acceleration promised by accelerators, we find they impose stresses on the hardware and software infrastructure: storage and network bandwidth become major bottlenecks with increasing AI acceleration. By specializing for AI applications, we show that a purpose-built edge data center can be designed for the stresses of accelerated AI at 15\% lower TCO than one derived from homogeneous servers and infrastructure.}, url = {https://dl.acm.org/doi/10.1145/3440689}, author = {Richins, Daniel and Dharmisha Doshi and Matthew Blackmore and Aswathy Thulaseedharan Nair and Neha Pathapati and Ankit Patel and Brainard Daguman and Daniel Dobrijalowski and Ramesh Illikkal and Kevin Long and David Zimmerman and Reddi, Vijay Janapa} } @article {1595404, title = {Data Engineering for Everyone}, year = {2021}, abstract = {Data engineering is one of the fastest-growing fields within machine learning (ML). As ML becomes more common, the appetite for data grows more ravenous. But ML requires more data than individual teams of data engineers can readily produce, which presents a severe challenge to ML deployment at scale. Much like the software-engineering revolution, where mass adoption of open-source software replaced the closed, in-house development model for infrastructure code, there is a growing need to enable rapid development and open contribution to massive machine learning data sets. This article shows that open-source data sets are the rocket fuel for research and innovation at even some of the largest AI organizations. Our analysis of nearly 2000 research publications from Facebook, Google and Microsoft over the past five years shows the widespread use and adoption of open data sets. Open data sets that are easily accessible to the public are vital to accelerate ML innovation for everyone. But such open resources are scarce in the wild. So, can we accelerate data set creation and enable the rapid development of open data sets, akin to the rapid development of open-source software? Moreover, can we develop automatic data set generation frameworks and tools to avert the data scarcity crisis?}, url = {https://arxiv.org/abs/2102.11447}, author = {Reddi, Vijay Janapa and Greg Diamos and Pete Warden and Peter Mattson and David Kanter} } @conference {1595406, title = {Few-Shot Keyword Spotting in Any Language}, booktitle = {INTERSPEECH 2021}, year = {2021}, address = {Virtual, Brno, Czech Republic}, abstract = {We introduce a few-shot transfer learning method for keyword spotting in any language. Leveraging open speech corpora in nine languages, we automate the extraction of a large multilingual keyword bank and use it to train an embedding model. With just five training examples, we fine-tune the embedding model for keyword spotting and achieve an average F1 score of 0.75 on keyword classification for 180 new keywords unseen by the embedding model in these nine languages. This embedding model also generalizes to new languages.
We achieve an average F1 score of 0.65 on 5-shot models for 260 keywords sampled across 13 new languages unseen by the embedding model. We investigate streaming accuracy for our 5-shot models in two contexts: keyword spotting and keyword search. Across 440 keywords in 22 languages, we achieve an average streaming keyword spotting accuracy of 85.2\% with a false acceptance rate of 1.2\%, and observe promising initial results on keyword search.}, url = {https://arxiv.org/abs/2104.01454}, author = {Mark Mazumder and Colby Banbury and Josh Meyer and Pete Warden and Reddi, Vijay Janapa} } @conference {1606280, title = {GLADAS: Gesture Learning for Advanced Driver Assistance Systems}, booktitle = {IEEE International Conference on Autonomous Systems}, year = {2021}, address = {ICAS 2021, Montr{\'e}al, Qu{\'e}bec, Canada, August 11-13, 2021}, abstract = {Human-computer interaction (HCI) is crucial for safety as autonomous vehicles (AVs) become commonplace. Yet, little effort has been put toward ensuring that AVs understand human communications on the road. In this paper, we present Gesture Learning for Advanced Driver Assistance Systems (GLADAS), a deep learning-based self-driving car hand gesture recognition system developed and evaluated using virtual simulation. We focus on gestures as they are a natural and common way for pedestrians to interact with drivers. We challenge the system to perform in typical, everyday driving interactions with humans. Our results provide a baseline performance of 94.56\% accuracy and 85.91\% F1 score, promising statistics that surpass human performance and motivate the need for further research into human-AV interaction.}, url = {https://arxiv.org/abs/1910.04695}, author = {Ethan Shaotran and Jonathan J. Cruz and Reddi, Vijay Janapa} } @article {1595417, title = {Learning to Seek: Autonomous Source Seeking with Deep Reinforcement Learning Onboard a Nano Drone Microcontroller}, year = {2021}, abstract = {Fully autonomous navigation using nano drones has numerous applications in the real world, ranging from search and rescue to source seeking. Nano drones are well-suited for source seeking because of their agility, low price, and ubiquitous character. Unfortunately, their constrained form factor limits flight time, sensor payload, and compute capability. These challenges are a crucial limitation for the use of source-seeking nano drones in GPS-denied and highly cluttered environments. Hereby, we introduce a fully autonomous deep reinforcement learning-based light-seeking nano drone. The 33-gram nano drone performs all computation on-board the ultra-low-power microcontroller (MCU). We present the method for efficiently training, converting, and utilizing deep reinforcement learning policies. Our training methodology and novel quantization scheme allow fitting the trained policy in 3 kB of memory. The quantization scheme uses representative input data and input scaling to arrive at a full 8-bit model. Finally, we evaluate the approach in simulation and flight tests using a Bitcraze CrazyFlie, achieving an 80\% success rate on average in a highly cluttered and randomized test environment. Even more, the drone finds the light source in 29\% fewer steps compared to a baseline simulation (obstacle avoidance without source information). To our knowledge, this is the first deep reinforcement learning method that enables source seeking within a highly constrained nano drone demonstrating robust flight behavior.
Our general methodology is suitable for any (source seeking) highly constrained platform using deep reinforcement learning. Code \& video: https://github.com/harvard-edge/source-seeking}, url = {https://arxiv.org/abs/1909.11236v6}, author = {Bardienus P. Duisterhof and Krishnan, Srivatsan and Jonathan J. Cruz and Colby R. Banbury and William Fu and Faust, Aleksandra and Guido C. H. E. de Croon and Reddi, Vijay Janapa} } @conference {1624902, title = {PrecisionBatching: Bitserial Decomposition for Efficient Neural Network Inference on GPUs}, booktitle = {The 30th International Conference on Parallel Architectures and Compilation Techniques}, year = {2021}, publisher = {IEEE}, organization = {IEEE}, address = {presented at PACT 2021, virtual, September 26-29}, abstract = {We present PrecisionBatching, a quantized inference algorithm for speeding up neural network inference on traditional hardware platforms at low bitwidths. PrecisionBatching is based on the following insights: 1) neural network inference with low batch sizes on traditional hardware architectures (e.g., GPUs) is memory bound, 2) activation precision is critical to improving quantized model quality, and 3) matrix-vector multiplication can be decomposed into binary matrix-matrix multiplications, enabling quantized inference with higher precision activations at the cost of more arithmetic operations. Combining these three insights, PrecisionBatching enables inference at extreme quantization levels (\< 8 bits) by shifting a memory bound problem to a compute bound problem and achieves higher compute efficiency and runtime speedup at fixed accuracy thresholds against standard quantized inference methods. Across a variety of applications (MNIST, language modeling, natural language inference, reinforcement learning) and neural network architectures (fully connected, RNN, LSTM), PrecisionBatching yields end-to-end speedups of over 8x on a GPU within a \< 1-5\% error margin of the full precision baseline, outperforming traditional 8-bit quantized inference by over 1.5x-2x at the same error tolerance.}, url = {https://www.computer.org/csdl/proceedings-article/pact/2021/427800a129/1xNNsIrPWNy}, author = {Maximilian Lam and Zachary Yedidia and Colby Banbury and Reddi, Vijay Janapa} } @proceedings {1592077, title = {Robomorphic computing: a design methodology for domain-specific accelerators parameterized by robot morphology}, journal = {Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {2021}, pages = {674-686}, publisher = {ACM}, address = {ASPLOS {\textquoteright}21, April 19-23, 2021, Virtual, New York, NY}, abstract = {Robotics applications have hard time constraints and heavy computational burdens that can greatly benefit from domain-specific hardware accelerators. For the latency-critical problem of robot motion planning and control, there exists a performance gap of at least an order of magnitude between joint actuator response rates and state-of-the-art software solutions. Hardware acceleration can close this gap, but it is essential to define automated hardware design flows to keep the design process agile as applications and robot platforms evolve. To address this challenge, we introduce robomorphic computing: a methodology to transform robot morphology into a customized hardware accelerator morphology.
We (i) present this design methodology, using robot topology and structure to exploit parallelism and matrix sparsity patterns in accelerator hardware; (ii) use the methodology to generate a parameterized accelerator design for the gradient of rigid body dynamics, a key kernel in motion planning; (iii) evaluate FPGA and synthesized ASIC implementations of this accelerator for an industrial manipulator robot; and (iv) describe how the design can be automatically customized for other robot models. Our FPGA accelerator achieves speedups of 8{\texttimes} and 86{\texttimes} over CPU and GPU when executing a single dynamics gradient computation. It maintains speedups of 1.9{\texttimes} to 2.9{\texttimes} over CPU and GPU, including computation and I/O round-trip latency, when deployed as a coprocessor to a host CPU for processing multiple dynamics gradient computations. ASIC synthesis indicates an additional 7.2{\texttimes} speedup for single computation latency. We describe how this principled approach generalizes to more complex robot platforms, such as quadrupeds and humanoids, as well as to other computational kernels in robotics, outlining a path forward for future robomorphic computing accelerators.}, author = {Sabrina M. Neuman and Brian Plancher and Thomas Bourgeat and Thierry Tambe and Srinivas Devadas and Reddi, Vijay Janapa} } @conference {1606275, title = {Sniffy Bug: A Fully Autonomous Swarm of Gas-Seeking Nano Quadcopters in Cluttered Environments}, booktitle = {International Conference on Intelligent Robots and Systems}, year = {2021}, address = {IROS 2021, Prague, Czech Republic (Virtual)}, abstract = { Nano quadcopters are ideal for gas source localization (GSL) as they are safe, agile and inexpensive. However, their extremely restricted sensors and computational resources make GSL a daunting challenge. We propose a novel bug algorithm named {\textquoteleft}Sniffy Bug{\textquoteright}, which allows a fully autonomous swarm of gas-seeking nano quadcopters to localize a gas source in unknown, cluttered, and GPS-denied environments. The computationally efficient, mapless algorithm foresees in the avoidance of obstacles and other swarm members, while pursuing desired waypoints. The waypoints are first set for exploration, and, when a single swarm member has sensed the gas, by a particle swarm optimization-based (PSO) procedure. We evolve all the parameters of the bug (and PSO) algorithm using our novel simulation pipeline, {\textquoteleft}AutoGDM{\textquoteright}. It builds on and expands open source tools in order to enable fully automated end-to-end environment generation and gas dispersion modeling, allowing for learning in simulation. Flight tests show that Sniffy Bug with evolved parameters outperforms manually selected parameters in cluttered, real-world environments. Videos: https://bit.ly/37MmtdL }, url = {https://arxiv.org/abs/2107.05490}, author = {Bardienus P. Duisterhof and Shushuai Li and Javier Burgu{\'e}s and Reddi, Vijay Janapa and Guido C. H. E. de Croon} } @article {tambe2020adaptivfloat, title = {AdaptivFloat: A Floating-point based Data Type for Resilient Deep Learning Inference}, journal = {arXiv preprint arXiv:1909.13271}, year = {2020}, abstract = {Conventional hardware-friendly quantization methods, such as fixed-point or integer, tend to perform poorly at very low word sizes as their shrinking dynamic ranges cannot adequately capture the wide data distributions commonly seen in sequence transduction models. 
We present AdaptivFloat, a floating-point inspired number representation format for deep learning that dynamically maximizes and optimally clips its available dynamic range, at a layer granularity, in order to create faithful encoding of neural network parameters. AdaptivFloat consistently produces higher inference accuracies compared to block floating-point, uniform, IEEE-like float or posit encodings at very low precision (<= 8-bit) across a diverse set of state-of-the-art neural network topologies. And notably, AdaptivFloat is seen surpassing baseline FP32 performance by up to +0.3 in BLEU score and -0.75 in word error rate at weight bit widths that are <= 8-bit. Experimental results on a deep neural network (DNN) hardware accelerator, exploiting AdaptivFloat logic in its computational datapath, demonstrate per-operation energy and area that is 0.9{\texttimes} and 1.14{\texttimes}, respectively, that of equivalent bit width integer-based accelerator variants.}, url = {https://arxiv.org/pdf/1909.13271.pdf}, author = {Thierry Tambe and En-Yu Yang and Zishen Wan and Yuntian Deng and Reddi, Vijay Janapa and Alexander Rush and David Brooks and Gu-Yeon Wei} } @conference {1594782, title = {Algorithm-Hardware Co-Design of Adaptive Floating-Point Encodings for Resilient Deep Learning Inference}, booktitle = {2020 57th ACM/IEEE Design Automation Conference}, year = {2020}, pages = {1-6}, publisher = {IEEE}, organization = {IEEE}, address = {DAC {\textquoteright}20, July 20-24, Virtual, San Francisco, CA}, abstract = {Conventional hardware-friendly quantization methods, such as fixed-point or integer, tend to perform poorly at very low precision as their shrunken dynamic ranges cannot adequately capture the wide data distributions commonly seen in sequence transduction models. We present an algorithm-hardware co-design centered around a novel floating-point inspired number format, AdaptivFloat, that dynamically maximizes and optimally clips its available dynamic range, at a layer granularity, in order to create faithful encodings of neural network parameters. AdaptivFloat consistently produces higher inference accuracies compared to block floating-point, uniform, IEEE-like float or posit encodings at low bit precision (<=8-bit) across a diverse set of state-of-the-art neural networks, exhibiting narrow to wide weight distribution. Notably, at 4-bit weight precision, only a 2.1 degradation in BLEU score is observed on the AdaptivFloat-quantized Transformer network compared to total accuracy loss when encoded in the above-mentioned prominent datatypes. 
Furthermore, experimental results on a deep neural network (DNN) processing element (PE), exploiting AdaptivFloat logic in its computational datapath, demonstrate per-operation energy and area that is 0.9{\texttimes} and 1.14{\texttimes}, respectively, that of an equivalent bit-width NVDLA-like integer-based PE.}, url = {https://ieeexplore.ieee.org/abstract/document/9218516}, author = {Thierry Tambe and En-Yu Yang and Zishen Wan and Yuntian Deng and Reddi, Vijay Janapa and Alexander Rush and David Brooks and Gu-Yeon Wei} } @conference {9065577, title = {Asymmetric Resilience: Exploiting Task-Level Idempotency for Transient Error Recovery in Accelerator-Based Systems}, booktitle = {2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)}, year = {2020}, month = {Feb}, pages = {44-57}, abstract = {Accelerators make it hard to build systems that are resilient against transient errors like voltage noise and soft errors. Architects integrate accelerators into the system as black box third-party IP components. So a fault in one or more accelerators may threaten the system{\textquoteright}s reliability if there are no established failure semantics for how an error propagates from the accelerator to the main CPU. Existing solutions that assure system reliability sacrifice accelerator generality and efficiency, and incur significant overhead, even in the absence of errors. To overcome these drawbacks, we examine reliability management of accelerator systems via hardware-software co-design, coupling an efficient architecture design with compiler and run-time support, to cope with transient errors. We introduce asymmetric resilience, which architects reliability at the system level, centered around a hardened CPU, rather than at the accelerator level. At runtime, the system exploits task-level idempotency to contain accelerator errors and uses memory protection instead of taking checkpoints to mitigate overheads. We also leverage the fact that errors rarely occur in systems, and exploit the trade-off between error recovery performance and improved error-free performance to enhance system efficiency. Using GPUs, which are at the forefront of accelerator systems, we demonstrate how our system architecture manages reliability in both integrated and discrete systems, under voltage-noise and soft-error related faults, leading to extremely low overhead (less than 1\%) and substantial gains (20\% energy savings on average).}, keywords = {Acceleration, accelerator errors, accelerator generality, accelerator level, accelerator systems, accelerator-based systems, asymmetric resilience, black box third-party IP components, checkpointing, Computer architecture, discrete systems, embedded systems, error-free performance, multiprocessing systems, power aware computing, Reliability, reliability management, resilience, Runtime, soft-error related faults, system recovery, system reliability, Task analysis, task-level idempotency, Transient analysis, transient error recovery, voltage-noise}, issn = {2378-203X}, doi = {10.1109/HPCA47549.2020.00014}, author = {J. Leng and A. Buyuktosunoglu and R. Bertran and P. Bose and Q. Chen and M. Guo and V. Janapa Reddi} } @booklet {banbury2020benchmarking, title = {Benchmarking TinyML Systems: Challenges and Direction}, year = {2020}, author = {Colby R.
Banbury and Reddi, Vijay Janapa and Lam, Max and William Fu and Amin Fazel and Jeremy Holleman and Xinyuan Huang and Robert Hurtado and David Kanter and Anton Lokhmotov and David Patterson and Danilo Pau and Jae-sun Seo and Jeff Sieracki and Urmish Thakker and Marian Verhelst and Poonam Yadav} } @conference {1594785, title = {An end-to-end RISC-V solution for ML on the edge using in-pipeline support}, booktitle = {Boston Area Architecture (BARC) Workshop}, year = {2020}, abstract = {Machine Learning (ML) is widely used today in many mobile applications. To preserve user privacy, there is a need to perform ML inference on the mobile devices. Given that ML inference is a computationally intensive task, the common technique used in mobile devices is offloading the task to a neural accelerator. However, the speed-up gained from offloading these tasks on the accelerators is limited by the overhead of frequent host-accelerator communication. In this paper, we propose a complete end-to-end solution that uses in-pipeline machine learning processing unit for accelerating ML workloads. First we introduce the software infrastructure we developed to support compilation and execution of machine learning models used in TensorFlow Lite framework. Then we discuss the microarchitecture we plan to implement for supporting the execution of our vectorized machine learning kernels.}, url = {http://people.bu.edu/joshi/files/rvmlpu-barc-2020.pdf}, author = {Zahra Azad and Louis, Marcia Sahaya and Leila Delshadtehrani and Anthony Ducimo and Suyog Gupta and Pete Warden and Reddi, Vijay Janapa and Joshi, Ajay} } @article {9076808, title = {Exceeding Conservative Limits: A Consolidated Analysis on Modern Hardware Margins}, journal = {IEEE Transactions on Device and Materials Reliability}, volume = {20}, number = {2}, year = {2020}, month = {June}, pages = {341-350}, abstract = {Modern large-scale computing systems (data centers, supercomputers, cloud and edge setups and high-end cyber-physical systems) employ heterogeneous architectures that consist of multicore CPUs, general-purpose many-core GPUs, and programmable FPGAs. The effective utilization of these architectures poses several challenges, among which a primary one is power consumption. Voltage reduction is one of the most efficient methods to reduce power consumption of a chip. With the galloping adoption of hardware accelerators (i.e., GPUs and FPGAs) in large datacenters and other large-scale computing infrastructures, a comprehensive evaluation of the safe voltage reduction levels for each different chip can be employed for efficient reduction of the total power. We present a survey of recent studies in voltage margins reduction at the system level for modern CPUs, GPUs and FPGAs. The pessimistic voltage guardbands inserted by the silicon vendors can be exploited in all devices for significant power savings. 
On average, voltage reduction can reach 12\% in multicore CPUs, 20\% in manycore GPUs and 39\% in FPGAs.}, keywords = {6G mobile communication, accelerators, Artificial Intelligence, benchmark testing, comprehensive evaluation, conservative limits, data centers, energy efficiency, FAA, field programmable gate arrays, FPGA, galloping adoption, general-purpose many-core GPUs, Graphics processing units, hardware accelerators, heterogeneous architectures, Iron, large-scale computing infrastructures, many-core GPU, Materials reliability, modern CPUs, modern hardware margins, modern large-scale computing systems, multicore CPU, multicore CPUs, multiprocessing systems, pessimistic voltage guardbands, power consumption, power savings, programmable FPGAs, safe voltage reduction levels, system level, Three-dimensional displays, voltage margin reduction, Voltage margins, voltage margins reduction}, issn = {1558-2574}, doi = {10.1109/TDMR.2020.2989813}, author = {G. Papadimitriou and A. Chatzidimitriou and D. Gizopoulos and V. J. Reddi and J. Leng and B. Salami and O. S. Unsal and A. C. Kestelman} } @booklet {duisterhof2020learning, title = {Learning to Seek: Deep Reinforcement Learning for Phototaxis of a Nano Drone in an Obstacle Field}, year = {2020}, author = {Bardienus P. Duisterhof and Krishnan, Srivatsan and Jonathan J. Cruz and Colby R. Banbury and William Fu and Faust, Aleksandra and Guido C. H. E. de Croon and Reddi, Vijay Janapa} } @conference {1501949, title = {Missing the Forest for the Trees: End-to-End AI Application Performance in Edge Data Centers}, booktitle = {International Symposium on High Performance Computer Architecture (HPCA)}, year = {2020}, abstract = {Artificial intelligence and machine learning are experiencing widespread adoption in industry, academia, and even public consciousness. This has been driven by the rapid advances in the applications and accuracy of AI through increasingly complex algorithms and models; this, in turn, has spurred research into developing specialized hardware AI accelerators. The rapid pace of the advances makes it easy to miss the forest for the trees: they are often developed and evaluated in a vacuum without considering the full application environment in which they must eventually operate. In this paper, we deploy and characterize Face Recognition, an AI-centric edge video analytics application built using open source and widely adopted infrastructure and ML tools. We evaluate its holistic, end-to-end behavior in a production-size edge data center and reveal the {\textquotedblleft}AI tax{\textquotedblright} for all the processing that is involved. Even though the application is built around state-of-the-art AI and ML algorithms, it relies heavily on pre- and post-processing code which must be executed on a general-purpose CPU. As AI-centric applications start to reap the acceleration promised by so many accelerators, we find they impose stresses on the underlying software infrastructure and the data center{\textquoteright}s capabilities: storage and network bandwidth become major bottlenecks with increasing AI acceleration. By not having to serve a wide variety of applications, we show that a purpose-built edge data center can be designed to accommodate the stresses of accelerated AI at 15\% lower TCO than one derived from homogeneous servers and infrastructure.
We also discuss how our conclusions generalize beyond Face Recognition as many AI-centric applications at the edge rely upon the same underlying software and hardware infrastructure.}, author = {Richins, Daniel and Dharmisha Doshi and Matthew Blackmore and Aswathy Thulaseedharan Nair and Neha Pathapati and Ankit Patel and Brainard Daguman and Daniel Dobrijalowski and Ramesh Illikkal and Kevin Long and David Zimmerman and Reddi, Vijay Janapa} } @article {9001257, title = {MLPerf: An Industry Standard Benchmark Suite for Machine Learning Performance}, journal = {IEEE Micro}, volume = {40}, number = {2}, year = {2020}, month = {March}, pages = {8-16}, abstract = {In this article, we describe the design choices behind MLPerf, a machine learning performance benchmark that has become an industry standard. The first two rounds of the MLPerf Training benchmark helped drive improvements to software-stack performance and scalability, showing a 1.3{\texttimes} speedup in the top 16-chip results despite higher quality targets and a 5.5{\texttimes} increase in system scale. The first round of MLPerf Inference received over 500 benchmark results from 14 different organizations, showing growing adoption.}, keywords = {benchmark testing, Computational modeling, industry standard benchmark suite, inference mechanisms, learning (artificial intelligence), machine learning, machine learning performance benchmark, Measurement, MLPerf Inference, MLPerf Training benchmark, Numerical models, software-stack performance, Training}, issn = {1937-4143}, doi = {10.1109/MM.2020.2974843}, author = {P. Mattson and V. J. Reddi and Cheng, C. and C. Coleman and G. Diamos and D. Kanter and P. Micikevicius and D. Patterson and G. Schmuelling and Tang, H. and G. Wei and Wu, C.} } @conference {9138989, title = {MLPerf Inference Benchmark}, booktitle = {2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)}, year = {2020}, month = {May}, pages = {446-459}, abstract = {Machine-learning (ML) hardware and software system demand is burgeoning. Driven by ML applications, the number of different ML inference systems has exploded. Over 100 organizations are building ML inference chips, and the systems that incorporate existing models span at least three orders of magnitude in power consumption and five orders of magnitude in performance; they range from embedded devices to data-center solutions. Fueling the hardware are a dozen or more software frameworks and libraries. The myriad combinations of ML hardware and ML software make assessing ML-system performance in an architecture-neutral, representative, and reproducible manner challenging. There is a clear need for industry-wide standard ML benchmarking and evaluation criteria. MLPerf Inference answers that call. In this paper, we present our benchmarking method for evaluating ML inference systems. Driven by more than 30 organizations as well as more than 200 ML engineers and practitioners, MLPerf prescribes a set of rules and best practices to ensure comparability across systems with wildly differing architectures. The first call for submissions garnered more than 600 reproducible inference-performance measurements from 14 organizations, representing over 30 systems that showcase a wide range of capabilities. 
The submissions attest to the benchmark{\textquoteright}s flexibility and adaptability.}, keywords = {benchmark testing, Benchmarking, Inference, inference mechanisms, learning (artificial intelligence), machine learning, machine-learning hardware system, machine-learning software system, ML hardware, ML inference chips, ML inference systems, ML software, ML-system performance, MLPerf inference benchmark, power consumption, software performance evaluation}, doi = {10.1109/ISCA45697.2020.00045}, author = {V. J. Reddi and Cheng, C. and D. Kanter and P. Mattson and G. Schmuelling and Wu, C. and Anderson, B. and M. Breughe and M. Charlebois and W. Chou and R. Chukka and C. Coleman and Davis, S. and P. Deng and G. Diamos and J. Duke and D. Fick and J. S. Gardner and I. Hubara and S. Idgunji and T. B. Jablin and J. Jiao and T. S. John and P. Kanwar and Lee, D. and J. Liao and A. Lokhmotov and F. Massa and P. Meng and P. Micikevicius and C. Osborne and G. Pekhimenko and A. T. R. Rajan and D. Sequeira and A. Sirasao and F. Sun and Tang, H. and M. Thomson and F. Wei and Wu, E. and L. Xu and K. Yamada and Yu, B. and G. Yuan and A. Zhong and P. Zhang and Zhou, Y.} } @booklet {mattson2020mlperf, title = {MLPerf Training Benchmark}, year = {2020}, author = {Peter Mattson and Cheng, Christine and Cody Coleman and Greg Diamos and Paulius Micikevicius and David Patterson and Tang, Hanlin and Gu-Yeon Wei and Bailis, Peter and Victor Bittorf and David Brooks and Dehao Chen and Debojyoti Dutta and Udit Gupta and Hazelwood, Kim and Andrew Hock and Xinyuan Huang and Atsushi Ike and Bill Jia and Daniel Kang and David Kanter and Naveen Kumar and Jeffery Liao and Guokai Ma and Deepak Narayanan and Tayo Oguntebi and Gennady Pekhimenko and Lillian Pentecost and Reddi, Vijay Janapa and Taylor Robie and Tom St. John and Tsuguchika Tabaru and Wu, Carole-Jean and Lingjie Xu and Masafumi Yamazaki and Cliff Young and Matei Zaharia} } @article {9086807, title = {Predictive Guardbanding: Program-driven Timing Margin Reduction for GPUs}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, year = {2020}, pages = {1-1}, abstract = {Energy efficiency of GPU architectures has emerged as an essential aspect of computer system design. In this paper, we explore the energy benefits of reducing the GPU chip{\textquoteright}s voltage to the safe limit, i.e., Vmin point, using predictive software techniques. We perform such a study on several commercial off-the-shelf GPU cards. We find that there exists about 20\% voltage guardband on those GPUs spanning two architectural generations, which, if {\textquotedblleft}eliminated"entirely, can result in up to 25\% energy savings on one of the studied GPU cards. Our measurement results unveil a program dependent Vmin behavior across the studied applications, and the exact improvement magnitude depends on the program{\textquoteright}s available guardband. We make fundamental observations about the program-dependent Vmin behavior. We experimentally determine that the voltage noise has a more substantial impact on Vmin compared to the process and temperature variation, and the activities during the kernel execution cause large voltage droops. From these findings, we show how to use kernels{\textquoteright} microarchitectural performance counters to predict its Vmin value accurately. The average and maximum prediction errors are 0.5\% and 3\%, respectively. 
The accurate Vmin prediction opens up new possibilities of a cross-layer dynamic guardbanding scheme for GPUs, in which software predicts and manages the voltage guardband, while the functional correctness is ensured by a hardware safety net mechanism.}, keywords = {GPU, Graphics processing units, Kernel, Multi-core processors, Power demand, Power measurement, PVT variation., single instruction and multiple data, temperature measurement, Voltage control, voltage guardband, Voltage measurement}, issn = {1937-4151}, doi = {10.1109/TCAD.2020.2992684}, author = {J. Leng and A. Buyuktosunoglu and R. Bertran and P. Bose and Y. Zu and V. J. Reddi} } @booklet {lam2020quantized, title = {Quantized Neural Network Inference with Precision Batching}, year = {2020}, author = {Maximilian Lam and Zachary Yedidia and Colby Banbury and Reddi, Vijay Janapa} } @article {9037294, title = {The Sky Is Not the Limit: A Visual Performance Model for Cyber-Physical Co-Design in Autonomous Machines}, journal = {IEEE Computer Architecture Letters}, volume = {19}, number = {1}, year = {2020}, pages = {38-42}, author = {S. Krishnan and Z. Wan and K. Bhardwaj and P. Whatmough and A. Faust and G. Wei and D. Brooks and V. J. Reddi} } @article {8970359, title = {Voltage-Stacked Power Delivery Systems: Reliability, Efficiency, and Power Management}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, year = {2020}, pages = {1-1}, abstract = {In today{\textquoteright}s manycore processors, energy loss of more than 20\% may result from inherent inefficiencies of conventional power delivery system (PDS) design. By stacking multiple voltage domains in series to lower the step-down conversion ratio of the off-chip voltage regulator module (VRM) and reduce energy loss along the path of the power delivery network (PDN), voltage stacking (VS) offers a novel alternative power delivery technique to fundamentally improve power delivery efficiency (PDE). However, voltage stacking suffers from aggravated supply voltage noise from current imbalance, which hinders its adoption. In this paper, we investigate practical voltage stacking implementation in manycore processors to improve power delivery efficiency (PDE) and achieve reliable performance, while maintaining compatibility with advanced power management techniques. We first present the system configuration of a voltage-stacked manycore processor. We then systematically characterize supply voltage noise in voltage stacking, identify global and residual differential currents as its dominant contributors, and calculate the possible worst supply voltage noise. We next propose a hybrid voltage regulation solution, based on a charge-recycling off-chip voltage regulator and distributed integrated voltage regulators, to mitigate supply voltage noise effectively. We also study the compatibility of voltage stacking with higher level power management techniques. Finally, the performance of a voltage-stacked GPU system is comprehensively evaluated. Simulation results show that our approach can achieve 93.5\% power delivery efficiency, reducing the power loss by 13.6\% compared to conventional single-layer power delivery system.}, keywords = {Energy loss, Handheld computers, Integrated Voltage Regulator., Manycore Architecture, Manycore processors, Power Delivery System, Regulators, Resistance, Stacking, Supply Noise, Voltage control, Voltage Stacking}, issn = {1937-4151}, doi = {10.1109/TCAD.2020.2969607}, author = {A. Zou and J. Leng and X. He and Y. 
Zu and C. D. Gill and V. J. Reddi and Zhang, X.} } @article {1448418, title = {Accelerator-Level Parallelism}, journal = {arXiv}, volume = {arXiv:1907.02064v4 [cs.DC]}, year = {2019}, abstract = { Future applications demand more performance, but technology advances have been faltering. A promising approach to further improve computer system performance under energy constraints is to employ hardware accelerators. Already today, mobile systems concurrently employ multiple accelerators in what we call accelerator-level parallelism (ALP). To spread the benefits of ALP more broadly, we charge computer scientists to develop the science needed to best achieve the performance and cost goals of ALP hardware and software. }, url = {https://arxiv.org/abs/1907.02064}, author = {Mark D. Hill and Reddi, Vijay Janapa} } @article {1431417, title = {Asymmetric Resilience for Accelerator-Rich Systems}, journal = {Computer Architecture Letters}, year = {2019}, abstract = {Accelerators are becoming popular owing to their exceptional performance and power-efficiency. However, researchers have yet to pay close attention to their reliability---a key challenge as technology scaling makes building reliable systems difficult. A straightforward solution to make accelerators reliable is to design the accelerator from the ground-up to be reliable by itself. However, such a myopic view of the system, where each accelerator is designed in isolation, is unsustainable as the number of integrated accelerators continues to rise in SoCs. To address this challenge, we propose a paradigm called {\textquotedblleft}asymmetric resilience{\textquotedblright} that avoids accelerator-specific reliability design. Instead, its core principle is to develop the reliable heterogeneous system around the CPU architecture. We explain the implications of architecting such a system and the modifications needed in a heterogeneous system to adopt such an approach. As an example, we demonstrate how to use asymmetric resilience to handle GPU execution errors using the CPU with minimal overhead. The general principles can be extended to include other accelerators.}, author = {Leng, Jingwen and Buyuktosunoglu, Alper and Bertran, Ramon and Bose, Pradip and Reddi, Vijay Janapa} } @proceedings {1442843, title = {Deep Reinforcement Learning for Cyber Security}, journal = {ArXiv}, year = {2019}, abstract = {The scale of Internet-connected systems has increased considerably, and these systems are being exposed to cyber attacks more than ever. The complexity and dynamics of cyber attacks require protecting mechanisms to be responsive, adaptive, and large-scale. Machine learning, or more specifically deep reinforcement learning (DRL), methods have been proposed widely to address these issues. By incorporating deep learning into traditional RL, DRL is highly capable of solving complex, dynamic, and especially high-dimensional cyber defense problems. This paper presents a survey of DRL approaches developed for cyber security. We touch on different vital aspects, including DRL-based security methods for cyber-physical systems, autonomous intrusion detection techniques, and multi-agent DRL-based game theory simulations for defense strategies against cyber attacks. Extensive discussions and future research directions on DRL-based cyber security are also given.
We expect that this comprehensive review provides the foundations for and facilitates future studies on exploring the potential of emerging DRL to cope with increasingly complex cyber security problems.}, url = {https://arxiv.org/abs/1906.05799}, author = {Thanh Thi Nguyen and Reddi, Vijay Janapa} } @conference {1351107, title = {Fine-Tuning the Active Timing Margin (ATM) Control Loop for Maximizing Multi-Core Efficiency on an IBM POWER Server}, booktitle = {Proceedings of the 25th International Symposium on High Performance Computer Architecture (HPCA)}, year = {2019}, publisher = {IEEE}, organization = {IEEE}, abstract = { Active Timing Margin (ATM) is a technology that improves processor efficiency by reducing the pipeline timing margin with a control loop that adjusts voltage and frequency based on real-time chip environment monitoring. Although ATM has already been shown to yield substantial performance benefits, its full potential has yet to be unlocked. In this paper, we investigate how to maximize ATM{\textquoteright}s efficiency gain with a new means of exposing the inter-core speed variation: fine-tuning the ATM control loop. We conduct our analysis and evaluation on a production-grade POWER7+ system. On the POWER7+ server platform, we fine-tune the ATM control loop by programming its Critical Path Monitors, a key component of its ATM design that measures the cores{\textquoteright} timing margins. With a robust stress-test procedure, we expose over 200 MHz of inherent inter-core speed differential by fine-tuning the per-core ATM control loop. Exploiting this differential, we manage to double the ATM frequency gain over the static timing margin; this is not possible using conventional means, i.e., by setting fixed points for each core, because the core level must account for chip-wide worst-case voltage variation. To manage the significant performance heterogeneity of fine-tuned systems, we propose application scheduling and throttling to manage the chip{\textquoteright}s process and voltage variation. Our proposal improves application performance by more than 10\% over the static margin, almost doubling the 6\% improvement of the default, unmanaged ATM system. Our technique is general enough that it can be adopted by any system that employs an active timing margin control loop. }, keywords = {Active timing margin, Performance, Power efficiency, Reliability, Critical path monitors}, author = {Zu, Yazhou and Richins, Daniel and Charles Lefurgy and Reddi, Vijay Janapa} } @conference {1349360, title = {Gables: A Roofline Model for Mobile SoCs}, booktitle = {Proceedings of the 25th International Symposium on High Performance Computer Architecture (HPCA)}, year = {2019}, abstract = { Over a billion mobile consumer system-on-chip (SoC) chipsets ship each year. Of these, smartphones undoubtedly account for a significant share of the mobile consumer market. Most modern smartphones comprise advanced SoC architectures that are made up of multiple cores, GPS, and many different programmable and fixed-function accelerators connected via a complex hierarchy of interconnects with the goal of running a dozen or more critical software use cases under strict power, thermal and energy constraints. The steadily growing complexity of a modern SoC challenges hardware computer architects on how best to do early-stage ideation. Late SoC design typically relies on detailed full-system simulation once the hardware is specified and accelerator software is written or ported.
However, early-stage SoC design must often select accelerators before a single line of software is written. To help frame SoC thinking and guide early-stage mobile SoC design, in this paper we contribute the Gables model that refines and retargets the Roofline model{\textemdash}designed originally for the performance and bandwidth limits of a multicore chip{\textemdash}to model each accelerator on a SoC, to apportion work concurrently among different accelerators (justified by our use-case analysis), and to calculate a SoC performance upper bound. We evaluate the Gables model with an existing SoC and develop several extensions that allow Gables to inform early-stage mobile SoC design. }, keywords = {Accelerator architectures, Mobile computing, Processor architecture, System-on-Chip}, author = {Mark Hill and Reddi, Vijay Janapa} } @booklet {shaotran2019gladas, title = {GLADAS: Gesture Learning for Advanced Driver Assistance Systems}, year = {2019}, author = {Ethan Shaotran and Jonathan J. Cruz and Reddi, Vijay Janapa} } @article {1533679, title = {MLPerf Training Benchmark}, journal = {arXiv preprint arXiv:1910.01500}, year = {2019}, abstract = {Machine learning (ML) needs industry-standard performance benchmarks to support design and competitive evaluation of the many emerging software and hardware solutions for ML. But ML training presents three unique benchmarking challenges absent from other domains: optimizations that improve training throughput can increase the time to solution, training is stochastic and time to solution exhibits high variance, and software and hardware systems are so diverse that fair benchmarking with the same binary, code, and even hyperparameters is difficult. We therefore present MLPerf, an ML benchmark that overcomes these challenges. Our analysis quantitatively evaluates MLPerf{\textquoteright}s efficacy at driving performance and scalability improvements across two rounds of results from multiple vendors.}, author = {Peter Mattson and Cheng, Christine and Cody Coleman and Greg Diamos and Paulius Micikevicius and David Patterson and Tang, Hanlin and Gu-Yeon Wei} } @conference {1438644, title = {Modern Hardware Margins: CPUs, GPUs, FPGAs}, booktitle = {25th IEEE International Symposium on On-Line Testing and Robust System Design (IOLTS)}, year = {2019}, publisher = {IEEE}, organization = {IEEE}, abstract = {Modern large-scale computing systems (data centers, supercomputers, cloud and edge setups and high-end cyber-physical systems) employ heterogeneous architectures that consist of multicore CPUs, general-purpose many-core GPUs, and programmable FPGAs. The effective utilization of these architectures poses several challenges, among which a primary one is power consumption. Voltage reduction is one of the most efficient methods to reduce power consumption of a chip. With the galloping adoption of hardware accelerators (i.e., GPUs and FPGAs) in large datacenters and other large-scale computing infrastructures, a comprehensive evaluation of the safe voltage reduction levels for each different chip can be employed for efficient reduction of the total power. We present a survey of recent studies in voltage margins reduction at the system level for modern CPUs, GPUs and FPGAs. The pessimistic voltage guardbands inserted by the silicon vendors can be exploited in all devices for significant power savings.
Voltage reduction can reach 12\% in multicore CPUs, 20\% in manycore GPUs and 39\% in FPGAs.}, author = {Dimitris Gizopoulos and Papadimitriou, George and Athanasios Chatzidimitriou and Reddi, Vijay Janapa and Leng, Jingwen and Behzad Salami and Osman S. Unsal and Adrian Cristal Kestelman} } @conference {1425168, title = {One Size Does Not Fit All: Quantifying and Exposing the Accuracy-Latency Trade-off in Machine Learning Cloud Service APIs via Tolerance Tiers}, booktitle = {Proceedings of the 19th International Symposium on Performance Analysis of Systems and Software (ISPASS)}, year = {2019}, abstract = { Today{\textquoteright}s cloud service architectures follow a {\textquotedblleft}one size fits all{\textquotedblright} deployment strategy where the same service version instantiation is provided to the end users. However, consumers are broad and different applications have different accuracy and responsiveness requirements, which as we demonstrate renders the {\textquotedblleft}one size fits all{\textquotedblright} approach inefficient in practice. We use a production grade speech recognition engine, which serves several thousands of users, and an open source computer vision based system, to explain our point. To overcome the limitations of the {\textquotedblleft}one size fits all{\textquotedblright} approach, we recommend Tolerance Tiers where each MLaaS tier exposes an accuracy/responsiveness characteristic, and consumers can programmatically select a tier. We evaluate our proposal on the CPU-based automatic speech recognition (ASR) engine and cutting-edge neural networks for image classification deployed on both CPUs and GPUs. The results show that our proposed approach provides a MLaaS cloud service architecture that can be tuned by the end API user or consumer to outperform the conventional {\textquotedblleft}one size fits all{\textquotedblright} approach. }, author = {Halpern, Matthew and Boroujerdian, Behzad and Mummert, Todd and Duesterwald, Evelyn and Reddi, Vijay Janapa} } @article {1533683, title = {Quantized Reinforcement Learning (QUARL)}, journal = {arXiv preprint arXiv:1910.01055}, year = {2019}, abstract = {Recent work has shown that quantization can help reduce the memory, compute, and energy demands of deep neural networks without significantly harming their quality. However, whether these prior techniques, applied traditionally to image-based models, work with the same efficacy to the sequential decision making process in reinforcement learning remains an unanswered question. To address this void, we conduct the first comprehensive empirical study that quantifies the effects of quantization on various deep reinforcement learning policies with the intent to reduce their computational resource demands. We apply techniques such as post-training quantization and quantization aware training to a spectrum of reinforcement learning tasks (such as Pong, Breakout, BeamRider and more) and training algorithms (such as PPO, A2C, DDPG, and DQN). Across this spectrum of tasks and learning algorithms, we show that policies can be quantized to 6-8 bits of precision without loss of accuracy. We also show that certain tasks and reinforcement learning algorithms yield policies that are more difficult to quantize due to their effect of widening the models{\textquoteright} distribution of weights and that quantization aware training consistently improves results over post-training quantization and oftentimes even over the full precision baseline. 
Finally, we demonstrate real-world applications of quantization for reinforcement learning. We use half-precision training to train a Pong model 50\% faster, and we deploy a quantized reinforcement learning based navigation policy to an embedded system, achieving an 18{\texttimes} speedup and a 4{\texttimes} reduction in memory usage over {\textellipsis}}, url = {https://arxiv.org/abs/1910.01055v4}, author = {Krishnan, Srivatsan and Sharad Chitlangia and Maximilian Lam and Zishen Wan and Faust, Aleksandra and Reddi, Vijay Janapa} } @article {1533694, title = {The Role of Compute in Autonomous Aerial Vehicles}, journal = {arXiv preprint arXiv:1906.10513}, year = {2019}, abstract = {Autonomous-mobile cyber-physical machines are part of our future. Specifically, unmanned-aerial-vehicles have seen a resurgence in activity with use-cases such as package delivery. These systems face many challenges such as their low-endurance caused by limited onboard-energy, hence, improving the mission-time and energy are of importance. Such improvements traditionally are delivered through better algorithms. But our premise is that more powerful and efficient onboard-compute should also address the problem. This paper investigates how the compute subsystem, in a cyber-physical mobile machine, such as a Micro Aerial Vehicle, impacts mission-time and energy. Specifically, we pose the question as what is the role of computing for cyber-physical mobile robots? We show that compute and motion are tightly intertwined, hence a close examination of cyber and physical processes and their impact on one another is necessary. We show different impact paths through which compute impacts mission-metrics and examine them using analytical models, simulation, and end-to-end benchmarking. To enable similar studies, we open sourced MAVBench, our tool-set consisting of a closed-loop simulator and a benchmark suite. Our investigations show cyber-physical co-design, a methodology where robot{\textquoteright}s cyber and physical processes/quantities are developed with one another consideration, similar to hardware-software co-design, is necessary for optimal robot design.}, author = {Boroujerdian, Behzad and Genc, Hasan and Krishnan, Srivatsan and Bardienus Pieter Duisterhof and Brian Plancher and Kayvan Mansoorshahi and Marcelino Almeida and Cui, Wenzhi and Faust, Aleksandra and Reddi, Vijay Janapa} }
@conference {1425169, title = {Tail Latency in Node.js: Energy Efficient Turbo Boosting for Long Latency Requests in Event-Driven Web Services}, booktitle = {Proceedings of the 15th ACM SIGPLAN/SIGOPS International Conference on Virtual Execution Environments (VEE)}, year = {2019}, abstract = {Cloud-based Web services are shifting to the event-driven, scripting language-based programming model to achieve productivity, flexibility, and scalability. Implementations of this model, however, generally suffer from long tail latencies, which we measure using Node.js as a case study. Unlike in traditional thread-based systems, reducing long tails is difficult in event-driven systems due to their inherent asynchronous programming model. We propose a framework to identify and optimize tail latency sources in scripted event-driven Web services. We introduce profiling that allows us to gain deep insights into not only how asynchronous event-driven execution impacts application tail latency but also how the managed runtime system overhead exacerbates the tail latency issue further. Using the profiling framework, we propose an event-driven execution runtime design that orchestrates the hardware{\textquoteright}s boosting capabilities to reduce tail latency. We achieve higher tail latency reductions with lower energy overhead than prior techniques that are unaware of the underlying event-driven program execution model. The lessons we derive from Node.js apply to other event-driven services based on scripting language frameworks.}, author = {Cui, Wenzhi and Richins, Daniel and Zhu, Yuhao and Reddi, Vijay Janapa} } @booklet {1436936, title = {Toward Exploring End-to-End Learning Algorithms for Autonomous Aerial Machines}, journal = {Workshop Algorithms And Architectures For Learning In-The-Loop Systems In Autonomous Flight with International Conference on Robotics and Automation (ICRA)}, year = {2019}, abstract = {We develop AirLearning, a tool suite for end-to-end closed-loop UAV analysis, equipped with a customized yet randomized environment generator in order to expose the UAV to a diverse set of challenges. We take Deep Q networks (DQN) as an example deep reinforcement learning algorithm and use curriculum learning to train a point to point obstacle avoidance policy. While we determine the best policy based on the success rate, we evaluate it under strict resource constraints on an embedded platform such as RasPi 3. Using hardware in the loop methodology, we quantify the policy{\textquoteright}s performance with quality of flight metrics such as energy consumed, endurance and the average length of the trajectory.
We find that the trajectories produced on the embedded platform are very different from those predicted on the desktop, resulting in up to 26.43\% longer trajectories. Quality of flight metrics with hardware in the loop characterizes those differences in simulation, thereby exposing how the choice of onboard compute contributes to shortening or widening of the {\textquoteleft}Sim2Real{\textquoteright} gap.}, author = {Krishnan, Srivatsan and Boroujerdian, Behzad and Faust, Aleksandra and Reddi, Vijay Janapa} } @booklet {1437309, title = {Towards Deep Learning using TensorFlow Lite on RISC-V}, journal = {Third Workshop on Computer Architecture Research with RISC-V (CARRV)}, year = {2019}, abstract = { Deep neural networks have been extensively adopted for a myriad of applications due to their ability to learn patterns from large amounts of data. The desire to preserve user privacy and reduce user-perceived latency has created the need to perform deep neural network inference tasks on low-power consumer edge devices. Since such tasks often tend to be computationally intensive, offloading this compute from the mobile/embedded CPU to purpose-designed {\textquotedblleft}Neural Processing Engines{\textquotedblright} is a commonly adopted solution for accelerating deep learning computations. While these accelerators offer significant speed-ups for key machine learning kernels, overheads resulting from frequent host-accelerator communication often diminish the net application-level benefit of this heterogeneous system. Our solution for accelerating such workloads involves developing ISA extensions customized for machine learning kernels and designing a custom in-pipeline execution unit for these specialized instructions. We base our ISA extensions on RISC-V: an open ISA specification that lends itself to such specializations. In this paper, we present the software infrastructure for optimizing neural network execution on RISC-V with ISA extensions. Our ISA extensions are derived from the RISC-V Vector ISA proposal, and we develop optimized implementations of the critical kernels such as convolution and matrix multiplication using these instructions. These optimized functions are subsequently added to the TensorFlow Lite source code and cross-compiled for RISC-V. We find that only a small set of instruction extensions achieves coverage over a wide variety of deep neural networks designed for vision and speech-related tasks. On average, our software implementation using the extended instruction set reduces the executed instruction count by 8X in comparison to the baseline implementation. In parallel, we are also working on the hardware design of the in-pipeline machine learning accelerator. We plan to open-source our software modifications to TF Lite, as well as the micro-architecture design in due course. }, author = {Louis, Marcia Sahaya and Zahra Azad and Leila Delshadtehrani and Suyog Gupta and Pete Warden and Reddi, Vijay Janapa and Joshi, Ajay} } @conference {richins2018amdahl, title = {Amdahl{\textquoteright}s Law in Big Data Analytics: Alive and Kicking in TPCx-BB (BigBench)}, booktitle = {IEEE International Symposium on High Performance Computer Architecture (HPCA)}, year = {2018}, pages = {630{\textendash}642}, publisher = {IEEE}, organization = {IEEE}, abstract = {Big data, specifically data analytics, is responsible for driving many of consumers{\textquoteright} most common online activities, including shopping, web searches, and interactions on social media.
In this paper, we present the first (micro)architectural investigation of a new industry-standard, open source benchmark suite directed at big data analytics applications{\textemdash}TPCx-BB (BigBench). Where previous work has usually studied benchmarks which oversimplify big data analytics, our study of BigBench reveals that there is immense diversity among applications, owing to their varied data types, computational paradigms, and analyses. In our analysis, we also make an important discovery generally restricting processor performance in big data. Contrary to conventional wisdom that big data applications lend themselves naturally to parallelism, we discover that they lack sufficient thread-level parallelism (TLP) to fully utilize all cores. In other words, they are constrained by Amdahl{\textquoteright}s law. While TLP may be limited by various factors, ultimately we find that single-thread performance is as relevant in scale-out workloads as it is in more classical applications. To this end we present core packing: a software and hardware solution that could provide as much as 20\% execution speedup for some big data analytics applications.}, url = {https://doi.org/10.1109/HPCA.2018.00060}, author = {Richins, Daniel and Ahmed, Tahrina and Clapp, Russell and Reddi, Vijay Janapa} } @report {1594790, title = {Asymmetric Resilience: Rethinking Reliability for Accelerator-Rich Systems}, year = {2018}, institution = {IBM}, abstract = {We have already entered the heterogeneous computing era when computing systems harness computational horsepower from not only general purpose CPUs but also other processors such as graphics processing unit (GPU) and hardware accelerators. Performance, power-efficiency, and reliability are three most critical aspects of processors, and there usually exists a tradeoff among them. Accelerators are heavily optimized for performance and power-efficiency rather than reliability. However, it is equally important to ensure overall reliability while introducing accelerators to computing systems. In this paper, we focus on optimizing accelerator{\textquoteright}s reliability without adopting the {\textquotedblleft}whac-a-mole{\textquotedblright} paradigm which develops accelerator-specific reliability optimization. Instead, we advocate maintaining the reliability at the system level, and propose the design paradigm called {\textquotedblleft}asymmetric resilience,{\textquotedblright} whose principle is to develop the reliable heterogeneous system centering around the CPU architecture. This generic design paradigm eases accelerators away from reliability optimization. We present the design principles and practices for the heterogeneous system that adopt such design paradigm. Following the principles of asymmetric resilience, we demonstrate how to use CPU architecture to handle GPU execution errors, which allows GPU focus on typical case operation for better energy efficiency. 
We explore the design space and show that the average overhead is only 1\% for error-free execution and the overhead increases linearly with error probability.}, author = {Leng, Jingwen and Buyuktosunoglu, Alper and Bertran, Ramon and Bose, Pradip and Reddi, Vijay Janapa} } @article {chin2018domain, title = {Domain-Specific Approximation for Object Detection}, journal = {IEEE Micro}, volume = {38}, number = {1}, year = {2018}, pages = {31{\textendash}40}, publisher = {IEEE}, abstract = {In summary,our contributions are as follows: {\textbullet} We investigate DSA and characterize the effectiveness of category-awareness. {\textbullet} We conduct a limit study to understand the benefit of applying approximation in a perframe manner with category-awareness (category-aware dynamic DSA). {\textbullet} We present the challenges of harnessing DSA and a proof-of-concept runtime.}, url = {https://doi.org/10.1109/MM.2018.112130335}, author = {Chin, Ting-Wu and Yu, Chia-Lin and Halpern, Matthew and Genc, Hasan and Tsao, Shiao-Li and Reddi, Vijay Janapa} } @conference {zou2018efficient, title = {Efficient and Reliable Power Delivery in Voltage-Stacked Manycore System With Hybrid Charge-Recycling Regulators}, booktitle = {55th ACM/ESDA/IEEE Design Automation Conference (DAC)}, year = {2018}, pages = {1{\textendash}6}, publisher = {IEEE}, organization = {IEEE}, abstract = {Voltage stacking (VS) fundamentally improves power delivery efficiency (PDE) by series-stacking multiple voltage domains to eliminate explicit step-down voltage conversion and reduce energy loss along the power delivery path. However, it suffers from aggravated supply noise, preventing its adoption in mainstream computing systems. In this paper, we investigate a practical approach to enabling efficient and reliable power delivery in voltage-stacked manycore systems that can ensure worst-case supply noise reliability without excessive costly over-design. We start by developing an analytical model to capture the essential noise behaviors in VS. It allows us to identify dominant noise contributor and derive the worst-case conditions. With this in-depth understanding, we propose a hybrid voltage regulation solution to effectively mitigate noise with worst-case guarantees. When evaluated with real-world benchmarks, our solution can achieve 93.8\% power delivery efficiency, an improvement of 13.9\% over the conventional baseline.}, url = {https://doi.org/10.1145/3195970.3196037}, author = {Zou, An and Leng, Jingwen and He, Xin and Zu, Yazhou and Reddi, Vijay Janapa and Zhang, Xuan} } @conference {1364481, title = {MAVBench: Micro Aerial Vehicle Benchmarking}, booktitle = {Proceedings of the International Symposium on Microarchitecture (MICRO)}, year = {2018}, abstract = {Unmanned Aerial Vehicles (UAVs) are getting closer to becoming ubiquitous in everyday life. Among them, Micro Aerial Vehicles (MAVs) have seen an outburst of attention recently, specifically in the area with a demand for autonomy. A key challenge standing in the way of making MAVs autonomous is that researchers lack the comprehensive understanding of how performance, power, and computational bottlenecks affect MAV applications. MAVs must operate under a stringent power budget, which severely limits their flight endurance time. As such, there is a need for new tools, benchmarks, and methodologies to foster the systematic development of autonomous MAVs. 
In this paper, we introduce the {\textquotedblleft}MAVBench{\textquotedblright} framework which consists of a closed-loop simulator and an end-to-end application benchmark suite. A closed-loop simulation platform is needed to probe and understand the intra-system (application data flow) and inter-system (system and environment) interactions in MAV applications to pinpoint bottlenecks and identify opportunities for hardware and software co-design and optimization. In addition to the simulator, MAVBench provides a benchmark suite, the first of its kind, consisting of a variety of MAV applications designed to enable computer architects to perform characterization and develop future aerial computing systems. Using our open source, end-to-end experimental platform, we uncover a hidden, and thus far unexpected compute to total system energy relationship in MAVs. Furthermore, we explore the role of compute by presenting three case studies targeting performance, energy and reliability. These studies confirm that an efficient system design can improve MAV{\textquoteright}s battery consumption by up to 1.8X.}, author = {Boroujerdian, Behzad and Genc, Hasan and Krishnan, Srivatsan and Cui, Wenzhi and Faust, Aleksandra and Reddi, Vijay Janapa} } @webarticle {1347805, title = {Mobile SoCs: The Wild West of Domain Specific Architectures}, journal = {Mobile SoCs: The Wild West of Domain Specific Architectures}, year = {2018}, url = {https://www.sigarch.org/mobile-socs/}, author = {Reddi, Vijay Janapa} } @article {reddi2018two, title = {Two Billion Devices and Counting}, journal = {IEEE Micro}, volume = {38}, number = {1}, year = {2018}, pages = {6{\textendash}21}, publisher = {IEEE}, abstract = {Mobile computing has grown drastically over the past decade. Despite the rapid pace of advancements, mobile device understanding, benchmarking, and evaluation are still in their infancies, both in industry and academia. This article presents an industry perspective on the challenges facing mobile computer architecture, specifically involving mobile workloads, benchmarking, and experimental methodology, with the hope of fostering new research within the community to address pending problems. These challenges pose a threat to the systematic development of future mobile systems, which, if addressed, can elevate the entire mobile ecosystem to the next level.Mobile devices have come a long way from the first portable cellular phone developed by Motorola in 1973. Most modern smartphones are good enough to replace desktop computers. A smartphone today has enough computing power to be on par with the fastest supercomputers from the 1990s.For instance, the Qualcomm Adreno 540 GPU found in the latest smartphones has a peak compute capability of more than 500 Gflops, putting it in competition with supercomputers that were on the TOP500 list in the early to mid-1990s. Mobile computing has experienced an unparalleled level of growth over the past decade. At the time of this writing, there are more than 2 billion mobile devices in the world.1 But perhaps even more importantly, mobile phones are showing no signs of slowing in uptake. In fact, smartphone adoption rates are on the rise. The number of devices is rising as mobile device penetration increases in markets like India and China. 
It is anticipated that the number of mobile subscribers will grow past 6 billion in the coming years.2 As Figure 1 shows, while the Western European and North American markets are reaching saturation, the vast majority of growth is coming from countries in Asia. Given that only 35 percent of the world{\textquoteright}s population has thus far adopted mobile technology, there is still significant room for growth and innovation.}, url = {https://ieeexplore.ieee.org/document/8301138/}, author = {Reddi, Vijay Janapa and Yoon, Hongil and Knies, Allan} } @conference {boroujerdian4compute, title = {Why Compute Matters for UAV Energy Efficiency?}, booktitle = {2nd International Symposium on Aerial Robotics}, number = {6}, year = {2018}, abstract = {Unmanned Aerial Vehicles (UAVs) are getting closer to becoming ubiquitous in everyday life. Although the researchers in the robotic domain have made rapid progress in recent years, hardware and software architects in the computer architecture community lack the comprehensive understanding of how performance, power, and computational bottlenecks affect UAV applications. Such an understanding enables system architects to design microchips tailored for aerial agents. This paper is an attempt by computer architects to initiate the discussion between the two academic domains by investigating the underlying compute systems{\textquoteright} impact on aerial robotic applications. To do so, we identify performance and energy constraints and examine the impact of various compute knobs such as processor cores and frequency on these constraints. Our experiment show that such knobs allow for up to 5X speed up for a wide class of applications.}, author = {Boroujerdian, Behzad and Genc, Hasan and Krishnan, Srivatsan and Faust, Aleksandra and Reddi, Vijay Janapa} } @article {zhu2017cognitive, title = {Cognitive Computing Safety: The New Horizon for Reliability/The Design and Evolution of Deep Learning Workloads}, journal = {IEEE Micro}, number = {1}, year = {2017}, pages = {15{\textendash}21}, publisher = {IEEE}, abstract = {Recent advances in cognitive computing have brought widespread excitement for various machine learning{\textendash}based intelligent services, ranging from autonomous vehicles to smart traffic-light systems. To push such cognitive services closer to reality, recent research has focused extensively on improving the performance, energy efficiency, privacy, and security of cognitive computing platforms.Among all the issues, a rapidly rising and critical challenge to address is the practice of safe cognitive computing{\textemdash} that is, how to architect machine learning{\textendash}based systems to be robust against uncertainty and failure to guarantee that they perform as intended without causing harmful behavior. Addressing the safety issue will involve close collaboration among different computing communities, and we believe computer architects must play a key role. In this position paper, we first discuss the meaning of safety and the severe implications of the safety issue in cognitive computing. 
We then provide a framework to reason about safety, and we outline several opportunities for the architecture community to help make cognitive computing safer.}, url = {https://doi.org/10.1109/MM.2017.2}, author = {Zhu, Yuhao and Reddi, Vijay Janapa and Adolf, Robert and Rama, Saketh and Reagen, Brandon and Gu-Yeon Wei and David Brooks} } @webarticle {1347806, title = {A Decade of Mobile Computing}, journal = {SIGARCH Computer Architecture Today Blog.}, year = {2017}, url = {https://www.sigarch.org/a-decade-of-mobile-computing/}, author = {Reddi, Vijay Janapa} } @article {genc2017flying, title = {Flying IoT: Toward Low-Power Vision in the Sky}, journal = {IEEE Micro}, volume = {37}, number = {6}, year = {2017}, pages = {40{\textendash}51}, publisher = {IEEE}, url = {https://doi.org/10.1109/ACCESS.2018.2819189}, author = {Genc, Hasan and Zu, Yazhou and Chin, Ting-Wu and Halpern, Matthew and Reddi, Vijay Janapa} } @conference {zou2017ivory, title = {Ivory: Early-Stage Design Space Exploration Tool for Integrated Voltage Regulators}, booktitle = {Proceedings of the 54th Annual Design Automation Conference (DAC)}, year = {2017}, pages = {1}, publisher = {ACM}, organization = {ACM}, abstract = {Despite being employed in burgeoning efforts to improve power delivery efficiency, integrated voltage regulators (IVRs) have yet to be evaluated in a rigorous, systematic, or quantitative manner. To fulfill this need, we present Ivory, a high-level design space exploration tool capable of providing accurate conversion efficiency, static performance characteristics, and dynamic transient responses of an IVR-enabled power delivery subsystem (PDS), enabling rapid trade-off exploration at early design stage, approximately 1000x faster than SPICE simulation. We demonstrate and validate Ivory with a wide spectrum of IVR topologies. In addition, we present a case study using Ivory to reveal the optimal PDS configurations, with underlying power break-downs and area overheads for the GPU manycore architecture, which has yet to embrace IVRs.}, url = {https://doi.org/10.1145/3061639.3062268}, author = {Zou, An and Leng, Jingwen and Zu, Yazhou and Tao Tong and Reddi, Vijay Janapa and David Brooks and Gu-Yeon Wei and Zhang, Xuan} } @article {zhu2017optimizing, title = {Optimizing General-Purpose Cpus for Energy-Efficient Mobile Web Computing}, journal = {ACM Transactions on Computer Systems (TOCS)}, volume = {35}, number = {1}, year = {2017}, pages = {1}, publisher = {ACM}, abstract = {Mobile applications are increasingly being built using web technologies as a common substrate to achieve portability and to improve developer productivity. Unfortunately, web applications often incur large performance overhead, directly affecting the user quality-of-service (QoS) experience. Traditional techniques in improving mobile processor performance have mostly been adopting desktop-like design techniques such as increasing single-core microarchitecture complexity and aggressively integrating more cores. However, such a desktop-oriented strategy is likely coming to an end due to the stringent energy and thermal constraints that mobile devices impose. Therefore, we must pivot away from traditional mobile processor design techniques in order to provide sustainable performance improvement while maintaining energy efficiency. In this article, we propose to combine hardware customization and specialization techniques to improve the performance and energy efficiency of mobile web applications.
We first perform design-space exploration (DSE) and identify opportunities in customizing existing general-purpose mobile processors, that is, tuning microarchitecture parameters. The thorough DSE also lets us discover sources of energy inefficiency in customized general-purpose architectures. To mitigate these inefficiencies, we propose, synthesize, and evaluate two new domain-specific specializations, called the Style Resolution Unit and the Browser Engine Cache. Our optimizations boost performance and energy efficiency at the same time while maintaining generalpurpose programmability. As emerging mobile workloads increasingly rely more on web technologies, the type of optimizations we propose will become important in the future and are likely to have a long-lasting and widespread impact.\ }, url = {https://doi.org/10.1145/3041024}, author = {Zhu, Yuhao and Reddi, Vijay Janapa} } @article {reddi2017research, title = {Research for Practice: Web Security and Mobile Web Computing}, journal = {Communications of the ACM (CACM)}, year = {2017}, publisher = {ASSOC COMPUTING MACHINERY 2 PENN PLAZA, STE 701, NEW YORK, NY 10121-0701 USA}, abstract = {OUR THIRD INSTALLMENT of Research for Practice brings readings spanning programming languages, compilers, privacy, and the mobile Web. First, Jean Yang provides an overview of how to use information flow techniques to build programs that are secure by construction. As Yang writes, information flow is a conceptually simple {\textquotedblleft}clean idea{\textquotedblright}: the flow of sensitive information across program variables and control statements can be tracked to determine whether information may in fact leak. Making information flow practical is a major challenge, however. Instead of relying on programmers to track information flow, how can compilers and language runtimes be made to do the heavy lifting? How can application writers easily express their privacy policies and understand the implications of a given policy for the set of values that an application user may see? Yang{\textquoteright}s set of papers directly addresses these questions via a clever mix of techniques from compilers, systems, and language design. This focus on theory made practical is an excellent topic for RfP\ }, author = {Reddi, Vijay Janapa and Zhu, Yuhao} } @booklet {mohan2017storage, title = {Storage on Your Smartphone Uses More Energy Than You Think}, journal = {USENIX HotStorage}, year = {2017}, abstract = {Energy consumption is a key concern for mobile devices. Prior research has focused on the screen and the network as the major sources of energy consumption. Through carefully designed measurement-based experiments, we show that for certain storage-intensive workloads, the storage subsystem on an Android smartphone consumes a significant amount of energy (36\%), on par with screen energy consumption. We analyze the energy consumption of different storage primitives, such as sequential and random writes, on two popular mobile file systems, ext4 and F2FS. In addition, since most Android applications use SQLite for storage, we analyze the energy consumption of different SQLite operations. We present several interesting results from our analysis: for example, random writes consume 15{\texttimes} higher energy than sequential writes, and that F2FS consumes half the energy as ext4 for most workloads. 
We believe our results contribute useful design guidelines for the developers of energy-efficient mobile file systems.\ }, author = {Mohan, Jayashree and Purohith, Dhathri and Halpern, Mathew and Vijay Chidambaram and Reddi, Vijay Janapa} } @article {zu2017ti, title = {Ti-States: Power Management in Active Timing Margin Processors}, journal = {IEEE Micro}, volume = {37}, number = {3}, year = {2017}, pages = {106{\textendash}114}, publisher = {IEEE}, abstract = {TEMPERATURE INVERSION IS A TRANSISTOR-LEVEL EFFECT THAT IMPROVES PERFORMANCE WHEN TEMPERATURE INCREASES. THIS ARTICLE PRESENTS A COMPREHENSIVE MEASUREMENT-BASED ANALYSIS OF ITS IMPLICATIONS FOR ARCHITECTURE DESIGN AND POWER MANAGEMENT USING THE AMD A10-8700P PROCESSOR. THE AUTHORS PROPOSE TEMPERATURE-INVERSION STATES (TI -STATES) TO HARNESS THE OPPORTUNITIES PROMISED BY TEMPERATURE INVERSION. THEY EXPECT TI -STATES TO BE ABLE TO IMPROVE THE POWER EFFICIENCY OF MANY PROCESSORS MANUFACTURED IN FUTURE CMOS TECHNOLOGIES.}, url = {https://doi.org/10.1109/MM.2017.68}, author = {Zu, Yazhou and Huang, Wei and Paul, Indrani and Reddi, Vijay Janapa} } @conference {liu2016barrier, title = {Barrier-Aware Warp Scheduling for Throughput Processors}, booktitle = {Proceedings of the 2016 International Conference on Supercomputing}, year = {2016}, pages = {42}, publisher = {ACM}, organization = {ACM}, abstract = {Parallel GPGPU applications rely on barrier synchronization to align thread block activity. Few prior work has studied and characterized barrier synchronization within a thread block and its impact on performance. In this paper, we find that barriers cause substantial stall cycles in barrier-intensive GPGPU applications although GPGPUs employ lightweight hardware-support barriers. To help investigate the reasons, we define the execution between two adjacent barriers of a thread block as a warp-phase. We find that the execution progress within a warp-phase varies dramatically across warps, which we call warp-phase-divergence. While warp-phasedivergence may result from execution time disparity among warps due to differences in application code or input, and/or shared resource contention, we also pinpoint that warp-phase-divergence may result from warp scheduling.To mitigate barrier induced stall cycle inefficiency, we propose barrier-aware warp scheduling (BAWS). It combines two techniques to improve the performance of barrier-intensive GPGPU applications. The first technique, most-waiting-first (MWF), assigns a higher scheduling priority to the warps of a thread block that has a larger number of warps waiting at a barrier. The second technique, critical-fetch-first (CFF), fetches instructions from the warp to be issued by MWF in the next cycle. To evaluate the efficiency of BAWS, we consider 13 barrier-intensive GPGPU applications, and we report that BAWS speeds up performance by 17\% and 9\% on average (and up to 35\% and 30\%) over loosely-round-robin (LRR) and greedy-then-oldest (GTO) warp scheduling, respectively. We compare BAWS against recent concurrent work SAWS, finding that BAWS outperforms SAWS by 7\% on average and up to 27\%. For non-barrier-intensive workloads, we demonstrate that BAWS is performance-neutral compared to GTO and SAWS, while improving performance by 5.7\% on average (and up to 22\%) compared to LRR. 
BAWS{\textquoteright} hardware co}, url = {https://doi.org/10.1145/2925426.2926267}, author = {Liu, Yuxi and Yu, Zhibin and Eeckhout, Lieven and Reddi, Vijay Janapa and Luo, Yingwei and Wang, Xiaolin and Wang, Zhenlin and Xu, Chengzhong} } @booklet {halpern1case, title = {The Case for Node Multi-Versioning in Cognitive Cloud Services: Achieving Responsiveness and Accuracy at Datacenter Scale}, journal = {Workshop on Cognitive Architectures (CogArch)}, year = {2016}, abstract = {Cognitive cloud services seek to provide end-users with functionalities that have historically required human intellect to complete. End-users expect these services to be both responsive and accurate, which pose conflicting requirements for service providers. Today{\textquoteright}s cloud services deployment schemes follow a {\textquotedblleft}one size fits all{\textquotedblright} scale-out strategy, where multiple instantiations of the same version of the service are used to scale-out and handle all end-users. Meanwhile, many cognitive services are of a statistical nature where deeper exploration yields more accurate results but also requires more processing time. Finding a single service configuration setting that satisfies the latency and accuracy requirements for the largest number of expected end-user requests can be a challenging task. As a result, cognitive cloud service providers are conservatively configured to maximize the number of enduser requests for which a satisfactory latency-accuracy tradeoff can be achieved. Using a production-grade Automatic Speech Recognition cloud service as a representative example to study, we demonstrate the inefficiencies of this single version approach and propose a new service node multi-versioning deployment scheme for cognitive services instead. We present an oracle-based limit study where we show that service node multi-versioning can provide a 2.5X reduction in execution time together with a 24\% improvement in accuracy over a traditional single version deployment scheme. We also discuss several design considerations to address when implementing service node multi-versioning.}, author = {Halpern, Matthew and Mummert, Todd and Novak, Miroslav and Duesterwald, Evelyn and Reddi, Vijay Janapa} } @article {kazdagli2016emma, title = {EMMA: A New Platform to Evaluate Hardware-based Mobile Malware Analyses}, journal = {arXiv preprint arXiv:1603.03086}, year = {2016}, abstract = {Hardware-based malware detectors (HMDs) are a key emerging technology to build trustworthy computing platforms, especially mobile platforms. Quantifying the efficacy of HMDs against malicious adversaries is thus an important problem. The challenge lies in that real-world malware typically adapts to defenses, evades being run in experimental settings, and hides behind benign applications. Thus, realizing the potential of HMDs as a line of defense {\textendash} that has a small and battery-efficient code base {\textendash} requires a rigorous foundation for evaluating HMDs. To this end, we introduce EMMA{\textemdash}a platform to evaluate the efficacy of HMDs for mobile platforms. EMMA deconstructs malware into atomic, orthogonal actions and introduces a systematic way of pitting different HMDs against a diverse subset of malware hidden inside benign applications. EMMA drives both malware and benign programs with real user-inputs to yield an HMD{\textquoteright}s effective operating range{\textemdash} i.e., the malware actions a particular HMD is capable of detecting. 
We show that small atomic actions, such as stealing a Contact or SMS, have surprisingly large hardware footprints, and use this insight to design HMD algorithms that are less intrusive than prior work and yet perform 24.7\% better. Finally, EMMA brings up a surprising new result{\textemdash} obfuscation techniques used by malware to evade static analyses makes them more detectable using HMDs.}, author = {Kazdagli, Mikhail and Huang, Ling and Reddi, Vijay Janapa and Tiwari, Mohit} } @conference {zhu2016greenweb, title = {GreenWeb: Language Extensions for Energy-Efficient Mobile Web Computing}, booktitle = {Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation}, volume = {51}, number = {6}, year = {2016}, pages = {145-160 }, publisher = {ACM}, organization = {ACM}, abstract = {Web computing is gradually shifting toward mobile devices, in which the energy budget is severely constrained. As a result, Web developers must be conscious of energy efficiency. However, current Web languages provide developers little control over energy consumption. In this paper, we take a first step toward language-level research to enable energy-efficient Web computing. Our key motivation is that mobile systems can wisely budget energy usage if informed with user quality-of-service (QoS) constraints. To do this, programmers need new abstractions. We propose two language abstractions, QoS type and QoS target, to capture two fundamental aspects of user QoS experience. We then present GreenWeb, a set of language extensions that empower developers to easily express the QoS abstractions as program annotations. As a proof of concept, we develop a GreenWeb runtime, which intelligently determines how to deliver specified user QoS expectation while minimizing energy consumption. Overall, GreenWeb shows significant energy savings (29.2\% ⇠ 66.0\%) over Android{\textquoteright}s default Interactive governor with few QoS violations. Our work demonstrates a promising first step toward language innovations for energy-efficient Web computing. Categories and Subject Descriptors D.3.2 [Programming Language]: Language Classifications{\textendash}Specialized application languages; D.3.3 [Programming Language]: Language Constructs and Features{\textendash}Constraints Keywords Energy-efficiency, Web, Mobile computing}, url = {https://doi.org/10.1145/2908080.2908082}, author = {Zhu, Yuhao and Reddi, Vijay Janapa} } @conference {halpern2016mobile, title = {Mobile Cpu{\textquoteright}s Rise to Power: Quantifying the Impact of Generational Mobile Cpu Design Trends on Performance, Energy, and User Satisfaction}, booktitle = {High Performance Computer Architecture (HPCA), 2016 IEEE International Symposium on}, year = {2016}, pages = {64{\textendash}76}, publisher = {IEEE}, organization = {IEEE}, abstract = {In this paper, we assess the past, present, and future of mobile CPU design. We study how mobile CPU designs trends have impacted the end-user, hardware design, and the holistic mobile device. We analyze the evolution of ten cutting-edge mobile CPU designs released over the past seven years. Specifically, we report measured performance, power, energy and user satisfaction trends across mobile CPU generations. A key contribution of our work is that we contextualize the mobile CPU{\textquoteright}s evolution in terms of user satisfaction, which has largely been absent from prior mobile hardware studies. 
To bridge the gap between mobile CPU design and user satisfaction, we construct and conduct a novel crowdsourcing study that spans over 25,000 survey participants using the Amazon Mechanical Turk service. Our methodology allows us to identify what mobile CPU design techniques provide the most benefit to the end-user{\textquoteright}s quality of user experience. Our results quantitatively demonstrate that CPUs play a crucial role in modern mobile system-on-chips (SoCs). Over the last seven years, both single- and multicore performance improvements have contributed to end-user satisfaction by reducing user-critical application response latencies. Mobile CPUs aggressively adopted many power-hungry desktoporiented design techniques to reach these performance levels. Unlike other smartphone components (e.g. display and radio) whose peak power consumption has decreased over time, the mobile CPU{\textquoteright}s peak power consumption has steadily increased. As the limits of technology scaling restrict the ability of desktop-like scaling to continue for mobile CPUs, specialized accelerators appear to be a promising alternative that can help sustain the power, performance, and energy improvements that mobile computing necessitates. Such a paradigm shift will redefine the role of the CPU within future SoCs, which merit several design considerations based on our findings.}, url = {https://doi.org/10.1109/HPCA.2016.7446054}, author = {Halpern, Matthew and Zhu, Yuhao and Reddi, Vijay Janapa} } @conference {kazdagli2016quantifying, title = {Quantifying and Improving the Efficiency of Hardware-Based Mobile Malware Detectors}, booktitle = {The 49th Annual IEEE/ACM International Symposium on Microarchitecture}, year = {2016}, pages = {37}, publisher = {IEEE}, organization = {IEEE}, abstract = {Hardware-based malware detectors (HMDs) are a key emerging technology to build trustworthy systems, especially mobile platforms. Quantifying the efficacy of HMDs against malicious adversaries is thus an important problem. The challenge lies in that real-world malware adapts to defenses, evades being run in experimental settings, and hides behind benign applications. Thus, realizing the potential of HMDs as a small and battery-efficient line of defense requires a rigorous foundation for evaluating HMDs. We introduce Sherlock{\textemdash}a white-box methodology that quantifies an HMD{\textquoteright}s ability to detect malware and identify the reason why. Sherlock first deconstructs malware into atomic, orthogonal actions to synthesize a diverse malware suite. Sherlock then drives both malware and benign programs with real user-inputs, and compares their executions to determine an HMD{\textquoteright}s operating range, i.e., the smallest malware actions an HMD can detect. We show three case studies using Sherlock to not only quantify HMDs{\textquoteright} operating ranges but design better detectors. First, using information about concrete malware actions, we build a discretewavelet transform based unsupervised HMD that outperforms prior work based on power transforms by 24.7\% (AUC metric). Second, training a supervised HMD using Sherlock{\textquoteright}s diverse malware dataset yields 12.5\% better HMDs than past approaches that train on ad-hoc subsets of malware. Finally, Sherlock shows why a malware instance is detectable. 
This yields a surprising new result{\textemdash}obfuscation techniques used by malware to evade static analyses makes them more detectable using HMDs.}, url = {https://doi.org/10.1109/MICRO.2016.7783740}, author = {Kazdagli, Mikhail and Reddi, Vijay Janapa and Tiwari, Mohit} } @webarticle {yang2016research, title = {Research for Practice: Web Security and Mobile Web Computing}, journal = {ACM Queue}, volume = {14}, number = {4}, year = {2016}, pages = {80}, publisher = {ACM}, url = {https://queue.acm.org/detail.cfm?id=3005356}, author = {Yang, Jean and Reddi, Vijay Janapa and Zhu, Yuhao and Bailis, Peter} } @conference {chachmon2016simulation, title = {Simulation and Analysis Engine for Scale-Out Workloads}, booktitle = {Proceedings of the 2016 International Conference on Supercomputing (ICS)}, year = {2016}, pages = {22}, publisher = {ACM}, organization = {ACM}, abstract = {We introduce a system-level Simulation and Analysis Engine (SAE) framework based on dynamic binary instrumentation for fine-grained and customizable instruction-level introspection of everything that executes on the processor. SAE can instrument the BIOS, kernel, drivers, and user processes. It can also instrument multiple systems simultaneously using a single instrumentation interface, which is essential for studying scale-out applications. SAE is an x86 instruction set simulator designed specifically to enable rapid prototyping, evaluation, and validation of architectural extensions and program analysis tools using its flexible APIs. It is fast enough to execute full platform workloads{\textemdash}a modern operating system can boot in a few minutes{\textemdash}thus enabling research, evaluation, and validation of complex functionalities related to multicore configurations, virtualization, security, and more. To reach high speeds, SAE couples tightly with a virtual platform and employs both a just-in-time (JIT) compiler that helps simulate simple instructions eciently and a fast interpreter for simulating new or complex instructions. We describe SAE{\textquoteright}s architecture and instrumentation engine design and show the framework{\textquoteright}s usefulness for single- and multi-system architectural and program analysis studies.}, url = {https://doi.org/10.1145/2925426.2926293}, author = {Chachmon, Nadav and Richins, Daniel and Cohn, Robert and Christensson, Magnus and Cui, Wenzhi and Reddi, Vijay Janapa} } @conference {zu2016t, title = {Ti-States: Processor Power Management in the Temperature Inversion Region}, booktitle = {Proceedings of the 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, year = {2016}, pages = {1{\textendash}13}, publisher = {IEEE}, organization = {IEEE}, abstract = {Temperature inversion is a transistor-level effect that can improve performance when temperature increases. It has largely been ignored in the past because it does not occur in the typical operating region of a processor, but temperature inversion is becoming increasing important in current and future technologies. In this paper, we study temperature inversion{\textquoteright}s implications on architecture design, and power and performance management. We present the first public comprehensive measurement-based analysis on the effects of temperature inversion on a real processor, using the AMD A10- 8700P processor as our system under test. 
We show that the extra timing margin introduced by temperature inversion can provide more than 5\% Vdd reduction benefit, and this improvement increases to more than 8\% when operating in the near-threshold, low-voltage region. To harness this opportunity, we present Ti-states, a power management technique that sets the processor{\textquoteright}s voltage based on real-time silicon temperature to improve power efficiency. Ti-states lead to 6\% to 12\% measured power saving across a range of different temperatures compared to a fixed margin. As technology scales to FD-SOI and FinFET, we show there is an ideal operating temperature for various workloads to maximize the benefits of temperature inversion. The key is to counterbalance leakage power increase at higher temperatures with dynamic power reduction by the Ti-states. The projected optimal temperature is typically around 60{\textdegree}C and yields 8\% to 9\% chip power saving. The optimal high-temperature can be exploited to reduce design cost and runtime operating power for overall cooling. Our findings are important for power and thermal management in future chips and process technologies. Keywords: timing margin; temperature inversion; power management; reliability; technology scaling}, url = {https://doi.org/10.1109/MICRO.2016.7783758}, author = {Zu, Yazhou and Huang, Wei and Paul, Indrani and Reddi, Vijay Janapa} } @patent {reddi2015adaptive, title = {Adaptive Event-Guided System and Method for Avoiding Voltage Emergencies}, volume = {8,949,666}, year = {2015}, edition = {United States of America}, chapter = {US}, author = {Reddi, Vijay Janapa and Gupta, Meeta Sharma and Holloway, Glenn and Gu-Yeon Wei and Smith, Michael D and David Brooks} } @conference {zu2015adaptive, title = {Adaptive Guardband Scheduling to Improve System-Level Efficiency of the Power7+}, booktitle = {MICRO-48: The 48th Annual IEEE/ACM International Symposium on Microarchitecture}, year = {2015}, pages = {308{\textendash}321}, publisher = {ACM}, organization = {ACM}, abstract = {The traditional guardbanding approach to ensure processor reliability is becoming obsolete because it always over-provisions voltage and wastes a lot of energy. As a next-generation alternative, adaptive guardbanding dynamically adjusts chip clock frequency and voltage based on timing margin measured at runtime. With adaptive guardbanding, voltage guardband is only provided when needed, thereby promising significant energy efficiency improvement. In this paper, we provide the first full-system analysis of adaptive guardbanding{\textquoteright}s implications using a POWER7+ multicore. On the basis of a broad collection of hardware measurements, we show the benefits of adaptive guardbanding in a practical setting are strongly dependent upon workload characteristics and chip-wide multicore activity. A key finding is that adaptive guardbanding{\textquoteright}s benefits diminish as the number of active cores increases, and they are highly dependent upon the workload running. Through a series of analyses, we show these high-level system effects are the result of interactions between the application characteristics, architecture and the underlying voltage regulator module{\textquoteright}s loadline effect and IR drop effects. To that end, we introduce adaptive guardband scheduling to reclaim adaptive guardbanding{\textquoteright}s efficiency under different enterprise scenarios.
Our solution reduces processor power consumption by 6.2\% over a highly optimized system, effectively doubling adaptive guardbanding{\textquoteright}s original improvement. Our solution also avoids malicious workload mappings to guarantee application QoS in the face of adaptive guardbanding hardware{\textquoteright}s variable performance.}, url = {https://doi.org/10.1145/2830772.2830824}, author = {Zu, Yazhou and Lefurgy, Charles R and Leng, Jingwen and Halpern, Matthew and Floyd, Michael S and Reddi, Vijay Janapa} } @conference {zhu2015event, title = {Event-Based Scheduling for Energy-Efficient QoS (EQoS) in Mobile Web Applications}, booktitle = {21st International Symposium on High Performance Computer Architecture (HPCA)}, year = {2015}, pages = {137{\textendash}149}, publisher = {IEEE}, organization = {IEEE}, abstract = {Mobile Web applications have become an integral part of our society. They pose a high demand for application quality of service (QoS). However, the energy-constrained nature of mobile devices makes optimizing for QoS difficult. Prior art on energy efficiency optimizations has only focused on the trade-off between raw performance and energy consumption, ignoring the application QoS characteristics. In this paper, we propose the concept of energy-efficient QoS (eQoS) to capture the trade-off between QoS and energy consumption. Given the fundamental event-driven nature of mobile Web applications, we further propose event-based scheduling as an optimization framework for eQoS. The event-based scheduling automatically reasons about users{\textquoteright} QoS requirements, and accurately slacks the events{\textquoteright} execution time to save energy without violating end users{\textquoteright} experience. We demonstrate a working prototype using the Google Chromium and V8 framework on the Samsung Exynos 5410 SoC (used in the Galaxy S4 smartphone). Based on real hardware and software measurements, we achieve 41.2\% energy saving with only 0.4\% of QoS violations perceptible to end users.}, url = {https://doi.org/10.1109/HPCA.2015.7056028}, author = {Zhu, Yuhao and Halpern, Matthew and Reddi, Vijay Janapa} } @conference {leng2015gpu, title = {Gpu Voltage Noise: Characterization and Hierarchical Smoothing of Spatial and Temporal Voltage Noise Interference in Gpu Architectures}, booktitle = {21st International Symposium on High Performance Computer Architecture (HPCA)}, year = {2015}, pages = {161{\textendash}173}, publisher = {IEEE}, organization = {IEEE}, abstract = {Energy efficiency is undoubtedly important for GPU architectures. Besides the traditionally explored energy-efficiency optimization techniques, exploiting the supply voltage guardband remains a promising yet unexplored opportunity. Our hardware measurements show that up to 23\% of the nominal supply voltage can be eliminated to improve GPU energy efficiency by as much as 25\%. The key obstacle for exploiting this opportunity lies in understanding the characteristics and root causes of large voltage droops in GPU architectures and subsequently smoothing them away without severe performance penalties. The GPU{\textquoteright}s manycore nature complicates the voltage noise phenomenon, and its distinctive architecture features from the CPU necessitate a GPU-specific voltage noise analysis. In this paper, we make the following contributions. First, we provide a voltage noise categorization framework to identify, characterize, and understand voltage noise in the manycore GPU architecture.
Second, we perform a microarchitecture-level voltage-droop root-cause analysis for the two major droop types we identify, namely the local first-order droop and the global second-order droop. Third, on the basis of our categorization and characterization, we propose a hierarchical voltage smoothing mechanism that mitigates each type of voltage droop. Our evaluation shows it can reduce up to 31\% worst-case droop, which translates to 11.8\% core-level and 7.8\% processor-level energy reduction}, url = {https://doi.org/10.1109/HPCA.2015.7056030}, author = {Leng, Jingwen and Zu, Yazhou and Reddi, Vijay Janapa} } @conference {halpern2015locality, title = {Locality Lost: Unlocking the Performance of Event-Driven Servers}, booktitle = {International Symposium on Microarchitecture}, year = {2015}, abstract = {Server-side Web applications are in the midst of a software evolution. Application developers are turning away from the established thread-per-request model, where each request gets a dedicated thread on the server, and toward event-driven programming platforms, which promise improved scalability and CPU utilization [1]. In response, we perform a microarchitectural analysis of these applications in current server processors and identify several serious performance bottlenecks and optimization opportunities for future processor designs.}, author = {Richins, Daniel and Zhu, Yuhao and Halpern, Matthew and Reddi, Vijay Janapa} } @conference {zhu2015microarchitectural, title = {Microarchitectural Implications of Event-Driven Server-Side Web Applications}, booktitle = {Proceedings of the 48th International Symposium on Microarchitecture}, year = {2015}, pages = {762{\textendash}774}, publisher = {ACM}, organization = {ACM}, abstract = {Enterprise Web applications are moving towards serverside scripting using managed languages. Within this shifting context, event-driven programming is emerging as a crucial programming model to achieve scalability. In this paper, we study the microarchitectural implications of server-side scripting, JavaScript in particular, from a unique event-driven programming model perspective. Using the Node.js framework, we come to several critical microarchitectural conclusions. First, unlike traditional server-workloads such as CloudSuite and BigDataBench that are based on the conventional threadbased execution model, event-driven applications are heavily single-threaded, and as such they require significant singlethread performance. Second, the single-thread performance is severely limited by the front-end inefficiencies of today{\textquoteright}s server processor microarchitecture, ultimately leading to overall execution inefficiencies. The front-end inefficiencies stem from the unique combination of limited intra-event code reuse and large inter-event reuse distance. 
Third, through a deep understanding of event-specific characteristics, architects can mitigate the front-end inefficiencies of the managed-language-based event-driven execution via a combination of instruction cache insertion policy and prefetcher.}, url = {https://doi.org/10.1145/2830772.2830792}, author = {Zhu, Yuhao and Richins, Daniel and Halpern, Matthew and Reddi, Vijay Janapa} } @conference {halpern2015mosaic, title = {Mosaic: Cross-Platform User-Interaction Record and Replay for the Fragmented Android Ecosystem}, booktitle = {Performance Analysis of Systems and Software (ISPASS), 2015 IEEE International Symposium on}, year = {2015}, pages = {215{\textendash}224}, publisher = {IEEE}, organization = {IEEE}, abstract = {In contrast to traditional computing systems, such as desktops and servers, that are programmed to perform {\textquotedblleft}compute-bound{\textquotedblright} and {\textquotedblleft}run-to-completion{\textquotedblright} tasks, mobile applications are designed for user interactivity. Factoring user interactivity into computer system design and evaluation is important, yet poses many challenges. In particular, systematically studying interactive mobile applications across the diverse set of mobile devices available today is difficult due to the mobile device fragmentation problem. At the time of writing, there are 18,796 distinct Android mobile devices on the market, and the number will only continue to increase in the future. Differences in screen sizes, resolutions and operating systems impose different interactivity requirements, making it difficult to uniformly study these systems. We present Mosaic, a cross-platform, timing-accurate record and replay tool for Android-based mobile devices. Mosaic overcomes device fragmentation through a novel virtual screen abstraction. User interactions are translated from a physical device into a platform-agnostic intermediate representation before translation to a target system. The intermediate representation is human-readable, which allows Mosaic users to modify previously recorded traces or even synthesize their own user interactive sessions from scratch. We demonstrate that Mosaic allows user interaction traces to be recorded on emulators, smartphones, tablets, and development boards and replayed on other devices. Using Mosaic we were able to replay 45 different Google Play applications across multiple devices, and also show that we can perform cross-platform performance comparisons between two different processors under identical user interactions.}, url = {https://doi.org/10.1109/ISPASS.2015.7095807}, author = {Halpern, Matthew and Zhu, Yuhao and Peri, Ramesh and Reddi, Vijay Janapa} } @article {zhu2015role, title = {The Role of the CPU in Energy-Efficient Mobile Web Browsing}, journal = {IEEE Micro}, volume = {35}, number = {1}, year = {2015}, pages = {26{\textendash}33}, publisher = {IEEE}, abstract = {The mobile CPU is starting to noticeably impact web browsing performance and energy consumption. Achieving energy-efficient mobile web browsing requires considering both CPU and network capabilities. Researchers must leverage interactions between the CPU and network to deliver high mobile web performance while maintaining a low energy footprint. 
Designing future high-performance and energy-efficient mobile web clients implies looking beyond individual components and taking a full system perspective.}, url = {https://doi.org/10.1109/MM.2015.8}, author = {Zhu, Yuhao and Halpern, Matthew and Reddi, Vijay Janapa} } @conference {leng2015safe, title = {Safe Limits on Voltage Reduction Efficiency in GPUs: A Direct Measurement Approach}, booktitle = {Microarchitecture (MICRO), 2015 48th Annual IEEE/ACM International Symposium on}, year = {2015}, pages = {294{\textendash}307}, publisher = {IEEE}, organization = {IEEE}, abstract = {Energy efficiency of GPU architectures has emerged as an important aspect of computer system design. In this paper, we explore the energy benefits of reducing the GPU chip{\textquoteright}s voltage to the safe limit, i.e., the Vmin point. We perform such a study on several commercial off-the-shelf GPU cards. We find that there exists about 20\% voltage guardband on those GPUs spanning two architectural generations, which, if {\textquotedblleft}eliminated{\textquotedblright} completely, can result in up to 25\% energy savings on one of the studied GPU cards. The exact improvement magnitude depends on the program{\textquoteright}s available guardband, because our measurement results unveil a program-dependent Vmin behavior across the studied programs. We make fundamental observations about the program-dependent Vmin behavior. We experimentally determine that the voltage noise has a larger impact on Vmin compared to the process and temperature variation, and the activities during the kernel execution cause large voltage droops. From these findings, we show how to use a kernel{\textquoteright}s microarchitectural performance counters to predict its Vmin value accurately. The average and maximum prediction errors are 0.5\% and 3\%, respectively. The accurate Vmin prediction opens up new possibilities of a cross-layer dynamic guardbanding scheme for GPUs, in which software predicts and manages the voltage guardband, while the functional correctness is ensured by a hardware safety net mechanism.}, url = {https://doi.org/10.1145/2830772.2830811}, author = {Leng, Jingwen and Buyuktosunoglu, Alper and Bertran, Ramon and Bose, Pradip and Reddi, Vijay Janapa} } @article {leng2014energy, title = {Energy Efficiency Benefits of Reducing the Voltage Guardband on the Kepler GPU Architecture}, journal = {Proc. of Silicon Errors in Logic {\textendash} System Effects (SELSE)}, year = {2014}, abstract = {Energy efficiency of GPU architectures has emerged as an important design criterion for both NVIDIA and AMD. In this paper, we explore the benefits of scaling a general-purpose GPU (GPGPU) core{\textquoteright}s supply voltage to the near limits of execution failure. We find that as much as 21\% of NVIDIA GTX 680{\textquoteright}s core supply voltage guardband can be eliminated to achieve significant energy efficiency improvement. Measured results indicate that the energy improvements can be as high as 25\% without any performance loss. The challenge, however, is to understand what impacts the minimum voltage guardband and how the guardband can be scaled without compromising correctness. We show that GPU microarchitectural activity patterns caused by different program characteristics are the root cause(s) of the large voltage guardband. We also demonstrate how microarchitecture-level parameters, such as clock frequency and the number of cores, impact the guardband. 
We hope our preliminary analysis lays the groundwork for future research.}, author = {Leng, Jingwen and Zu, Yazhou and Reddi, Vijay Janapa} } @conference {zhou2014estimation, title = {Estimation of Instantaneous Frequency Fluctuation in a Fast DVFS Environment Using an Empirical BTI Stress-Relaxation Model}, booktitle = {Proceedings of the International Reliability Physics Symposium (IRPS)}, year = {2014}, pages = {2D{\textendash}2}, publisher = {IEEE}, organization = {IEEE}, abstract = {This work proposes an empirical Bias Temperature Instability (BTI) stress-relaxation model based on the superposition property. The model was used to study the instantaneous frequency fluctuation in a fast Dynamic Voltage and Frequency Scaling (DVFS) environment. VDD and operating frequency information for this study were collected from an ARM Cortex-A15 processor-based development board running an Android operating system. Simulation results show that the frequency peaks and dips are functions of mainly two parameters: (1) the amount of stress or recovery experienced by the circuit prior to the VDD switching and (2) the frequency sensitivity to device aging after the VDD switching.}, url = {https://doi.org/10.1109/IRPS.2014.6860593}, author = {Zhou, Chen and Wang, Xiaofei and Xu, Weichao and Zhu, Yuhao and Reddi, Vijay Janapa and Kim, Chris H} } @article {zhu2014exploiting, title = {Exploiting Webpage Characteristics for Energy-Efficient Mobile Web Browsing}, journal = {Computer Architecture Letters (CAL)}, volume = {13}, number = {1}, year = {2014}, pages = {33{\textendash}36}, publisher = {IEEE}, abstract = {Web browsing on mobile devices is undoubtedly the future. However, with the increasing complexity of webpages, the mobile device{\textquoteright}s computation capability and energy consumption become major pitfalls for a satisfactory user experience. In this paper, we propose a mechanism to effectively leverage processor frequency scaling in order to balance the performance and energy consumption of mobile web browsing. This mechanism explores the performance and energy tradeoff in webpage loading, and schedules webpage loading according to the webpages{\textquoteright} characteristics, using the different frequencies. The proposed solution achieves 20.3\% energy saving compared to the performance mode, and improves webpage loading performance by 37.1\% compared to the battery saving mode. Index Terms{\textemdash}Energy, EDP, Cutoff, Performance, Webpages}, url = {https://doi.org/10.1109/L-CA.2012.33}, author = {Zhu, Yuhao and Srikanth, Aditya and Leng, Jingwen and Reddi, Vijay Janapa} } @conference {leng2014gpuvolt, title = {GPUVolt: Modeling and Characterizing Voltage Noise in GPU Architectures}, booktitle = {Proceedings of the International Symposium on Low Power Electronics and Design (ISLPED)}, year = {2014}, pages = {141{\textendash}146}, publisher = {ACM}, organization = {ACM}, abstract = {Voltage noise is a major obstacle in improving processor energy efficiency because it necessitates large operating voltage guardbands that increase overall power consumption and limit peak performance. Identifying the leading root causes of voltage noise is essential to minimize the unnecessary guardband and maximize the overall energy efficiency. We provide the first-ever modeling and characterization of voltage noise in GPUs based on a new simulation infrastructure called GPUVolt. 
Using it, we identify the key intracore microarchitectural components (e.g., the register file and special functional units) that significantly impact the GPU{\textquoteright}s voltage noise. We also demonstrate that intercore-aligned microarchitectural activity detrimentally impacts the chipwide worst-case voltage droops. On the basis of these findings, we propose a combined register-file and execution-unit throttling mechanism that smooths GPU voltage noise and reduces the guardband requirement by as much as 29\%. Categories and Subject Descriptors: C.4 [Performance of Systems]: Modeling techniques, Reliability, availability, and serviceability. Keywords: di/dt, inductive noise, GPU architecture, GPU reliability}, author = {Leng, Jingwen and Zu, Yazhou and Rhu, Minsoo and Gupta, Meeta and Reddi, Vijay Janapa} } @booklet {chai2014lightweight, title = {Lightweight Detection and Recovery Mechanisms to Extend Algorithm Resiliency in Noisy Computation}, journal = {Workshop on Near-threshold Computing (WNTC)}, year = {2014}, abstract = {The intrinsic robustness of an algorithm and architecture depends highly on the combined ability to tolerate noise. In this paper, we present an alternative approach for energy reduction for near threshold computing based on a statistical modeling of computational noise induced from noisy memory and non-ideal interconnects. We present this approach as a complement to the standard approximate computing approaches. We show results of the lightweight error checks and recovery based on several design considerations on data value speculation. Index Terms{\textemdash}Approximate computing, noise resiliency, computation noise, near threshold computing}, author = {Chai, Sek and Zhang, David and Leng, Jingwen and Reddi, Vijay Janapa} } @booklet {kazdagli2014morpheus, title = {Morpheus: Benchmarking Computational Diversity in Mobile Malware}, journal = {Workshop on Hardware and Architectural Support for Security and Privacy (HASP)}, year = {2014}, publisher = {ACM}, abstract = {Computational characteristics of a program can potentially be used to identify malicious programs from benign ones. However, systematically evaluating malware detection techniques, especially when malware samples are hard to run correctly and can adapt their computational characteristics, is a hard problem. We introduce Morpheus {\textendash} a benchmarking tool that includes both real mobile malware and a synthetic malware generator that can be configured to generate a computationally diverse malware sample-set {\textendash} as a tool to evaluate computational-signature-based malware detection. Morpheus also includes a set of computationally diverse benign applications that can be used to repackage malware into, along with a recorded trace of over an hour of realistic human usage for each app that can be used to replay both benign and malicious executions. The current Morpheus prototype targets Android applications and malware samples. Using Morpheus, we quantify the computational diversity in malware behavior and expose opportunities for dynamic analyses that can detect mobile malware. Specifically, the use of obfuscation and encryption to thwart static analyses causes the malicious execution to be more distinctive {\textendash} a potential opportunity for detection. 
We also present potential challenges, specifically, minimizing false positives that can arise due to diversity of benign executions.Categories and Subject DescriptorsD.4.6 [Security and Protection]: Invasive softwareKeywordssecurity, mobile malware, performance counters}, author = {Kazdagli, Mikhail and Huang, Ling and REDDI, Vijay and Tiwari, Mohit} } @article {zhu2014webcore, title = {WebCore: Architectural Support for Mobile Web Browsing}, journal = {Proceedings of the 41st International Symposium on Computer Architecture (ISCA)}, volume = {42}, number = {3}, year = {2014}, pages = {541{\textendash}552}, publisher = {ACM}, abstract = {The Web browser is undoubtedly the single most important application in the mobile ecosystem. An average user spends 72 minutes each day using the mobile Web browser. Web browser internal engines (e.g., WebKit) are also growing in importance because they provide a common substrate for developing various mobile Web applications. In a user-driven, interactive, and latency-sensitive environment, the browser{\textquoteright}s performance is crucial. However, the battery-constrained nature of mobile devices limits the performance that we can deliver for mobile Web browsing. As traditional general-purpose techniques to improve performance and energy efficiency fall short, we must employ domain-specific knowledge while still maintaining general-purpose flexibility.In this paper, we first perform design-space exploration to identify appropriate general-purpose architectures that uniquely fit the characteristics of a popular Web browsing engine. Despite our best effort, we discover sources of energy inefficiency in these customized general-purpose architectures. To mitigate these inefficiencies, we propose, synthesize, and evaluate two new domain-specific specializations, called the Style Resolution Unit and the Browser Engine Cache. Our optimizations boost energy efficiency and at the same time improve mobile Web browsing performance. As emerging mobile workloads increasingly rely more on Web browser technologies, the type of optimizations we propose will become important in the future and are likely to have lasting widespread impact.}, url = {https://doi.org/10.1109/ISCA.2014.6853239}, author = {Zhu, Yuhao and Reddi, Vijay Janapa} } @conference {guckert2013case, title = {A Case for Persistent Caching of Compiled Javascript Code in Mobile Web Browsers}, booktitle = {Workshop on Architectural and Microarchitectural Support for Binary Translation (AMAS-BT)}, year = {2013}, abstract = { Over the past decade webpages have grown an order of magnitude in computational complexity. Modern webpages provide rich and complex interactive behaviors for differentiated user experiences. Many of these new capabilities are delivered via JavaScript embedded within these webpages. In this work, we evaluate the potential benefits of persistently caching compiled JavaScript code in the Mozilla JavaScript engine within the Firefox browser. We cache compiled byte codes and generated native code across browser sessions to eliminate the redundant compilation work that occurs when webpages are revisited. Current browsers maintain persistent caches of code and images received over the network. Current browsers also maintain inmemory {\textquotedblleft}caches{\textquotedblright} of recently accessed webpages (WebKit{\textquoteright}s Page Cache or Firefox{\textquoteright}s {\textquotedblleft}Back-Forward{\textquotedblright} cache) that do not persist across browser sessions. 
This paper assesses the performance improvement and power reduction opportunities that arise from caching compiled JavaScript across browser sessions. We show that persistent caching can achieve an average of 91\% reduction in compilation time for top webpages and 78\% for HTML5 webpages. It also reduces energy consumption by an average of 23\% as compared to the baseline. }, author = {Guckert, Lauren and O{\textquoteright}Connor, Mike and Ravindranath, S Kumar and Zhao, Zhuoran and Reddi, V Janapa} } @conference {leng2013gpuwattch, title = {GPUWattch: Enabling Energy Optimizations in GPGPUs}, booktitle = {ACM SIGARCH Computer Architecture News}, volume = {41}, number = {3}, year = {2013}, pages = {487{\textendash}498}, publisher = {ACM}, organization = {ACM}, abstract = {General-purpose GPUs (GPGPUs) are becoming prevalent in mainstream computing, and performance per watt has emerged as a more crucial evaluation metric than peak performance. As such, GPU architects require robust tools that will enable them to quickly explore new ways to optimize GPGPUs for energy efficiency. We propose a new GPGPU power model that is configurable, capable of cycle-level calculations, and carefully validated against real hardware measurements. To achieve configurability, we use a bottom-up methodology and abstract parameters from the microarchitectural components as the model{\textquoteright}s inputs. We developed a rigorous suite of 80 microbenchmarks that we use to bound any modeling uncertainties and inaccuracies. The power model is comprehensively validated against measurements of two commercially available GPUs, and the measured error is within 9.9\% and 13.4\% for the two target GPUs (GTX 480 and Quadro FX5600). The model also accurately tracks the power consumption trend over time. We integrated the power model with the cycle-level simulator GPGPU-Sim and demonstrate the energy savings by utilizing dynamic voltage and frequency scaling (DVFS) and clock gating. Traditional DVFS reduces GPU energy consumption by 14.4\% by leveraging within-kernel runtime variations. More finer-grained SM cluster-level DVFS improves the energy savings from 6.6\% to 13.6\% for those benchmarks that show clustered execution behavior. We also show that clock gating inactive lanes during divergence reduces dynamic power by 11.2\%.Categories and Subject DescriptorsC.1.4 [Processor Architectures]: Parallel Architectures; C.4 [Performance of Systems]: Modeling techniquesGeneral TermsExperimentation, Measurement, Power, PerformanceKeywordsEnergy, CUDA, GPU architecture, Power estimation}, url = {https://doi.org/10.1145/2485922.2485964}, author = {Leng, Jingwen and Hetherington, Tayler and ElTantawy, Ahmed and Gilani, Syed and Kim, Nam Sung and Aamodt, Tor M and Reddi, Vijay Janapa} } @conference {zhu2013high, title = {High-Performance and Energy-Efficient Mobile Web Browsing on Big/Little Systems}, booktitle = {High Performance Computer Architecture (HPCA2013), 2013 IEEE 19th International Symposium on}, year = {2013}, pages = {13{\textendash}24}, publisher = {IEEE}, organization = {IEEE}, abstract = {Internet web browsing has reached a critical tipping point. Increasingly, users rely more on mobile web browsers to access the Internet than desktop browsers. Meanwhile, webpages over the past decade have grown in complexity by more than tenfold. 
The fast penetration of mobile browsing and ever-richer webpages implies a growing need for high-performance mobile devices in the future to ensure a continued end-user browsing experience. Failing to deliver webpages meeting hard cut-off constraints could directly translate to webpage abandonment or, for e-commerce websites, great revenue loss. However, mobile devices{\textquoteright} limited battery capacity limits the degree of performance that mobile web browsing can achieve. In this paper, we demonstrate the benefits of heterogeneous systems with big/little cores each with different frequencies to achieve the ideal trade-off between high performance and energy efficiency. Through detailed characterizations of different webpage primitives based on the hottest 5,000 webpages, we build statistical inference models that estimate webpage load time and energy consumption. We show that leveraging such predictive models lets us identify and schedule webpages using the ideal core and frequency configuration that minimizes energy consumption while still meeting stringent cut-off constraints. Real hardware and software evaluations show that our scheduling scheme achieves 83.0\% energy savings, while only violating the cut-off latency for 4.1\% more webpages as compared with a performance-oriented hardware strategy. Against a more intelligent, OS-driven, dynamic voltage and frequency scaling scheme, it achieves 8.6\% energy savings and 4.0\% performance improvement simultaneously.}, url = {https://doi.org/10.1109/HPCA.2013.6522303}, author = {Zhu, Yuhao and Reddi, Vijay Janapa} } @booklet {kanev2013measuring, title = {Measuring Code Optimization Impact on Voltage Noise}, journal = {Workshop on Silicon Errors in Logic - System Effects (SELSE)}, year = {2013}, abstract = {In this paper, we characterize the impact of compiler optimizations on voltage noise. While intuition may suggest that the better processor utilization ensured by optimizing compilers results in a small amount of voltage variation, our measurements on an Intel Core 2 Duo processor show the opposite {\textendash} the majority of SPEC 2006 benchmarks exhibit more voltage droops when aggressively optimized. We show that this increase in noise could be sufficient for a net performance decrease in a typical-case, resilient design.}, author = {Svilen Kanev and Jones, Timothy M and Gu-Yeon Wei and Brooks, David M and Reddi, Vijay Janapa} } @article {reddi2013reliability, title = {Reliability-Aware Microarchitecture Design}, journal = {IEEE Micro}, number = {4}, year = {2013}, pages = {4{\textendash}5}, publisher = {IEEE}, url = {https://doi.org/10.1109/MM.2013.87}, author = {Reddi, Vijay Janapa} } @book {reddi2013resilient, title = {Resilient Architecture Design for Voltage Variation}, series = {Synthesis Lectures on Computer Architecture}, volume = {8}, number = {2}, year = {2013}, pages = {1{\textendash}138}, publisher = {Morgan \& Claypool Publishers}, organization = {Morgan \& Claypool Publishers}, abstract = {Shrinking feature size and diminishing supply voltage are making circuits sensitive to supply voltage fluctuations within the microprocessor, caused by normal workload activity changes. If left unattended, voltage fluctuations can lead to timing violations or even transistor lifetime issues that degrade processor robustness. 
Mechanisms that learn to tolerate, avoid, and eliminate voltage fluctuations based on program and microarchitectural events can help steer the processor clear of danger, thus enabling tighter voltage margins that improve performance or lower power consumption. We describe the problem of voltage variation and the factors that influence this variation during processor design and operation. We also describe a variety of runtime hardware and software mitigation techniques that either tolerate, avoid, and/or eliminate voltage violations. We hope processor architects will find the information useful since tolerance, avoidance, and elimination are generalizable constructs that can serve as a basis for addressing other reliability challenges as well. Keywords: voltage noise, voltage smoothing, di/dt, inductive noise, voltage emergencies, error detection, error correction, error recovery, transient errors, power supply noise, power delivery networks}, url = {https://doi.org/10.1109/TVLSI.2009.2025279}, author = {Reddi, Vijay Janapa and Gupta, Meeta Sharma} } @conference {reddi2012hardware, title = {Hardware and Software Co-Design for Robust and Resilient Execution}, booktitle = {Collaboration Technologies and Systems (CTS), 2012 International Conference on}, year = {2012}, pages = {380{\textendash}380}, publisher = {IEEE}, organization = {IEEE}, author = {Reddi, Vijay Janapa} } @conference {campanoni2012helix, title = {HELIX: Automatic Parallelization of Irregular Programs for Chip Multiprocessing}, booktitle = {Proceedings of the Tenth International Symposium on Code Generation and Optimization}, year = {2012}, pages = {84{\textendash}93}, publisher = {ACM}, organization = {ACM}, abstract = {We describe and evaluate HELIX, a new technique for automatic loop parallelization that assigns successive iterations of a loop to separate threads. We show that the inter-thread communication costs forced by loop-carried data dependences can be mitigated by code optimization, by using an effective heuristic for selecting loops to parallelize, and by using helper threads to prefetch synchronization signals. We have implemented HELIX as part of an optimizing compiler framework that automatically selects and parallelizes loops from general sequential programs. The framework uses an analytical model of loop speedups, combined with profile data, to choose loops to parallelize. On a six-core Intel Core i7-980X, HELIX achieves speedups averaging 2.25{\texttimes}, with a maximum of 4.12{\texttimes}, for thirteen C benchmarks from SPEC CPU2000.}, url = {https://doi.org/10.1145/2259016.2259028}, author = {Campanoni, Simone and Jones, Timothy and Holloway, Glenn and Reddi, Vijay Janapa and Gu-Yeon Wei and David Brooks} } @conference {reddi2012robust, title = {Robust and Resilient Designs from the Bottom-Up: Technology, CAD, Circuit, and System Issues}, booktitle = {Design Automation Conference (ASP-DAC), 2012 17th Asia and South Pacific}, year = {2012}, pages = {7{\textendash}16}, publisher = {IEEE}, organization = {IEEE}, abstract = {The semiconductor industry is facing a critical research challenge: design future high performance and energy efficient systems while satisfying historical standards for reliability and lower costs. The primary cause of this challenge is device and circuit parameter variability, which results from the manufacturing process and system operation. As technology scales, the adverse impact of these variations on system-level metrics increases. 
In this paper, we describe an interdisciplinary effort toward robust and resilient designs that mitigate the effects of device and circuit parameter variations in order to enhance system performance, energy efficiency, and reliability. Collaboration between the technology, CAD, circuit, and system levels of the compute hierarchy can foster the development of cost-effective and efficient solutions.}, url = {https://doi.org/10.1109/ASPDAC.2012.6165064}, author = {Reddi, Vijay Janapa and Pan, David Z and Nassif, Sani R and Bowman, Keith A} } @conference {bailis2011dimetrodon, title = {Dimetrodon: processor-level preventive thermal management via idle cycle injection}, booktitle = {Design Automation Conference (DAC), 2011 48th ACM/EDAC/IEEE}, year = {2011}, pages = {89{\textendash}94}, publisher = {IEEE}, organization = {IEEE}, author = {Bailis, Peter and Reddi, Vijay Janapa and Gandhi, Sanjay and David Brooks and Seltzer, Margo} } @conference {1437311, title = {Mobile Processors for Energy-Efficient Web Search}, booktitle = {Transactions on Computer Systems}, volume = {29}, year = {2011}, publisher = {ACM}, organization = {ACM}, edition = {4}, abstract = {As cloud and utility computing spreads, computer architects must ensure continued capability growth for the data centers that comprise the cloud. Given megawatt scale power budgets, increasing data center capability requires increasing computing hardware energy efficiency. To increase the data center{\textquoteright}s capability for work, the work done per Joule must increase. We pursue this efficiency even as the nature of data center applications evolves. Unlike traditional enterprise workloads, which are typically memory or I/O bound, big data computation and analytics exhibit greater compute intensity. This article examines the efficiency of mobile processors as a means for data center capability. In particular, we compare and contrast the performance and efficiency of the Microsoft Bing search engine executing on the mobile-class Atom processor and the server-class Xeon processor. Bing implements statistical machine learning to dynamically rank pages, producing sophisticated search results but also increasing computational intensity. While mobile processors are energy-efficient, they exact a price for that efficiency. The Atom is 5{\texttimes}\ more energy-efficient than the Xeon when comparing queries per Joule. However, search queries on Atom encounter higher latencies, different page results, and diminished robustness for complex queries. Despite these challenges, quality-of-service is maintained for most, common queries. Moreover, as different computational phases of the search engine encounter different bottlenecks, we describe implications for future architectural enhancements, application tuning, and system architectures. After optimizing the Atom server platform, a large share of power and cost go toward processor capability. With optimized Atoms, more servers can fit in a given data center power budget. 
For a data center with 15MW critical load, Atom-based servers increase capability by 3.2{\texttimes}\ for Bing.}, author = {Reddi, Vijay Janapa and Lee, Benjamin and Chilimbi, Trishul and Vaid, Kushagra} } @article {reddi2011resilient, title = {Resilient Architectures via Collaborative Design: Maximizing Commodity Processor Performance in the Presence of Variations}, journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, volume = {30}, number = {10}, year = {2011}, pages = {1429{\textendash}1445}, publisher = {IEEE}, abstract = {Unintended variations in circuit lithography and undesirable fluctuations in circuit operating parameters such as supply voltage and temperature are threatening the continuation of technology scaling that microprocessor evolution relies on. Although circuit-level solutions for some variation problems may be possible, they are prohibitively expensive and impractical for commodity processors, on which not only the consumer market but also an increasing segment of the business market now depends. Solutions at the microarchitecture level and even the software level, on the other hand, overcome some of these circuitlevel challenges without significantly raising costs or lowering performance. Using examples drawn from our Alarms Project and related work, we illustrate how collaborative design that encompasses circuits, architecture, and chip-resident software leads to a cost-effective solution for inductive voltage noise, sometimes called the dI/dt problem.The strategy that we use for assuring correctness while preserving performance can be extended to other variation problems. Index Terms{\textemdash}Dynamic variation, error correction, error detection, error recovery, error resiliency, hw/sw co-design, inductive noise, power supply noise, reliability, resilient design, resilient microprocessor, timing error, variation, voltage droop.}, url = {https://doi.org/10.1109/TCAD.2011.2163635}, author = {Reddi, Vijay Janapa and David Brooks} } @article {1595420, title = { Voltage Noise in Production Processors}, journal = {IEEE Micro}, volume = {31}, number = {1}, year = {2011}, pages = {20-28}, abstract = {Voltage variations are a major challenge in processor design. Here, researchers characterize the voltage noise characteristics of programs as they run to completion on a production Core 2 Duo processor. Furthermore, they characterize the implications of resilient architecture design for voltage variation in future systems.}, url = {https://ieeexplore.ieee.org/document/5661758}, author = {Reddi, Vijay Janapa and Svilen Kanev and Kim, Wonyoung and Campanoni, Simone and Michael D. Smith and Gu-Yeon Wei and David Brooks} } @article {reddi2010eliminating, title = {Eliminating Voltage Emergencies via Software-Guided Code Transformations}, journal = {ACM Transactions on Architecture and Code Optimization (TACO)}, volume = {7}, number = {2}, year = {2010}, pages = {12}, publisher = {ACM}, abstract = {In recent years, circuit reliability in modern high-performance processors has become increasingly important. Shrinking feature sizes and diminishing supply voltages have made circuits more sensitive to microprocessor supply voltage fluctuations. These fluctuations result from the natural variation of processor activity as workloads execute, but when left unattended, these voltage fluctuations can lead to timing violations or even transistor lifetime issues. 
In this paper, we present a hardware-software collaborative approach to mitigate voltage fluctuations. A checkpoint-recovery mechanism rectifies errors when voltage violates maximum tolerance settings, while a run-time software layer reschedules the program{\textquoteright}s instruction stream to prevent recurring violations at the same program location. The run-time layer, combined with the proposed code rescheduling algorithm, removes 60\% of all violations with minimal overhead, thereby significantly improving overall performance. Our solution is a radical departure from the ongoing industry standard approach to circumvent the issue altogether by optimizing for the worst case voltage flux, which compromises power and performance efficiency severely, especially looking ahead to future technology generations. Existing conservative approaches will have severe implications on the ability to deliver efficient microprocessors. The proposed technique reassembles a traditional reliability problem as a runtime performance optimization problem, thus allowing us to design processors for typical case operation by building intelligent algorithms that can prevent recurring violations.Categories and Subject Descriptors: B.8.1 [Performance and Reliability]: Reliability, Testing, and Fault-ToleranceGeneral Terms: Performance, ReliabilityAdditional Key Words and Phrases: Voltage Noise, dI/dt, Inductive Noise, Voltage Emergencies}, url = {https://doi.org/10.1145/1839667.1839674}, author = {Reddi, Vijay Janapa and Campanoni, Simone and Gupta, Meeta S and Smith, Michael D and Gu-Yeon Wei and David Brooks and Hazelwood, Kim} } @mastersthesis {reddi2010software, title = {Software-Assisted Hardware Reliability: Enabling Aggressive Timing Speculation Using Run-time Feedback from Hardware and Software}, year = {2010}, school = {Harvard University}, type = {phd}, abstract = {In the era of nanoscale technology scaling, we are facing the limits of physics, challenging robust and reliable microprocessor design and fabrication. As these trends continue, guaranteeing correctness of execution is becoming prohibitively expensive and impractical. In this thesis, we demonstrate the benefits of abstracting circuit-level challenges to the architecture and software layers. Reliability challenges are broadly classified into process, voltage, and thermal variations. As proof of concept, we target voltage variation, which is least understood, demonstrating its growing detrimental effects on future processors: Shrinking feature size and diminishing supply voltage are making circuits more sensitive to supply voltage fluctuations within the microprocessor. If left unattended, these voltage fluctuations can lead to timing violations or even transistor lifetime issues. This problem, more commonly known as the dI/dt problem, is forcing microprocessor designers to increasingly sacrifice processor performance, as well as power efficiency, in order to guarantee correctness and robustness of operation. Industry addresses this problem by un-optimizing the processor for the worst case voltage flux. Setting such extreme operating voltage margins for those large and\ infrequent voltage swings is not a sustainable solution in the long term. Therefore, we depart from this traditional strategy and operate the processor under more typical case conditions. 
We demonstrate that a collaborative architecture between hardware and software enables aggressive operating voltage margins, and as a consequence improves processor performance and power efficiency. This co-designed architecture is built on the principles of tolerance, avoidance and elimination. Using a fail-safe hardware mechanism to tolerate voltage margin violations, we enable timing speculation, while a run-time hardware and software layer attempts to not only predict and avoid impending violations, but also reschedules instructions and co-schedules threads intelligently to eliminate voltage violations altogether. We believe tolerance, avoidance and elimination are generalizable constructs capable of acting as guidelines to address and successfully mitigate the other parameter-related reliability challenges as well.}, url = {https://doi.org/10.1145/1629911.1630114}, author = {Reddi, Vijay Janapa} } @article {kanevsystem, title = {A System-Level View of Voltage Noise in Production Processors}, journal = {ACM Transactions on Architecture and Code Optimization}, volume = {9}, number = {4}, year = {2010}, abstract = {Parameter variations have become a dominant challenge in microprocessor design. Voltage variation is especially daunting because it happens rapidly. We measure and characterize voltage variation in a running Intel Core 2 Duo processor. By sensing on-die voltage as the processor runs single-threaded, multi-threaded, and multi-program workloads, we determine the average supply voltage swing of the processor to be only 4\%, far from the processor{\textquoteright}s 14\% worst-case operating voltage margin. While such large margins guarantee correctness, they penalize performance and power efficiency. We investigate and quantify the benefits of designing a processor for typical-case (rather than worst-case) voltage swings, assuming that a fail-safe mechanism protects it from infrequently occurring large voltage fluctuations. With the investigated processors, such resilient designs could yield 15\% to 20\% performance improvements. But we also show that in future systems, these gains could be lost as increasing voltage swings intensify the frequency of fail-safe recoveries. After characterizing microarchitectural activity that leads to voltage swings within multi-core systems, we show two software techniques that have the potential to mitigate such voltage emergencies. A voltage-aware compiler can choose to de-optimize for performance in favor of better noise behavior, while a thread scheduler can co-schedule phases of different programs to mitigate error recovery overheads in future resilient processor designs.}, author = {Svilen Kanev and Reddi, Vijay Janapa and Timothy M. Jones and Kim, Wonyoung and Campanoni, Simone and Michael D. Smith and Gu-Yeon Wei and David Brooks} } @conference {reddi2010voltage, title = {Voltage Smoothing: Characterizing and Mitigating Voltage Noise in Production Processors via Software-Guided Thread Scheduling}, booktitle = {Proceedings of the 2010 43rd Annual IEEE/ACM International Symposium on Microarchitecture}, year = {2010}, pages = {77{\textendash}88}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, abstract = {More than 20\% of the available energy is lost in {\textquotedblleft}the last centimeter{\textquotedblright} from the PCB board to the microprocessor chip due to inherent inefficiencies of power delivery subsystems (PDSs) in today{\textquoteright}s computing systems. 
By series-stacking multiple voltage domains to eliminate explicit voltage conversion and reduce loss along the power delivery path, voltage stacking (VS) is a novel configuration that can improve power delivery efficiency (PDE). However, VS suffers from aggravated levels of supply noise caused by current imbalance between the stacking layers, preventing its practical adoption in mainstream computing systems. Throughput-centric manycore architectures such as GPUs intrinsically exhibit more balanced workloads, yet suffer from lower PDE, making them ideal platforms to implement voltage stacking. In this paper, we present a cross-layer approach to practical voltage stacking implementation in GPUs. It combines circuit-level voltage regulation using distributed charge-recycling integrated voltage regulators (CR-IVRs) with architecture-level voltage smoothing guided by control theory. Our proposed voltage-stacked GPUs can eliminate 61.5\% of total PDS energy loss and achieve 92.3\% system-level power delivery efficiency, a 12.3\% improvement over the conventional single-layer based PDS. Compared to the circuit-only solution, the cross-layer approach significantly reduces the implementation cost of voltage stacking (88\% reduction in area overhead) without compromising supply reliability under worst-case scenarios and across a wide range of real-world benchmarks. In addition, we demonstrate that the cross-layer solution not only complements on-chip CR-IVRs to transparently manage current imbalance and restore stable layer voltages, but also serves as a seamless interface to accommodate higher-level power optimization techniques, traditionally thought to be incompatible with a VS configuration.}, url = {https://doi.org/10.1109/MICRO.2010.35}, author = {Reddi, Vijay Janapa and Svilen Kanev and Kim, Wonyoung and Campanoni, Simone and Smith, Michael D and Gu-Yeon Wei and David Brooks} } @conference {reddi2010web, title = {Web Search Using Mobile Cores: Quantifying and Mitigating the Price of Efficiency}, booktitle = {International Symposium on Computer Architecture}, year = {2010}, abstract = {The commoditization of hardware, data center economies of scale, and Internet-scale workload growth all demand greater power efficiency to sustain scalability. Traditional enterprise workloads, which are typically memory and I/O bound, have been well served by chip multiprocessors comprising of small, power-efficient cores. Recent advances in mobile computing have led to modern small cores capable of delivering even better power efficiency. While these cores can deliver performance-per-Watt efficiency for data center workloads, small cores impact application quality-of-service robustness, and flexibility, as these workloads increasingly invoke computationally intensive kernels. These challenges constitute the price of efficiency. 
We quantify efficiency for an industry-strength online web search engine in production at both the microarchitecture- and system-level, evaluating search on server and mobile-class architectures using Xeon and Atom processors.Categories and Subject DescriptorsC.0 [Computer Systems Organization]: General{\textemdash}System architectures; C.4 [Computer Systems Organization]: Performance of Systems{\textemdash}Design studies, Reliability, availability, and serviceabilityGeneral TermsMeasurement, Experimentation, Performance}, url = {https://dl.acm.org/citation.cfm?doid=1816038.1816002}, author = {Reddi, Vijay Janapa and Lee, Benjamin and Chilimbi, Trishul and Vaid, Kushagra} } @conference {gupta2009event, title = {An Event-Guided Approach to Handling Inductive Noise in Processors}, booktitle = {Design, Automation, and Test in Europe Conference (DATE-09)}, year = {2009}, address = {Nice, France}, abstract = { Supply voltage fluctuations that result from inductive noise are increasingly troublesome in modern microprocessors. A voltage {\textquotedblleft}emergency{\textquotedblright}, i.e., a swing beyond tolerable operating margins, jeopardizes the safe and correct operation of the processor. Techniques aimed at reducing power consumption, e.g., by clock gating or by reducing nominal supply voltage, exacerbate this noise problem, requiring ever-wider operating margins. We propose an event-guided, adaptive method for avoiding voltage emergencies, which exploits the fact that most emergencies are correlated with unique microarchitectural events, such as cache misses or the pipeline flushes that follow branch mispredictions. Using checkpoint and rollback to handle unavoidable emergencies, our method adapts dynamically by learning to trigger avoidance mechanisms when emergency-prone events recur. After tightening supply voltage margins to increase clock frequency and accounting for all costs, the net result is a performance improvement of 8\% across a suite of fifteen SPEC CPU2000 benchmarks. }, url = {https://doi.org/10.1109/DATE.2009.5090651}, author = {Gupta, Meeta S and Reddi, Vijay Janapa and Smith, Michael D and Gu-Yeon Wei and Brooks, David M} } @conference {gupta2009event, title = {An Event-Guided Approach to Reducing Voltage Noise in Processors}, booktitle = {Design, Automation \& Test in Europe Conference \& Exhibition, 2009. DATE{\textquoteright}09.}, year = {2009}, pages = {160{\textendash}165}, publisher = {IEEE}, organization = {IEEE}, url = {https://doi.org/10.1109/DATE.2009.5090651}, author = {Gupta, Meeta S and Reddi, Vijay Janapa and Holloway, Glenn and Gu-Yeon Wei and Brooks, David M} } @article {shye2009plr, title = {PLR: A Software Approach to Transient Fault Tolerance for Multicore Architectures}, journal = {IEEE Transactions on Dependable and Secure Computing}, volume = {6}, number = {2}, year = {2009}, pages = {135{\textendash}148}, publisher = {IEEE}, abstract = {Transient faults are emerging as a critical concern in the reliability of general-purpose microprocessors. As architectural trends point towards multi-core designs, there is substantial interest in adapting such parallel hardware resources for transient fault tolerance. This paper presents process-level redundancy (PLR), a software technique for transient fault tolerance which leverages multiple cores for low overhead. PLR creates a set of redundant processes per application process, and systematically compares the processes to guarantee correct execution. 
Redundancy at the process level allows the operating system to freely schedule the processes across all available hardware resources. PLR uses a software-centric approach to transient fault tolerance which shifts the focus from ensuring correct hardware execution to ensuring correct software execution. As a result, many benign faults that do not propagate to affect program correctness can be safely ignored. A real prototype is presented that is designed to be transparent to the application and can run on general-purpose single-threaded programs without modifications to the program, operating system, or underlying hardware. The system is evaluated for fault coverage and performance on 4-way SMP machine, and provides improved performance over existing software transient fault tolerance techniques with an 16.9\% overhead for fault detection on a set of optimized SPEC2000 binaries.Index Terms{\textemdash}fault tolerance, reliability, transient faults, soft errors, process-level redundancy}, url = {https://doi.org/10.1109/TDSC.2008.62}, author = {Shye, Alex and Blomstedt, Joseph and Moseley, Tipp and Reddi, Vijay Janapa and Connors, Daniel A} } @conference {reddi2009software, title = {Software-Assisted Hardware Reliability: Abstracting Circuit-Level Challenges to the Software Stack}, booktitle = {Proceedings of the 46th Annual Design Automation Conference}, year = {2009}, pages = {788{\textendash}793}, publisher = {ACM}, organization = {ACM}, abstract = {Power constrained designs are becoming increasingly sensitive to supply voltage noise. We propose a hardware-software collaborative approach to enable aggressive operating margins: a checkpoint-recovery mechanism corrects margin violations, while a run-time software layer reschedules the program{\textquoteright}s instruction stream to prevent recurring margin crossings at the same program location. The run-time layer removes 60\% of these events with minimal overhead, thereby significantly improving overall performance.Categories and Subject DescriptorsC.0 [Computer Systems Organization]: General{\textemdash} Hardware/Software interfaces and System architectures.General TermsPerformance, Reliability.KeywordsRuntime Optimization, Hardware Software Co-Design.}, url = {https://doi.org/10.1145/1629911.1630114}, author = {Reddi, Vijay Janapa and Gupta, Meeta S and Smith, Michael D and Gu-Yeon Wei and David Brooks and Campanoni, Simone} } @conference {reddi2009voltage, title = {Voltage Emergency Prediction: Using Signatures to Reduce Operating Margins}, booktitle = {High Performance Computer Architecture, 2009. HPCA 2009. IEEE 15th International Symposium on}, year = {2009}, pages = {18{\textendash}29}, publisher = {IEEE}, organization = {IEEE}, abstract = {Inductive noise forces microprocessor designers to sacrifice performance in order to ensure correct and reliable operation of their designs. The possibility of wide fluctuations in supply voltage means that timing margins throughout the processor must be set pessimistically to protect against worst-case droops and surges. While sensor-based reactive schemes have been proposed to deal with voltage noise, inherent sensor delays limit their effectiveness. Instead, this paper describes a voltage emergency predictor that learns the signatures of voltage emergencies (the combinations of control flow and microarchitectural events leading up to them) and uses these signatures to prevent recurrence of the corresponding emergencies. 
In simulations of a representative superscalar microprocessor in which fluctuations beyond 4\% of nominal voltage are treated as emergencies (an aggressive configuration), these signatures can pinpoint the likelihood of an emergency some 16 cycles ahead of time with 90\% accuracy. This lead time allows machines to operate with much tighter voltage margins (4\% instead of 13\%) and up to 13.5\% higher performance, which closely approaches the 14.2\% performance improvement possible with an ideal oracle-based predictor.}, url = {https://doi.org/10.1109/HPCA.2009.4798233}, author = {Reddi, Vijay Janapa and Gupta, Meeta S and Holloway, Glenn and Gu-Yeon Wei and Smith, Michael D and David Brooks} } @conference {reddi2009voltage, title = {Voltage Noise: Why It{\textquoteright}s Bad, and What To Do About It}, booktitle = {5th IEEE Workshop on Silicon Errors in Logic-System Effects (SELSE), Palo Alto, CA}, year = {2009}, abstract = {Power constrained designs are becoming increasingly sensitive to supply voltage noise. We propose hardware-software collaboration to enable aggressive voltage margins: a fail-safe hardware mechanism tolerates margin violations in order to train a run-time software layer that reschedules instructions to avoid recurring violations. Additionally, the software controls an emergency signature-based predictor that throttles to suppress emergencies that code rescheduling cannot eliminate.}, author = {Reddi, Vijay Janapa and Gupta, Meeta S and Rangan, Krishna K and Campanoni, Simone and Holloway, Glenn and Smith, Michael D and Gu-Yeon Wei and David Brooks} } @conference {reddi2007persistent, title = {Persistent Code Caching: Exploiting Code Reuse Across Executions and Applications}, booktitle = {Code Generation and Optimization, 2007. CGO{\textquoteright}07. International Symposium on}, year = {2007}, pages = {74{\textendash}88}, publisher = {IEEE}, organization = {IEEE}, abstract = {Run-time compilation systems are challenged with the task of translating a program{\textquoteright}s instruction stream while maintaining low overhead. While software managed code caches are utilized to amortize translation costs, they are ineffective for programs with short run times or large amounts of cold code. Such program characteristics are prevalent in real-life computing environments, ranging from Graphical User Interface (GUI) programs to large-scale applications such as database management systems. Persistent code caching addresses these issues. It is described and evaluated in an industry-strength dynamic binary instrumentation system {\textendash} Pin. The proposed approach improves the intra-execution model of code reuse by storing and reusing translations across executions, thereby achieving inter-execution persistence. Dynamically linked programs leverage inter-application persistence by using persistent translations of library code generated by other programs. New translations discovered across executions are automatically accumulated into the persistent code caches, thereby improving performance over time. Inter-execution persistence improves the performance of GUI applications by nearly 90\%, while inter-application persistence achieves a 59\% improvement. In more specialized uses, the SPEC2K INT benchmark suite experiences a 26\% improvement under dynamic binary instrumentation. 
Finally, a 400\% speedup is achieved in translating the Oracle database in a regression testing environment.}, url = {https://doi.org/10.1109/CGO.2007.29}, author = {Reddi, Vijay Janapa and Connors, Dan and Cohn, Robert and Smith, Michael D} } @conference {moseley2007shadow, title = {Shadow Profiling: Hiding Instrumentation Costs with Parallelism}, booktitle = {Proceedings of the International Symposium on Code Generation and Optimization}, year = {2007}, pages = {198{\textendash}208}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, abstract = {In profiling, a tradeoff exists between information and overhead. For example, hardware-sampling profilers incur negligible overhead, but the information they collect is consequently very coarse. Other profilers use instrumentation tools to gather temporal traces such as path profiles and hot memory streams, but they have high overhead. Runtime and feedback-directed compilation systems need detailed information to aggressively optimize, but the cost of gathering profiles can outweigh the benefits. Shadow profiling is a novel method for sampling long traces of instrumented code in parallel with normal execution, taking advantage of the trend of increasing numbers of cores. Each instrumented sample can be many millions of instructions in length. The primary goal is to incur negligible overhead, yet attain profile information that is nearly as accurate as a perfect profile.The profiler requires no modifications to the operating system or hardware, and is tunable to allow for greater coverage or lower overhead. We evaluate the performance and accuracy of this new profiling technique for two common types of instrumentation-based profiles: interprocedural path profiling and value profiling. Overall, profiles collected using the shadow profiling framework are 94\% accurate versus perfect value profiles, while incurring less than 1\% overhead. Consequently, this technique increases the viability of dynamic and continuous optimization systems by hiding the high overhead of instrumentation and enabling the online collection of many types of profiles that were previously too costly.}, url = {https://doi.org/10.1109/CGO.2007.35}, author = {Moseley, Tipp and Shye, Alex and Reddi, Vijay Janapa and Grunwald, Dirk and Peri, Ramesh} } @conference {moseleyusing, title = {Using Process-Level Redundancy to Exploit Multiple Cores for Transient Fault Tolerance}, booktitle = {37th Annual IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)}, year = {2007}, abstract = {Transient faults are emerging as a critical concern in the reliability of general-purpose microprocessors. As architectural trends point towards multi-threaded multi-core designs, there is substantial interest in adapting such parallel hardware resources for transient fault tolerance. This paper proposes a software-based multi-core alternative for transient fault tolerance using process-level redundancy (PLR). PLR creates a set of redundant processes per application process and systematically compares the processes to guarantee correct execution. Redundancy at the process level allows the operating system to freely schedule the processes across all available hardware resources. PLR{\textquoteright}s softwarecentric approach to transient fault tolerance shifts the focus from ensuring correct hardware execution to ensuring correct software execution. As a result, PLR ignores many benign faults that do not propagate to affect program correctness. 
A real PLR prototype for running single-threaded applications is presented and evaluated for fault coverage and performance. On a 4-way SMP machine, PLR provides improved performance over existing software transient fault tolerance techniques with 16.9\% overhead for fault detection on a set of optimized SPEC2000 binaries.}, author = {Shye, Alex and Iyer, Matthew and Moseley, Tipp and Hodgdon, David and Fay, Dan and Reddi, Vijay Janapa and Connors, Daniel A.} } @conference {shye2007using, title = {Using process-level redundancy to exploit multiple cores for transient fault tolerance}, booktitle = {Dependable Systems and Networks, 2007. DSN{\textquoteright}07. 37th Annual IEEE/IFIP International Conference on}, year = {2007}, pages = {297{\textendash}306}, publisher = {IEEE}, organization = {IEEE}, author = {Shye, Alex and Moseley, Tipp and Reddi, Vijay Janapa and Blomstedt, Joseph and Connors, Daniel A} } @mastersthesis {reddi2006deploying, title = {Deploying Dynamic Code Transformation in Modern Computing Environments}, year = {2006}, school = {University of Colorado}, type = {phd}, abstract = {Dynamic code transformation systems are steadily gaining acceptance in computing environments for services such as program optimization, translation, instrumentation and security. Code transformation systems are required to perform complex and time consuming tasks such as costly program analysis and apply transformations (i.e. instrumentation, translation etc.) As these steps are applied to all code regions (regardless of characteristics), the transformation overhead can be significant. Once transformed, the remaining overhead is determined by the performance of the translated code. Current code transformation systems can only become part of mainstream computing only if these overheads are eliminated. Nevertheless, certain application and computing environments exist in which code transformation systems can be effectively deployed. This thesis identifies two such environments, persistence and mixed execution. Persistence leverages previous execution characteristics to address the transformation overhead. This is accomplished by capturing the translated executions at the end of their first invocation. The captured executions are cached on disk for re-use. All subsequent invocations of the run-time system using the same application cause the system to reuse the cached executions. Since applications exhibit similar behavior across varying input data sets, this execution model successfully diminishes the transformation overhead across multiple invocations. Persistence in the domain of dynamic binary instrumentation is highlighted as an example. Mixed execution accepts that the performance of the code generated by today{\textquoteright}s code transformation systems is in no position to compete with original execution times. Therefore, this technique proposes executing a mix of the original and translated code\ sequences to keep the translated code performance penalties within bounds. This execution model is a more effective alternative to pure Just-in-Time compiler-based code transformation systems, when low overheads and minimal architectural perturbation are the critical constraints required to be met. 
A dynamic compilation framework for controlling microprocessor energy and performance using this model is presented in light of its effectiveness and practicality.}, author = {Reddi, Vijay Janapa} } @patent {cohn2006system, title = {System and Method to Instrument References to Shared Memory}, year = {2006}, note = {US Patent App. 11/143,130}, edition = {United States of America}, chapter = {US}, author = {Cohn, Robert and Moseley, Tipp and Reddi, Vijay} } @conference {shye2006transient, title = {Transient fault tolerance via dynamic process-level redundancy}, booktitle = {Proc. of Workshop on Binary Instrumentation and Applications}, year = {2006}, author = {Shye, Alex and Reddi, Vijay Janapa and Moseley, Tipp and Connors, Daniel A} } @booklet {shye2005analysis, title = {Analysis of Path Profiling Information Generated With Performance Monitoring Hardware}, journal = {Workshop on Interaction between Compilers and Computer Architectures (INTERACT)}, year = {2005}, pages = {34{\textendash}43}, publisher = {IEEE}, abstract = {Even with the breakthroughs in semiconductor technology that will enable billion-transistor designs, hardware-based architecture paradigms alone cannot substantially improve processor performance. The challenge in realizing the full potential of these future machines is to find ways to adapt program behavior to application needs and processor resources. As such, run-time optimization will have a distinct role in future high-performance systems. However, as these systems are dependent on accurate, fine-grain profile information, traditional approaches to collecting profiles at run-time result in significant slowdowns during program execution. A novel approach to low-overhead profiling is to exploit hardware Performance Monitoring Units (PMUs) present in modern microprocessors. The Itanium-2 PMU can periodically sample the last few taken branches in an executing program, and this information can be used to recreate partial paths of execution. With compiler-aided analysis, the partial paths can be correlated into full paths. As statistically hot paths are most likely to occur in PMU samples, even infrequent sampling can accurately identify these paths. While traditional path profiling techniques carry a high overhead, a PMU-based path profiler represents an effective lightweight profiling alternative. This paper characterizes the PMU-based path information and demonstrates the construction of such a PMU-based path profiler for a run-time system.}, url = {https://doi.org/10.1109/INTERACT.2005.3}, author = {Shye, Alex and Iyer, Matthew and Moseley, Tipp and Hodgdon, David and Fay, Dan and Reddi, Vijay Janapa and Connors, Daniel A} } @conference {shye2005code, title = {Code Coverage Testing Using Hardware Performance Monitoring Support}, booktitle = {Proceedings of the Sixth International Symposium on Automated Analysis-Driven Debugging}, year = {2005}, pages = {159{\textendash}163}, publisher = {ACM}, organization = {ACM}, abstract = {Code coverage analysis, the process of finding code exercised by a particular set of test inputs, is an important component of software development and verification. Most traditional methods of implementing code coverage analysis tools are based on program instrumentation. These methods typically incur high overhead due to the insertion and execution of instrumentation code, and are not deployable in many software environments.
Hardware-based sampling techniques attempt to lower overhead by leveraging existing Hardware Performance Monitoring (HPM) support for program counter (PC) sampling. While PC sampling incurs lower levels of overhead, it does not provide complete coverage information. This paper extends the HPM approach in two ways. First, it utilizes the sampling of branch vectors, which are supported on modern processors. Second, compiler analysis is performed on branch vectors to extend the amount of code coverage information derived from each sample. This paper shows that although HPM is generally used to guide performance improvement efforts, there is substantial promise in leveraging the HPM information for code debugging and verification. The combination of sampled branch vectors and compiler analysis can be used to attain upwards of 80\% of the actual code coverage.}, url = {https://doi.org/10.1145/1085130.1085151}, author = {Shye, Alex and Iyer, Matthew and Reddi, Vijay Janapa and Connors, Daniel A} } @conference {wu2005dynamic, title = {A Dynamic Compilation Framework for Controlling Microprocessor Energy and Performance}, booktitle = {Proceedings of the 38th Annual IEEE/ACM International Symposium on Microarchitecture}, year = {2005}, pages = {271{\textendash}282}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, abstract = {Dynamic voltage and frequency scaling (DVFS) is an effective technique for controlling microprocessor energy and performance. Existing DVFS techniques are primarily based on hardware, OS time-interrupts, or static-compiler techniques. However, substantially greater gains can be realized when control opportunities are also explored in a dynamic compilation environment. There are several advantages to deploying DVFS and managing energy/performance tradeoffs through the use of a dynamic compiler. Most importantly, dynamic-compiler-driven DVFS is fine-grained, code-aware, and adaptive to the current microarchitecture environment. This paper presents a design framework of the run-time DVFS optimizer in a general dynamic compilation system. A prototype of the DVFS optimizer is implemented and integrated into an industrial-strength dynamic compilation system. The resulting optimization system is deployed on a real hardware platform that directly measures CPU voltage and current for accurate power and energy readings. Experimental results, based on physical measurements for over 40 SPEC or Olden benchmarks, show that significant energy savings are achieved with little performance degradation. SPEC2K FP benchmarks achieve energy savings of up to 70\% (with 0.5\% performance loss). In addition, SPEC2K INT benchmarks show up to 44\% energy savings (with 5\% performance loss), SPEC95 FP benchmarks save up to 64\% (with 4.9\% performance loss), and Olden benchmarks save up to 61\% (with 4.5\% performance loss). On average, the technique leads to an energy-delay product (EDP) improvement that is 3X-5X better than static voltage scaling, and is more than 2X (22\% vs. 9\%) better than the reported DVFS results of prior static compiler work. While the proposed technique is an effective method for microprocessor voltage and frequency control, the design framework and methodology described in this paper have broader potential to address other energy and power issues such as di/dt and thermal control.
}, url = {https://ieeexplore.ieee.org/document/1540966}, author = {Wu, Qiang and Martonosi, Margaret and Clark, Douglas W and Reddi, Vijay Janapa and Connors, Dan and Wu, Youfeng and Lee, Jin and Brooks, David} } @conference {moseley2005dynamic, title = {Dynamic Run-time Architecture Techniques for Enabling Continuous Optimization}, booktitle = {Proceedings of the 2nd Conference on Computing Frontiers}, year = {2005}, pages = {211{\textendash}220}, publisher = {ACM}, organization = {ACM}, abstract = {Future computer systems will integrate tens of multithreaded processor cores on a single chip die, resulting in hundreds of concurrent program threads sharing system resources. These designs will be the cornerstone of improving throughput in high-performance computing and server environments. However, to date, appropriate systems software (operating system, run-time system, and compiler) technologies for these emerging machines have not been adequately explored. Future processors will require sophisticated hardware monitoring units that continuously feed resource utilization information back to the operating system, to allow optimal thread co-scheduling decisions, and to software that continuously optimizes the program itself. Nevertheless, in order to continually and automatically adapt system resources to program behaviors and application needs, specific run-time information must be collected to adequately enable dynamic code optimization and operating system scheduling. Generally, run-time optimization is limited by the time required to collect profiles, the time required to perform optimization, and the inherent benefits of any optimization or decision. Initial techniques for effectively utilizing run-time information for dynamic optimization and informed thread scheduling in future multithreaded architectures are presented.}, url = {https://doi.org/10.1145/1062261.1062296}, author = {Moseley, Tipp and Shye, Alex and Reddi, Vijay Janapa and Iyer, Matthew and Fay, Dan and Hodgdon, David and Kihm, Joshua L and Settle, Alex and Grunwald, Dirk and Connors, Daniel A} } @booklet {reddi2005persistence, title = {Persistence in Dynamic Code Transformation Systems}, journal = {ACM SIGARCH Computer Architecture News}, volume = {33}, number = {5}, year = {2005}, pages = {69{\textendash}74}, publisher = {ACM}, abstract = {Dynamic code transformation systems (DCTS) can broadly be grouped into three distinct categories: optimization, translation and instrumentation. All of these face the critical challenge of minimizing the overhead incurred during transformation, since their execution is interleaved with the execution of the application itself. The common DCTS tasks incurring overhead are the identification of frequently executed code sequences, costly analysis of program information, and run-time creation (writing) of new code sequences. The cost of such work is amortized by the repeated execution of the transformed code. However, as these steps are applied to all general code regions (regardless of their execution frequency and characteristics), there is substantial overhead that impacts the application{\textquoteright}s performance. As such, it is challenging to effectively deploy dynamic transformation under fixed performance constraints. This paper explores a technique for eliminating this overhead by exploiting persistent application execution characteristics that are shared across different application invocations.
This technique is implemented and evaluated in Pin, a dynamic instrumentation engine. This version of Pin is referred to as Persistent Pin (PPin). Initial PPin experimental results indicate that using information from prior runs can reduce dynamic instrumentation overhead by as much as 25\% for SPEC applications and by over 90\% for everyday applications like web browsers, display rendering systems, and spreadsheet programs.}, url = {https://doi.org/10.1145/1127577.1127591}, author = {Reddi, Vijay Janapa and Connors, Dan and Cohn, Robert S} } @conference {luk2005pin, title = {Pin: Building Customized Program Analysis Tools with Dynamic Instrumentation}, booktitle = {Programming Language Design and Implementation (PLDI)}, number = {6}, year = {2005}, publisher = {ACM}, organization = {ACM}, abstract = {Robust and powerful software instrumentation tools are essential for program analysis tasks such as profiling, performance evaluation, and bug detection. To meet this need, we have developed a new instrumentation system called Pin. Our goals are to provide easy-to-use, portable, transparent, and efficient instrumentation. Instrumentation tools (called Pintools) are written in C/C++ using Pin{\textquoteright}s rich API. Pin follows the model of ATOM, allowing the tool writer to analyze an application at the instruction level without the need for detailed knowledge of the underlying instruction set. The API is designed to be architecture independent whenever possible, making Pintools source compatible across different architectures. However, a Pintool can access architecture-specific details when necessary. Instrumentation with Pin is mostly transparent as the application and Pintool observe the application{\textquoteright}s original, uninstrumented behavior. Pin uses dynamic compilation to instrument executables while they are running. For efficiency, Pin uses several techniques, including inlining, register re-allocation, liveness analysis, and instruction scheduling, to optimize instrumentation. This fully automated approach delivers significantly better instrumentation performance than similar tools. For example, Pin is 3.3x faster than Valgrind and 2x faster than DynamoRIO for basic-block counting. To illustrate Pin{\textquoteright}s versatility, we describe two Pintools in daily use to analyze production software. Pin is publicly available for Linux platforms on four architectures: IA32 (32-bit x86), EM64T (64-bit x86), Itanium, and ARM.
In the ten months since Pin 2 was released in July 2004, there have been over 3000 downloads from its website.}, keywords = {Instrumentation, program analysis tools, dynamic compilation}, url = {https://doi.org/10.1145/1065010.1065034}, author = {Luk, Chi-Keung and Cohn, Robert and Muth, Robert and Patil, Harish and Klauser, Artur and Lowney, Geoff and Wallace, Steven and Reddi, Vijay Janapa and Hazelwood, Kim} } @conference {figueira2005topology, title = {Topology-Based Hypercube Structures for Global Communication in Heterogeneous Networks}, booktitle = {European Conference on Parallel Processing}, year = {2005}, pages = {994{\textendash}1004}, publisher = {Springer, Berlin, Heidelberg}, organization = {Springer, Berlin, Heidelberg}, abstract = {Hypercube structures are heavily used by parallel algorithms that require all-to-all communication. When communicating over a heterogeneous and irregular network, the performance obtained by the hypercube structure will depend on the matching of the hypercube structure to the topology of the underlying network. In this paper, we present strategies to build topology-based hypercube structures. These strategies do not assume any particular topology. They take into account the communication cost between pairs of nodes to provide a performance-efficient hypercube structure. These enhanced hypercube structures help improve the performance of parallel applications that require all-to-all communication in heterogeneous networks by up to ~30\%.}, author = {Figueira, Silvia M and Reddi, Vijay Janapa} } @booklet {reddi2004pin, title = {PIN: A Binary Instrumentation Tool for Computer Architecture Research and Education}, journal = {Workshop on Computer Architecture Education (WCAE)}, year = {2004}, pages = {22}, publisher = {ACM}, abstract = {Computer architecture embraces a tremendous number of ever-changing interconnected concepts and information, yet computer architecture education is very often static, seemingly motionless. Computer architecture is commonly taught using simple piecewise methods of explaining how the hardware performs a given task, rather than characterizing the interaction of software and hardware. Visualization tools allow students to interactively explore basic concepts in computer architecture but are limited in their ability to engage students in research and design concepts. Likewise, while the development of simulation models such as caches, branch predictors, and pipelines aids student understanding of architecture components, such models have limitations in the workloads that can be examined because of issues with execution time and environment. Overall, to effectively understand modern architectures, it is essential to experiment with the characteristics of real application workloads. Likewise, understanding program behavior is necessary for effective programming, comprehension of architecture bottlenecks, and hardware design. Computer architecture education must include experience in analyzing program behavior and workload characteristics using effective tools.
To explore workload characteristic analysis in computer architecture design, we propose using PIN, a binary instrumentation tool for computer architecture research and education projects.}, url = {https://doi.org/10.1145/1275571.1275600}, author = {Reddi, Vijay Janapa and Settle, Alex and Connors, Daniel A and Cohn, Robert S} }
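
The process-level redundancy (PLR) entries above (moseleyusing, shye2007using) describe replicating an application process and comparing the replicas so that a transient fault in one copy is caught before it corrupts output. The C++ sketch below is only a toy illustration of that comparison idea, built on POSIX fork/waitpid and a majority vote over a hypothetical compute() function; the real PLR system transparently replicates entire unmodified applications and compares them at the system-call (output) boundary rather than on an exit status.

    // plr_sketch.cpp -- toy majority-vote redundancy, NOT the PLR system itself.
    #include <cstdio>
    #include <sys/wait.h>
    #include <unistd.h>

    // Stand-in for the computation being protected (hypothetical).
    static int compute() {
        int acc = 0;
        for (int i = 1; i <= 1000; ++i) acc += i * i;
        return acc & 0xff;  // fold the result into the 8 bits an exit status can carry
    }

    int main() {
        const int kReplicas = 3;
        int results[kReplicas];

        for (int r = 0; r < kReplicas; ++r) {
            pid_t pid = fork();
            if (pid < 0) { std::perror("fork"); return 1; }
            if (pid == 0) _exit(compute());           // child: run one replica
            int status = 0;
            waitpid(pid, &status, 0);                 // parent: collect its result
            results[r] = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
        }

        // Majority vote: a transient fault in one replica is out-voted by the other two.
        if (results[0] == results[1] || results[0] == results[2]) {
            std::printf("result = %d\n", results[0]);
        } else if (results[1] == results[2]) {
            std::printf("result = %d\n", results[1]);
        } else {
            std::fprintf(stderr, "unrecoverable: all replicas disagree\n");
            return 1;
        }
        return 0;
    }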
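
The dynamic-compilation DVFS entry above (wu2005dynamic) works by having a dynamic compiler insert frequency/voltage mode-set operations around memory-bound code regions. The sketch below shows only the kind of underlying mechanism such a system could drive on Linux, scaling a core down before a memory-bound loop and back up afterwards through the cpufreq sysfs interface; it assumes the "userspace" governor, root privileges, and platform-specific frequency values, and it does not reproduce the paper's dynamic compiler or its decision analysis.

    // dvfs_region_sketch.cpp -- illustrates the raw mechanism only (Linux cpufreq sysfs).
    #include <cstddef>
    #include <fstream>
    #include <string>
    #include <vector>

    // Write a value into a cpufreq sysfs file; silently does nothing if the file
    // is absent or not writable (e.g., not root, or a different governor is active).
    static void write_sysfs(const std::string &path, const std::string &value) {
        std::ofstream f(path);
        if (f) f << value;
    }

    static void set_cpu0_khz(const std::string &khz) {
        write_sysfs("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", khz);
    }

    int main() {
        std::vector<int> big(1 << 24, 1);    // a large array the loop will miss on

        set_cpu0_khz("1200000");             // slow down before the memory-bound region
        long sum = 0;
        for (std::size_t i = 0; i < big.size(); i += 64) sum += big[i];
        set_cpu0_khz("3000000");             // restore the nominal frequency afterwards

        return sum == 0;                     // use sum so the loop is not optimized away
    }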
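
The Pin entries above (luk2005pin, reddi2004pin) describe writing Pintools against Pin's C/C++ API: an instrumentation routine decides where analysis calls go, and analysis routines run as the application executes. The sketch below is a minimal instruction-counting Pintool in the style of the introductory example shipped with Pin's public kits; it is illustrative rather than code from either paper, and build details vary by kit version.

    // inscount_sketch.cpp -- minimal Pintool: count dynamically executed instructions.
    #include <iostream>
    #include "pin.H"

    static UINT64 icount = 0;

    // Analysis routine: executed before every instruction of the application.
    static VOID docount() { icount++; }

    // Instrumentation routine: called when Pin first translates each instruction.
    static VOID Instruction(INS ins, VOID *v) {
        INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)docount, IARG_END);
    }

    // Called when the instrumented application exits.
    static VOID Fini(INT32 code, VOID *v) {
        std::cerr << "Dynamic instruction count: " << icount << std::endl;
    }

    int main(int argc, char *argv[]) {
        if (PIN_Init(argc, argv)) return 1;         // parse Pin's command line
        INS_AddInstrumentFunction(Instruction, 0);  // register instrumentation callback
        PIN_AddFiniFunction(Fini, 0);               // register exit callback
        PIN_StartProgram();                         // start the application; never returns
        return 0;
    }

Such a tool is typically built with the makefiles in a Pin kit and launched as, for example, pin -t inscount_sketch.so -- <application>; the exact paths and flags depend on the kit.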
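
The topology-based hypercube entry above (figueira2005topology) builds on the classic hypercube all-to-all pattern in which, at step k, node i exchanges data with node i XOR 2^k, covering all nodes in log2(N) steps. The sketch below only prints that baseline communication schedule; the paper's contribution, assigning logical hypercube positions to physical nodes using measured pairwise communication costs, is not reproduced here.

    // hypercube_schedule_sketch.cpp -- print the classic hypercube exchange schedule.
    #include <cstdio>

    int main() {
        const int dims = 3;                  // 2^3 = 8 logical nodes
        const int nodes = 1 << dims;
        for (int step = 0; step < dims; ++step) {
            std::printf("step %d:\n", step);
            for (int node = 0; node < nodes; ++node) {
                int partner = node ^ (1 << step);   // flip bit 'step' to find the partner
                if (node < partner)                 // print each exchanging pair once
                    std::printf("  node %d <-> node %d\n", node, partner);
            }
        }
        return 0;
    }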