From cd9083a5b54e10b70b33aa6a2cb9243b98eab74f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sercan=20Ayg=C3=BCn?= <7710195+serco425@users.noreply.github.com>
Date: Mon, 3 Jun 2024 06:58:14 -0500
Subject: [PATCH 1/2] iss2-adding-BNN-info

Sercan Aygun: I added a paragraph about BNNs in the Transient Faults Section.
---
 contents/robust_ai/robust_ai.qmd | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/contents/robust_ai/robust_ai.qmd b/contents/robust_ai/robust_ai.qmd
index e69c9ce5..394bbb5e 100644
--- a/contents/robust_ai/robust_ai.qmd
+++ b/contents/robust_ai/robust_ai.qmd
@@ -157,6 +157,8 @@ During the inference phase, transient faults can impact the reliability and trus
 
 In safety-critical applications, such as autonomous vehicles or medical diagnosis, transient faults during inference can have severe consequences, leading to incorrect decisions or actions [@li2017understanding;@jha2019ml]. Ensuring the resilience of ML systems against transient faults is crucial to maintaining the integrity and reliability of the predictions.
 
+Another critical impact on ML systems is related to the Binarized Neural Network (BNN) performance in case of bit-flip errors. As the network weights are represented in single-bit precision, bit-flips on the network weights pose a significant challenge, rendering BNNs fragile. For instance, two-hidden layer BNN architecture for MNIST classification demonstrates performance degradation from 98% test accuracy to 70% when random bit-flipping soft errors are inserted through model weights with a 10% probability. Addressing this issue requires considering flip-aware training techniques or leveraging emerging computing paradigms (e.g., stochastic computing) to enhance fault tolerance and robustness in BNNs [@Aygun2021BSBNN].
+
 ### Permanent Faults
 
 Permanent faults are hardware defects that persist and cause irreversible damage to the affected components. These faults are characterized by their persistent nature and require repair or replacement of the faulty hardware to restore normal system functionality.

From 3481e9668fd57291539bb7bea874190c89f6b591 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Mon, 3 Jun 2024 20:38:14 -0400
Subject: [PATCH 2/2] Updated w/ references and links + expanded the general
 description a bit
---
 contents/robust_ai/robust_ai.bib | 24 ++++++++++++++++++++++++
 contents/robust_ai/robust_ai.qmd |  8 +++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/contents/robust_ai/robust_ai.bib b/contents/robust_ai/robust_ai.bib
index d7d90a40..f297b0f2 100644
--- a/contents/robust_ai/robust_ai.bib
+++ b/contents/robust_ai/robust_ai.bib
@@ -1,6 +1,30 @@
 %comment{This file was created with betterbib v5.0.11.}
 
+@article{Aygun2021BSBNN,
+  author = {Aygun, Sercan and Gunes, Ece Olcay and De Vleeschouwer, Christophe},
+  title = {Efficient and robust bitstream processing in binarised neural networks},
+  journal = {Electron. Lett.},
+  volume = {57},
+  number = {5},
+  pages = {219--222},
+  keywords = {Logic circuits, Neural nets (circuit implementations), Logic elements, Neural net devices},
+  doi = {10.1049/ell2.12045},
+  year = {2021},
+  source = {Crossref},
+  url = {https://doi.org/10.1049/ell2.12045},
+  publisher = {Institution of Engineering and Technology (IET)},
+  issn = {0013-5194, 1350-911X},
+  month = jan,
+}
+
+@article{courbariaux2016binarized,
+  author = {Courbariaux, Matthieu and Hubara, Itay and Soudry, Daniel and El-Yaniv, Ran and Bengio, Yoshua},
+  title = {Binarized neural networks: {Training} deep neural networks with weights and activations constrained to +1 or -1},
+  journal = {arXiv preprint arXiv:1602.02830},
+  year = {2016},
+}
+
 @book{reddi2013resilient,
   author = {Reddi, Vijay Janapa and Gupta, Meeta Sharma},
   title = {Resilient Architecture Design for Voltage Variation},
diff --git a/contents/robust_ai/robust_ai.qmd b/contents/robust_ai/robust_ai.qmd
index 394bbb5e..59a94154 100644
--- a/contents/robust_ai/robust_ai.qmd
+++ b/contents/robust_ai/robust_ai.qmd
@@ -153,11 +153,13 @@ In ML systems, transient faults can have significant implications during the tra
 
 For example, a bit flip in the weight matrix of a neural network can cause the model to learn incorrect patterns or associations, leading to degraded performance [@wan2021analyzing]. Transient faults in the data pipeline, such as corruption of training samples or labels, can also introduce noise and affect the quality of the learned model.
 
-During the inference phase, transient faults can impact the reliability and trustworthiness of ML predictions. If a transient fault occurs in the memory storing the trained model parameters or in the computation of the inference results, it can lead to incorrect or inconsistent predictions. For instance, a bit flip in the activation values of a neural network can alter the final classification or regression output [@mahmoud2020pytorchfi].
+During the inference phase, transient faults can impact the reliability and trustworthiness of ML predictions. If a transient fault occurs in the memory storing the trained model parameters or in the computation of the inference results, it can lead to incorrect or inconsistent predictions. For instance, a bit flip in the activation values of a neural network can alter the final classification or regression output [@mahmoud2020pytorchfi].
 
 In safety-critical applications, such as autonomous vehicles or medical diagnosis, transient faults during inference can have severe consequences, leading to incorrect decisions or actions [@li2017understanding;@jha2019ml]. Ensuring the resilience of ML systems against transient faults is crucial to maintaining the integrity and reliability of the predictions.
 
-Another critical impact on ML systems is related to the Binarized Neural Network (BNN) performance in case of bit-flip errors. As the network weights are represented in single-bit precision, bit-flips on the network weights pose a significant challenge, rendering BNNs fragile. For instance, two-hidden layer BNN architecture for MNIST classification demonstrates performance degradation from 98% test accuracy to 70% when random bit-flipping soft errors are inserted through model weights with a 10% probability. Addressing this issue requires considering flip-aware training techniques or leveraging emerging computing paradigms (e.g., stochastic computing) to enhance fault tolerance and robustness in BNNs [@Aygun2021BSBNN].
+At the other extreme, in resource-constrained environments like TinyML, Binarized Neural Networks (BNNs) [@courbariaux2016binarized] have emerged as a promising solution. BNNs represent network weights in single-bit precision, offering computational efficiency and faster inference times. However, this binary representation renders BNNs fragile to bit-flip errors in the network weights. For instance, prior work [@Aygun2021BSBNN] has shown that a two-hidden-layer BNN architecture for a simple task such as MNIST classification suffers a performance degradation from 98% test accuracy to 70% when random bit-flipping soft errors are injected into the model weights with a 10% probability.
+
+Addressing such issues requires flip-aware training techniques or emerging computing paradigms (e.g., [stochastic computing](https://en.wikipedia.org/wiki/Stochastic_computing)) that enhance fault tolerance and robustness, which we will discuss in @sec-hw-intermittent-detect-mitigate. Future research directions aim to develop hybrid architectures, novel activation functions, and loss functions tailored to bridge the accuracy gap relative to full-precision models while maintaining computational efficiency.
 
 ### Permanent Faults
 
 Permanent faults are hardware defects that persist and cause irreversible damage to the affected components. These faults are characterized by their persistent nature and require repair or replacement of the faulty hardware to restore normal system functionality.
@@ -245,7 +247,7 @@ Mitigating the impact of intermittent faults in ML systems requires a multifacet
 
 Designing ML systems resilient to intermittent faults is crucial to ensuring their reliability and robustness. This involves incorporating fault-tolerant techniques, runtime monitoring, and adaptive mechanisms into the system architecture. By proactively addressing the challenges of intermittent faults, ML systems can maintain their accuracy, consistency, and trustworthiness, even in sporadic hardware failures. Regular testing, monitoring, and maintenance of ML systems can help identify and mitigate intermittent faults before they cause significant disruptions or performance degradation.
 
-### Detection and Mitigation
+### Detection and Mitigation {#sec-hw-intermittent-detect-mitigate}
 
 This section explores various fault detection techniques, including hardware-level and software-level approaches, and discusses effective mitigation strategies to enhance the resilience of ML systems. Additionally, we will look into resilient ML system design considerations, present case studies and examples, and highlight future research directions in fault-tolerant ML systems.
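To make the BNN failure mode described in this patch concrete, the following minimal NumPy sketch (illustrative only, not part of the patch) injects random sign flips into the single-bit weights of a small, randomly initialized BNN and reports how many predictions change. The layer sizes, flip probabilities, and random inputs are assumptions chosen for illustration; reproducing the exact 98%-to-70% MNIST result would require training the actual model from [@Aygun2021BSBNN].

```python
import numpy as np

rng = np.random.default_rng(0)

def binarize(w):
    """Map real-valued weights to {-1, +1} (sign binarization)."""
    return np.where(w >= 0, 1.0, -1.0)

def inject_bit_flips(w, p):
    """Independently flip each single-bit weight (+1 <-> -1) with
    probability p, emulating random transient soft errors in the
    stored weights."""
    flips = rng.random(w.shape) < p
    return np.where(flips, -w, w)

def forward(x, weights):
    """Forward pass of a BNN with sign activations on the hidden layers."""
    h = x
    for w in weights[:-1]:
        h = np.sign(h @ w)        # binarized hidden activations
        h[h == 0] = 1.0           # break ties so activations stay in {-1, +1}
    return h @ weights[-1]        # real-valued logits at the output

# Hypothetical MNIST-like dimensions: 784 inputs, two hidden layers, 10 classes.
dims = [(784, 256), (256, 256), (256, 10)]
weights = [binarize(rng.standard_normal(d)) for d in dims]

x = np.sign(rng.standard_normal((1000, 784)))   # dummy binarized inputs
clean_pred = forward(x, weights).argmax(axis=1)

for p in (0.01, 0.05, 0.10):
    faulty = [inject_bit_flips(w, p) for w in weights]
    noisy_pred = forward(x, faulty).argmax(axis=1)
    agreement = (noisy_pred == clean_pred).mean()
    print(f"flip probability {p:.2f}: {agreement:.1%} of predictions unchanged")
```

Even this untrained toy model shows the qualitative effect: as the flip probability approaches 10%, a sizable fraction of predictions change, consistent with the accuracy collapse reported in the patch. Flip-aware training exposes the model to exactly this kind of perturbation during training so that the learned weights tolerate it at inference time.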