diff --git a/contents/efficient_ai/efficient_ai.bib b/contents/efficient_ai/efficient_ai.bib
index e6c96a014..78cacebfb 100644
--- a/contents/efficient_ai/efficient_ai.bib
+++ b/contents/efficient_ai/efficient_ai.bib
@@ -101,3 +101,36 @@ @misc{howard2017mobilenets
   volume = {abs/1704.04861},
   year = {2017}
 }
+
+@article{russakovsky2015imagenet,
+  title = {{ImageNet} large scale visual recognition challenge},
+  author = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and others},
+  journal = {International Journal of Computer Vision},
+  volume = {115},
+  pages = {211--252},
+  year = {2015},
+  publisher = {Springer}
+}
+
+@inproceedings{lin2014microsoft,
+  title = {{Microsoft COCO}: Common objects in context},
+  author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+  booktitle = {Computer Vision -- {ECCV} 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V},
+  pages = {740--755},
+  year = {2014},
+  organization = {Springer}
+}
+
+@article{chowdhery2019visual,
+  title = {{Visual Wake Words} dataset},
+  author = {Chowdhery, Aakanksha and Warden, Pete and Shlens, Jonathon and Howard, Andrew and Rhodes, Rocky},
+  journal = {arXiv preprint arXiv:1906.05721},
+  year = {2019}
+}
+
+@article{warden2018speech,
+  title = {Speech commands: A dataset for limited-vocabulary speech recognition},
+  author = {Warden, Pete},
+  journal = {arXiv preprint arXiv:1804.03209},
+  year = {2018}
+}
diff --git a/contents/efficient_ai/efficient_ai.qmd b/contents/efficient_ai/efficient_ai.qmd
index bf0ae2dd9..ce3e73fd1 100644
--- a/contents/efficient_ai/efficient_ai.qmd
+++ b/contents/efficient_ai/efficient_ai.qmd
@@ -161,7 +161,7 @@ Moreover, the optimal model choice isn't always universal but often depends on t
 
 Another important consideration is the relationship between model complexity and its practical benefits. Take voice-activated assistants as an example such as "Alexa" or "OK Google." While a complex model might demonstrate a marginally superior understanding of user speech, if it's slower to respond than a simpler counterpart, the user experience could be compromised. Thus, adding layers or parameters doesn't always equate to better real-world outcomes.
 
-Furthermore, while benchmark datasets, such as ImageNet, COCO, Visual Wake Words, Google Speech Commands, etc. provide a standardized performance metric, they might not capture the diversity and unpredictability of real-world data. Two facial recognition models with similar benchmark scores might exhibit varied competencies when faced with diverse ethnic backgrounds or challenging lighting conditions. Such disparities underscore the importance of robustness and consistency across varied data. For example, @fig-stoves from the Dollar Street dataset shows stove images across extreme monthly incomes. So if a model was trained on pictures of stoves found in wealth countries only, it will fail to recognize stoves from poorer regions.
+Furthermore, while benchmark datasets such as ImageNet [@russakovsky2015imagenet], COCO [@lin2014microsoft], Visual Wake Words [@chowdhery2019visual], and Google Speech Commands [@warden2018speech] provide a standardized performance metric, they might not capture the diversity and unpredictability of real-world data. Two facial recognition models with similar benchmark scores might exhibit varied competencies when faced with diverse ethnic backgrounds or challenging lighting conditions. Such disparities underscore the importance of robustness and consistency across varied data. For example, @fig-stoves from the Dollar Street dataset shows stove images across extreme monthly incomes. If a model is trained only on pictures of stoves found in wealthy countries, it may fail to recognize stoves from poorer regions.
 
 ![Objects, such as stoves, have different shapes and technological levels in different regions. A model that is not trained on diverse datasets might perform well on a benchmark but fail in real-world applications. Source: Dollar Street stove images.](https://pbs.twimg.com/media/DmUyPSSW0AAChGa.jpg){#fig-stoves}
 