From bb43aa2c4872c86bc836432539be6a6252680d61 Mon Sep 17 00:00:00 2001 From: Milan Kordic <78221808+milank94@users.noreply.github.com> Date: Fri, 5 Jan 2024 07:55:34 -0500 Subject: [PATCH] 2312 TT-BUDA release alignment (#3) * initial track_pybuda; 2313 rc * fix typo in ViLT tests * Modify test case paths * Modify clean up command to include onnx and tflite file formats * Fix ONNX download paths for ResNet and RetinaNet * Add NotImplemented error to Fuyu-8B model * Fix ONNX model paths * Add clean up for .h5 files * Remove .png files from clean up * Add wideresnet in model_demos * Add Xception in model_demos * Add GhostNet in model_demos * Fix model demos table * Fix WideResNet and Xception file paths * Stream image and label files * Patch Xception variant for GS silicon * Skip Fuyu-8B (WIP) * Remove commented code --- model_demos/Makefile | 4 + model_demos/README.md | 24 +- .../whisper/pytorch_whisper_generation.py | 4 +- .../beit/pytorch_beit_classify_16_224_hf.py | 57 ++++ model_demos/cv_demos/clip/pytorch_clip.py | 12 +- .../cv_demos/densenet/pytorch_densenet.py | 3 +- .../tflite_efficientnet_lite0_1x1.py | 63 ++++ .../tflite_efficientnet_lite4_1x1.py | 63 ++++ .../cv_demos/ghostnet/timm_ghostnet.py | 53 ++++ .../cv_demos/hrnet/pytorch_hrnet_osmr.py | 2 + .../cv_demos/hrnet/pytorch_hrnet_timm.py | 2 + .../landmark/hand_landmark_lite_1x1.py | 55 ++++ .../landmark/palm_detection_lite_1x1.py | 53 ++++ .../landmark/pose_landmark_lite_1x1.py | 57 ++++ .../cv_demos/mlpmixer/timm_mlpmixer.py | 59 ++++ .../tflite_mobilenet_v2_ssd_1x1.py | 60 ++++ .../openpose/pytorch_lwopenpose_2d_osmr.py | 52 ++++ .../openpose/pytorch_lwopenpose_3d_osmr.py | 53 ++++ model_demos/cv_demos/resnet/onnx_resnet.py | 116 +++++++ .../cv_demos/retinanet/onnx_retinanet_r101.py | 81 +++++ .../pytorch_stable_diffusion.py | 3 +- .../cv_demos/unet/pytorch_unet_torchhub.py | 2 +- model_demos/cv_demos/vgg/pytorch_vgg_hf.py | 3 + model_demos/cv_demos/vgg/pytorch_vgg_osmr.py | 9 +- model_demos/cv_demos/vgg/pytorch_vgg_timm.py | 9 + .../cv_demos/vgg/pytorch_vgg_torchhub.py | 3 + .../cv_demos/vilt/pytorch_vilt_maskedlm.py | 73 +++++ .../vilt/pytorch_vilt_question_answering.py | 66 ++++ model_demos/cv_demos/vilt/vilt_model.py | 83 ++++++ .../wideresnet/pytorch_wideresnet_timm.py | 62 ++++ .../wideresnet/pytorch_wideresnet_torchhub.py | 64 ++++ .../cv_demos/xception/timm_xception.py | 76 +++++ .../cv_demos/yolo_v3/holli_src/utils.py | 256 ++++++++++++++++ .../cv_demos/yolo_v3/holli_src/yolo_layer.py | 227 ++++++++++++++ .../cv_demos/yolo_v3/holli_src/yolov3.py | 128 ++++++++ .../cv_demos/yolo_v3/holli_src/yolov3_base.py | 113 +++++++ .../cv_demos/yolo_v3/holli_src/yolov3_tiny.py | 134 +++++++++ .../cv_demos/yolo_v3/pytorch_yolov3_holli.py | 63 ++++ .../yolo_v3/pytorch_yolov3_holli_1x1.py | 74 +++++ .../yolo_v3/pytorch_yolov3_tiny_holli.py | 57 ++++ .../cv_demos/yolo_v5/pytorch_yolov5_320.py | 6 +- .../cv_demos/yolo_v5/pytorch_yolov5_480.py | 45 ++- .../cv_demos/yolo_v5/pytorch_yolov5_640.py | 43 ++- .../codegen/pytorch_codegen_causal_lm.py | 1 + .../nlp_demos/falcon/pytorch_falcon.py | 2 - .../fuyu8b/pytorch_fuyu8b_past_cache.py | 282 ++++++++++++++++++ .../nlp_demos/opt/pytorch_opt_causal_lm.py | 2 + .../nlp_demos/t5/pytorch_t5_generation.py | 7 + .../nlp_demos/xglm/pytorch_xglm_causal_lm.py | 16 +- model_demos/pyproject.toml | 13 + model_demos/requirements.txt | 1 + model_demos/tests/conftest.py | 10 +- model_demos/tests/test_onnx_resnet.py | 8 + model_demos/tests/test_onnx_retinanet.py | 8 + 
model_demos/tests/test_pytorch_beit.py | 11 + model_demos/tests/test_pytorch_distilbert.py | 6 +- model_demos/tests/test_pytorch_dpr.py | 15 +- model_demos/tests/test_pytorch_fuyu8b.py | 8 + model_demos/tests/test_pytorch_ghostnet.py | 8 + model_demos/tests/test_pytorch_mlpmixer.py | 8 + model_demos/tests/test_pytorch_openpose.py | 14 + model_demos/tests/test_pytorch_vgg.py | 11 +- model_demos/tests/test_pytorch_vilt.py | 14 + model_demos/tests/test_pytorch_wideresnet.py | 18 ++ model_demos/tests/test_pytorch_xception.py | 11 + model_demos/tests/test_pytorch_yolov3.py | 20 ++ .../tests/test_tflite_efficientnet_lite.py | 14 + model_demos/tests/test_tflite_landmark.py | 20 ++ .../tests/test_tflite_mobilenet_ssd.py | 8 + 69 files changed, 2921 insertions(+), 87 deletions(-) create mode 100644 model_demos/cv_demos/beit/pytorch_beit_classify_16_224_hf.py create mode 100644 model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite0_1x1.py create mode 100644 model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite4_1x1.py create mode 100644 model_demos/cv_demos/ghostnet/timm_ghostnet.py create mode 100644 model_demos/cv_demos/landmark/hand_landmark_lite_1x1.py create mode 100644 model_demos/cv_demos/landmark/palm_detection_lite_1x1.py create mode 100644 model_demos/cv_demos/landmark/pose_landmark_lite_1x1.py create mode 100644 model_demos/cv_demos/mlpmixer/timm_mlpmixer.py create mode 100644 model_demos/cv_demos/mobilenet_ssd/tflite_mobilenet_v2_ssd_1x1.py create mode 100644 model_demos/cv_demos/openpose/pytorch_lwopenpose_2d_osmr.py create mode 100644 model_demos/cv_demos/openpose/pytorch_lwopenpose_3d_osmr.py create mode 100644 model_demos/cv_demos/resnet/onnx_resnet.py create mode 100644 model_demos/cv_demos/retinanet/onnx_retinanet_r101.py create mode 100644 model_demos/cv_demos/vilt/pytorch_vilt_maskedlm.py create mode 100644 model_demos/cv_demos/vilt/pytorch_vilt_question_answering.py create mode 100644 model_demos/cv_demos/vilt/vilt_model.py create mode 100644 model_demos/cv_demos/wideresnet/pytorch_wideresnet_timm.py create mode 100644 model_demos/cv_demos/wideresnet/pytorch_wideresnet_torchhub.py create mode 100644 model_demos/cv_demos/xception/timm_xception.py create mode 100644 model_demos/cv_demos/yolo_v3/holli_src/utils.py create mode 100644 model_demos/cv_demos/yolo_v3/holli_src/yolo_layer.py create mode 100644 model_demos/cv_demos/yolo_v3/holli_src/yolov3.py create mode 100644 model_demos/cv_demos/yolo_v3/holli_src/yolov3_base.py create mode 100644 model_demos/cv_demos/yolo_v3/holli_src/yolov3_tiny.py create mode 100644 model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli.py create mode 100644 model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli_1x1.py create mode 100644 model_demos/cv_demos/yolo_v3/pytorch_yolov3_tiny_holli.py create mode 100644 model_demos/nlp_demos/fuyu8b/pytorch_fuyu8b_past_cache.py create mode 100644 model_demos/tests/test_onnx_resnet.py create mode 100644 model_demos/tests/test_onnx_retinanet.py create mode 100644 model_demos/tests/test_pytorch_beit.py create mode 100644 model_demos/tests/test_pytorch_fuyu8b.py create mode 100644 model_demos/tests/test_pytorch_ghostnet.py create mode 100644 model_demos/tests/test_pytorch_mlpmixer.py create mode 100644 model_demos/tests/test_pytorch_openpose.py create mode 100644 model_demos/tests/test_pytorch_vilt.py create mode 100644 model_demos/tests/test_pytorch_wideresnet.py create mode 100644 model_demos/tests/test_pytorch_xception.py create mode 100644 model_demos/tests/test_pytorch_yolov3.py create mode 100644 
model_demos/tests/test_tflite_efficientnet_lite.py create mode 100644 model_demos/tests/test_tflite_landmark.py create mode 100644 model_demos/tests/test_tflite_mobilenet_ssd.py diff --git a/model_demos/Makefile b/model_demos/Makefile index facccbb1..c2e65ed7 100644 --- a/model_demos/Makefile +++ b/model_demos/Makefile @@ -41,4 +41,8 @@ clean_tt: @find . | grep -E ".pkl_memoize_py3" | xargs rm -rf @find . | grep -E "generated_modules" | xargs rm -rf @find . | grep -E "tt_build" | xargs rm -rf + @find . -type f -name "*.onnx" | xargs rm -f + @find . -type f -name "*.tflite" | xargs rm -f + @find . -type f -name "*.h5" | xargs rm -f + @find . -type f -name "*.png" | xargs rm -f @echo "All done cleaning TT files!" diff --git a/model_demos/README.md b/model_demos/README.md index 7839b9e3..d3fd0bb2 100644 --- a/model_demos/README.md +++ b/model_demos/README.md @@ -31,6 +31,7 @@ python cv_demos/resnet/pytorch_resnet.py |-------------------------------------------|:--------:| | [ALBERT](nlp_demos/albert/) | GS, WH | | [Autoencoder](cv_demos/autoencoder/) | GS, WH | +| [BeiT](cv_demos/beit/) | GS, WH | | [BERT](nlp_demos/bert/) | GS, WH | | [CLIP](cv_demos/clip/) | GS, WH | | [CodeGen](nlp_demos/codegen/) | GS, WH | @@ -38,19 +39,28 @@ python cv_demos/resnet/pytorch_resnet.py | [DenseNet](cv_demos/densenet/) | GS, WH | | [DistilBERT](nlp_demos/distilbert/) | GS, WH | | [DPR](nlp_demos/dpr/) | GS, WH | -| [Falcon](nlp_demos/falcon/) | WH | +| [EfficientNet-Lite](cv_demos/efficientnet_lite/) | WH | +| [Falcon-7B](nlp_demos/falcon/) | WH | | [FLAN-T5](nlp_demos/flant5/) | GS, WH | +| [Fuyu-8B](nlp_demos/fuyu8b/) | | +| [GhostNet](cv_demos/ghostnet/) | GS, WH | | [GoogLeNet](cv_demos/googlenet/) | GS, WH | | [GPT-2](nlp_demos/gpt2/) | GS, WH | | [GPT Neo](nlp_demos/gptneo/) | GS, WH | +| [Hand Landmark](cv_demos/landmark/) | WH | | [HRNet](cv_demos/hrnet/) | GS, WH | | [Inception-v4](cv_demos/inceptionv4/) | GS, WH | -| [MobileNetV1](cv_demos/mobilenetv1/) | GS, WH | -| [MobileNetV2](cv_demos/mobilenetv2/) | GS, WH | -| [MobileNetV3](cv_demos/mobilenetv3/) | GS, WH | +| [MLP-Mixer](cv_demos/mlpmixer/) | GS, WH | +| [MobileNetSSD](cv_demos/mobilenet_ssd/) | WH | +| [MobileNetV1](cv_demos/mobilenet_v1/) | GS, WH | +| [MobileNetV2](cv_demos/mobilenet_v2/) | GS, WH | +| [MobileNetV3](cv_demos/mobilenet_v3/) | GS, WH | +| [OpenPose](cv_demos/openpose/) | GS, WH | | [OPT](nlp_demos/opt/) | GS, WH | +| [Pose Landmark](cv_demos/landmark/) | WH | | [ResNet](cv_demos/resnet/) | GS, WH | | [ResNeXt](cv_demos/resnext/) | GS, WH | +| [RetinaNet](cv_demos/retinanet/) | GS, WH | | [RoBERTa](nlp_demos/roberta/) | GS, WH | | [SqueezeBERT](nlp_demos/squeezebert/) | GS, WH | | [Stable Diffusion](cv_demos/stable_diffusion/) | WH | @@ -58,10 +68,14 @@ python cv_demos/resnet/pytorch_resnet.py | [U-Net](cv_demos/unet/) | GS, WH | | [VGG](cv_demos/vgg/) | GS, WH | | [ViT](cv_demos/vit/) | GS, WH | +| [ViLT](cv_demos/vilt/) | GS, WH | | [VoVNet](cv_demos/vovnet/) | GS, WH | +| [WideResNet](cv_demos/wideresnet/) | GS, WH | | [Whisper](audio_demos/whisper/) | GS, WH | +| [Xception](cv_demos/xception/) | GS, WH | | [XGLM](nlp_demos/xglm/) | GS, WH | -| [YOLOv5](cv_demos/yolov5/) | GS, WH | +| [YOLOv3](cv_demos/yolo_v3/) | GS, WH | +| [YOLOv5](cv_demos/yolo_v5/) | GS, WH | ## Contributing diff --git a/model_demos/audio_demos/whisper/pytorch_whisper_generation.py b/model_demos/audio_demos/whisper/pytorch_whisper_generation.py index e11c5685..8a7af832 100644 --- a/model_demos/audio_demos/whisper/pytorch_whisper_generation.py
+++ b/model_demos/audio_demos/whisper/pytorch_whisper_generation.py @@ -18,9 +18,11 @@ def run_whisper_generation(variant="openai/whisper-small"): compiler_cfg.amp_level = 2 compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" if "small" in variant: os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "35000" - + elif "medium" in variant or "large" in variant: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" available_devices = pybuda.detect_available_devices() if available_devices[0] == BackendDevice.Grayskull: softmax_ops_to_override = [ diff --git a/model_demos/cv_demos/beit/pytorch_beit_classify_16_224_hf.py b/model_demos/cv_demos/beit/pytorch_beit_classify_16_224_hf.py new file mode 100644 index 00000000..02ef48a1 --- /dev/null +++ b/model_demos/cv_demos/beit/pytorch_beit_classify_16_224_hf.py @@ -0,0 +1,57 @@ +# BeiT Model Demo + +import os + +import pybuda +import requests +from PIL import Image +from pybuda._C.backend_api import BackendDevice +from transformers import BeitForImageClassification, BeitImageProcessor + + +def run_beit_classify_224_hf_pytorch(variant="microsoft/beit-base-patch16-224"): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + available_devices = pybuda.detect_available_devices() + + compiler_cfg.enable_t_streaming = True + if variant == "microsoft/beit-base-patch16-224": + compiler_cfg.retain_tvm_python_files = True + compiler_cfg.enable_tvm_constant_prop = True + if available_devices[0] == BackendDevice.Grayskull: + os.environ["PYBUDA_ENABLE_STABLE_SOFTMAX"] = "1" + elif variant == "microsoft/beit-large-patch16-224": + if available_devices[0] == BackendDevice.Grayskull: + compiler_cfg.retain_tvm_python_files = True + compiler_cfg.enable_tvm_constant_prop = True + os.environ["PYBUDA_ENABLE_STABLE_SOFTMAX"] = "1" + else: + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + + # Create PyBuda module from PyTorch model + image_processor = BeitImageProcessor.from_pretrained(variant) + model = BeitForImageClassification.from_pretrained(variant) + tt_model = pybuda.PyTorchModule("pt_beit_classif_16_224", model) + + # Get sample image + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + sample_image = Image.open(requests.get(url, stream=True).raw) + + # Preprocessing + img_tensor = image_processor(sample_image, return_tensors="pt").pixel_values + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([img_tensor])) + output = output_q.get()[0].value().detach().float().numpy() + + # Postprocessing + predicted_class_idx = output.argmax(-1).item() + + # Print output + print("Predicted class:", predicted_class_idx) + print(model.config.id2label[predicted_class_idx]) + + +if __name__ == "__main__": + run_beit_classify_224_hf_pytorch() diff --git a/model_demos/cv_demos/clip/pytorch_clip.py b/model_demos/cv_demos/clip/pytorch_clip.py index 03f87226..63c0c687 100644 --- a/model_demos/cv_demos/clip/pytorch_clip.py +++ b/model_demos/cv_demos/clip/pytorch_clip.py @@ -5,7 +5,9 @@ import torch from PIL import Image from transformers import CLIPModel, CLIPProcessor -from transformers.models.clip.modeling_clip import _expand_mask, _make_causal_mask + +# from transformers.models.clip.modeling_clip import _expand_mask, _make_causal_mask +from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask class 
CLIPVisionWrapper(torch.nn.Module): @@ -32,11 +34,15 @@ def forward(self, input_ids, attention_mask): hidden_states = self.clip_model.text_model.embeddings(input_ids=input_ids, position_ids=None) bsz, seq_len = input_shape - causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device) + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = _create_4d_causal_attention_mask( + input_shape, hidden_states.dtype, device=hidden_states.device + ) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) encoder_outputs = self.clip_model.text_model.encoder( inputs_embeds=hidden_states, diff --git a/model_demos/cv_demos/densenet/pytorch_densenet.py b/model_demos/cv_demos/densenet/pytorch_densenet.py index de5208b3..de81c683 100644 --- a/model_demos/cv_demos/densenet/pytorch_densenet.py +++ b/model_demos/cv_demos/densenet/pytorch_densenet.py @@ -50,14 +50,13 @@ def run_densenet_pytorch(variant="densenet121"): if available_devices[0] == BackendDevice.Wormhole_B0: compiler_cfg.default_dram_parameters = False compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b - elif available_devices[0] == BackendDevice.Grayskull: - os.environ["PYBUDA_RIBBON2"] = "1" elif variant == "densenet161": compiler_cfg.balancer_policy = "CNN" compiler_cfg.enable_t_streaming = True compiler_cfg.place_on_new_epoch("concatenate_131.dc.sparse_matmul.7.lc2") os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" # Device specific configurations available_devices = pybuda.detect_available_devices() diff --git a/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite0_1x1.py b/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite0_1x1.py new file mode 100644 index 00000000..451af0b3 --- /dev/null +++ b/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite0_1x1.py @@ -0,0 +1,63 @@ +# EfficientNet-Lite0 1x1 demo + +import os +import shutil +import tarfile + +import pybuda +import requests +import torch +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice + + +def run_efficientnet_lite0_1x1(): + + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16 + + # Set PyBUDA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Download model weights + MODEL = "efficientnet-lite0" + url = f"https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/lite/{MODEL}.tar.gz" + extract_to = "cv_demos/efficientnet_lite" + file_name = url.split("/")[-1] + response = 
requests.get(url, stream=True) + with open(file_name, "wb") as f: + f.write(response.content) + with tarfile.open(file_name, "r:gz") as tar: + tar.extractall(path=extract_to) + os.remove(file_name) + + # Load model path + tflite_path = f"cv_demos/efficientnet_lite/{MODEL}/{MODEL}-fp32.tflite" + tt_model = TFLiteModule("tflite_efficientnet_lite0", tflite_path) + + # Run inference on Tenstorrent device + input_shape = (1, 224, 224, 3) + input_tensor = torch.rand(input_shape) + + output_q = pybuda.run_inference(tt_model, inputs=([input_tensor])) + output = output_q.get()[0].value().detach().float().numpy() + print(output) + + # Remove remnant files + shutil.rmtree(extract_to + "/" + MODEL) + + +if __name__ == "__main__": + run_efficientnet_lite0_1x1() diff --git a/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite4_1x1.py b/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite4_1x1.py new file mode 100644 index 00000000..cfe61eec --- /dev/null +++ b/model_demos/cv_demos/efficientnet_lite/tflite_efficientnet_lite4_1x1.py @@ -0,0 +1,63 @@ +# EfficientNet-Lite4 1x1 demo + +import os +import shutil +import tarfile + +import pybuda +import requests +import torch +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice + + +def run_efficientnet_lite4_1x1(): + + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16 + + # Set PyBUDA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Download model weights + MODEL = "efficientnet-lite4" + url = f"https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/lite/{MODEL}.tar.gz" + extract_to = "cv_demos/efficientnet_lite" + file_name = url.split("/")[-1] + response = requests.get(url, stream=True) + with open(file_name, "wb") as f: + f.write(response.content) + with tarfile.open(file_name, "r:gz") as tar: + tar.extractall(path=extract_to) + os.remove(file_name) + + # Load model path + tflite_path = f"cv_demos/efficientnet_lite/{MODEL}/{MODEL}-fp32.tflite" + tt_model = TFLiteModule("tflite_efficientnet_lite4", tflite_path) + + # Run inference on Tenstorrent device + input_shape = (1, 320, 320, 3) + input_tensor = torch.rand(input_shape) + + output_q = pybuda.run_inference(tt_model, inputs=([input_tensor])) + output = output_q.get()[0].value().detach().float().numpy() + print(output) + + # Remove remnant files + shutil.rmtree(extract_to + "/" + MODEL) + + +if __name__ == "__main__": + run_efficientnet_lite4_1x1() diff --git a/model_demos/cv_demos/ghostnet/timm_ghostnet.py b/model_demos/cv_demos/ghostnet/timm_ghostnet.py new file mode 100644 index 00000000..914e5981 --- /dev/null +++ b/model_demos/cv_demos/ghostnet/timm_ghostnet.py @@ -0,0 +1,53 @@ +# GhostNet + +import os +import urllib + +import pybuda +import requests +import timm +import torch +from PIL import Image + + +def run_ghostnet_timm(): + # Set PyBuda configuration parameters + compiler_cfg = 
pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + model = timm.create_model("ghostnet_100", pretrained=True) + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule("ghostnet_100_timm_pt", model) + + data_config = timm.data.resolve_data_config({}, model=model) + transforms = timm.data.create_transform(**data_config) + + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + img = Image.open(requests.get(url, stream=True).raw).convert("RGB") + img_tensor = transforms(img).unsqueeze(0) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([img_tensor])) + output = output_q.get()[0].value() + + top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5) + + # Get imagenet class mappings + url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + + for i in range(top5_probabilities.size(1)): + class_idx = top5_class_indices[0, i].item() + class_prob = top5_probabilities[0, i].item() + class_label = categories[class_idx] + + print(f"{class_label} : {class_prob}") + + +if __name__ == "__main__": + run_ghostnet_timm() diff --git a/model_demos/cv_demos/hrnet/pytorch_hrnet_osmr.py b/model_demos/cv_demos/hrnet/pytorch_hrnet_osmr.py index 22459fc4..e51a3f14 100644 --- a/model_demos/cv_demos/hrnet/pytorch_hrnet_osmr.py +++ b/model_demos/cv_demos/hrnet/pytorch_hrnet_osmr.py @@ -1,5 +1,6 @@ # HRNet Demo Script +import os import urllib import pybuda @@ -22,6 +23,7 @@ def run_hrnet_osmr_pytorch(variant="hrnet_w18_small_v1"): compiler_cfg.balancer_policy = "CNN" compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Variant specific configurations if variant in ["hrnetv2_w44", "hrnetv2_w48"]: diff --git a/model_demos/cv_demos/hrnet/pytorch_hrnet_timm.py b/model_demos/cv_demos/hrnet/pytorch_hrnet_timm.py index 3f03ef2c..3f81b447 100644 --- a/model_demos/cv_demos/hrnet/pytorch_hrnet_timm.py +++ b/model_demos/cv_demos/hrnet/pytorch_hrnet_timm.py @@ -1,5 +1,6 @@ # HRNet Demo Script +import os import urllib import pybuda @@ -23,6 +24,7 @@ def run_hrnet_timm_pytorch(variant="hrnet_w18_small"): compiler_cfg.balancer_policy = "CNN" compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Variant specific configurations if variant == "hrnet_w48": diff --git a/model_demos/cv_demos/landmark/hand_landmark_lite_1x1.py b/model_demos/cv_demos/landmark/hand_landmark_lite_1x1.py new file mode 100644 index 00000000..937c420b --- /dev/null +++ b/model_demos/cv_demos/landmark/hand_landmark_lite_1x1.py @@ -0,0 +1,55 @@ +# Hand Landmark lite 1x1 demo + +import os + +import pybuda +import requests +import torch +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice + + +def run_hand_landmark_lite_1x1(): + + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + 
+ # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + # Set PyBUDA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_ENABLE_SINGLE_BUFFER_FALLBACK"] = "1" + + # Download model weights + url = "https://storage.googleapis.com/mediapipe-assets/hand_landmark_lite.tflite" + tflite_path = "cv_demos/landmark/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(tflite_path, "wb") as f: + f.write(response.content) + + # Load Hand Landmark model + tt_model = TFLiteModule("tflite_hand_landmark_lite", tflite_path) + + # Run inference on Tenstorrent device + input_shape = (1, 224, 224, 3) + input_tensor = torch.rand(input_shape) + output_q = pybuda.run_inference(tt_model, inputs=([input_tensor])) + output = output_q.get() + print(output) + + # Remove weight file + os.remove(tflite_path) + + +if __name__ == "__main__": + run_hand_landmark_lite_1x1() diff --git a/model_demos/cv_demos/landmark/palm_detection_lite_1x1.py b/model_demos/cv_demos/landmark/palm_detection_lite_1x1.py new file mode 100644 index 00000000..4ab7ae39 --- /dev/null +++ b/model_demos/cv_demos/landmark/palm_detection_lite_1x1.py @@ -0,0 +1,53 @@ +# Palm Detection Lite 1x1 demo + +import os + +import pybuda +import requests +import torch +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice + + +def run_palm_detection_lite_1x1(): + + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" + + # Set PyBUDA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Download model weights + url = "https://storage.googleapis.com/mediapipe-assets/palm_detection_lite.tflite" + tflite_path = "cv_demos/landmark/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(tflite_path, "wb") as f: + f.write(response.content) + + # Load Palm Detection model + tt_model = TFLiteModule("tflite_palm_detection_lite", tflite_path) + + # Run inference on Tenstorrent device + input_shape = (1, 192, 192, 3) + input_tensor = torch.rand(input_shape) + output_q = pybuda.run_inference(tt_model, inputs=([input_tensor])) + output = output_q.get() + print(output) + + # Remove weight file + os.remove(tflite_path) + + +if __name__ == "__main__": + run_palm_detection_lite_1x1() diff --git a/model_demos/cv_demos/landmark/pose_landmark_lite_1x1.py b/model_demos/cv_demos/landmark/pose_landmark_lite_1x1.py new file mode 100644 index 00000000..827320e5 --- /dev/null +++ b/model_demos/cv_demos/landmark/pose_landmark_lite_1x1.py @@ -0,0 +1,57 @@ +# Pose Landmark Lite 1x1 demo + +import os + +import 
pybuda +import requests +import torch +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice + + +def run_pose_landmark_lite_1x1(): + + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.enable_single_buffer_fallback = True + + # Set PyBUDA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_SPLIT_RESIZE2D"] = "128" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_MAX_CONCAT_INPUTS"] = "6" + + # Download model weights + url = "https://storage.googleapis.com/mediapipe-assets/pose_landmark_lite.tflite" + tflite_path = "cv_demos/landmark/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(tflite_path, "wb") as f: + f.write(response.content) + + # Load Pose Landmark model + tt_model = TFLiteModule("tflite_pose_landmark_light", tflite_path) + + # Run inference on Tenstorrent device + input_shape = (1, 256, 256, 3) + input_tensor = torch.rand(input_shape) + output_q = pybuda.run_inference(tt_model, inputs=([input_tensor])) + output = output_q.get() + print(output) + + # Remove weight file + os.remove(tflite_path) + + +if __name__ == "__main__": + run_pose_landmark_lite_1x1() diff --git a/model_demos/cv_demos/mlpmixer/timm_mlpmixer.py b/model_demos/cv_demos/mlpmixer/timm_mlpmixer.py new file mode 100644 index 00000000..06ce380d --- /dev/null +++ b/model_demos/cv_demos/mlpmixer/timm_mlpmixer.py @@ -0,0 +1,59 @@ +# MLP-Mixer - TIMM Demo Script + +import urllib + +import pybuda +import requests +import timm +import torch +from PIL import Image +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform + + +def run_mlpmixer_timm(): + + # Load MLP-Mixer feature extractor and model from TIMM + # "mixer_b16_224", "mixer_b16_224_in21k", "mixer_b16_224_miil", "mixer_b16_224_miil_in21k", + # "mixer_b32_224", "mixer_l16_224", "mixer_l16_224_in21k", + # "mixer_l32_224", "mixer_s16_224", "mixer_s32_224" + variant = "mixer_b16_224" + model = timm.create_model(variant, pretrained=True) + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + + # Load data sample + url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + label = "tiger" + + # Data preprocessing + pixel_values = transform(image).unsqueeze(0) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(pybuda.PyTorchModule(f"timm_{variant}", model), inputs=[(pixel_values,)]) + output = output_q.get() + + # Data postprocessing + probabilities = torch.nn.functional.softmax(output[0].value()[0], dim=0) + + # Get ImageNet class mappings + 
url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + + # Get top-k prediction + top1_prob, top1_catid = torch.topk(probabilities, 1) + predicted_label = categories[top1_catid] + + # Display output + print(f"True Label: {label} | Predicted Label: {predicted_label} | Predicted Probability: {top1_prob.item():.2f}") + + +if __name__ == "__main__": + run_mlpmixer_timm() diff --git a/model_demos/cv_demos/mobilenet_ssd/tflite_mobilenet_v2_ssd_1x1.py b/model_demos/cv_demos/mobilenet_ssd/tflite_mobilenet_v2_ssd_1x1.py new file mode 100644 index 00000000..42a68fcb --- /dev/null +++ b/model_demos/cv_demos/mobilenet_ssd/tflite_mobilenet_v2_ssd_1x1.py @@ -0,0 +1,60 @@ +# MobileNet SSD 1x1 Demo Script + +import os + +import pybuda +import requests +from PIL import Image +from pybuda import TFLiteModule +from pybuda._C.backend_api import BackendDevice +from torchvision import transforms + + +def run_mobilenetv2_ssd_1x1_tflite(): + + # Set PyBUDA configuration parameters + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] != BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported on Grayskull") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.cpu_fallback_ops = set(["concatenate"]) + + # Set PyBDUA environment variable + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Download model weights + url = "https://storage.googleapis.com/mediapipe-models/object_detector/ssd_mobilenet_v2/float32/latest/ssd_mobilenet_v2.tflite" + tflite_path = "cv_demos/mobilenet_ssd/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(tflite_path, "wb") as f: + f.write(response.content) + + # Load model path + tt_model = TFLiteModule("tflite_mobilenet_ssd", tflite_path) + + # Image preprocessing + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()]) + img_tensor = transform(image).permute((1, 2, 0)).unsqueeze(0) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([img_tensor])) + output = output_q.get() + print(output) + + # Remove weight file + os.remove(tflite_path) + + +if __name__ == "__main__": + run_mobilenetv2_ssd_1x1_tflite() diff --git a/model_demos/cv_demos/openpose/pytorch_lwopenpose_2d_osmr.py b/model_demos/cv_demos/openpose/pytorch_lwopenpose_2d_osmr.py new file mode 100644 index 00000000..5631f4d6 --- /dev/null +++ b/model_demos/cv_demos/openpose/pytorch_lwopenpose_2d_osmr.py @@ -0,0 +1,52 @@ +# LW-OpenPose 2D Demo Script + +import pybuda +import requests +from PIL import Image +from pytorchcv.model_provider import get_model as ptcv_get_model +from torchvision import transforms + + +def get_image_tensor(): + # Image processing + url = "https://raw.githubusercontent.com/axinc-ai/ailia-models/master/pose_estimation_3d/blazepose-fullbody/girl-5204299_640.jpg" + input_image = 
Image.open(requests.get(url, stream=True).raw) + preprocess = transforms.Compose( + [ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + return input_batch + + +def run_lwopenpose_2d_osmr_pytorch(): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "CNN" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + compiler_cfg.default_df_override = pybuda.DataFormat.Float16 + + # Create PyBuda module from PyTorch model + model = ptcv_get_model("lwopenpose2d_mobilenet_cmupan_coco", pretrained=True) + model.eval() + tt_model = pybuda.PyTorchModule("pt_lwopenpose_2d_osmr", model) + + input_batch = get_image_tensor() + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([input_batch])) + output = output_q.get()[0].value() + + # Print output + print(output) + + +if __name__ == "__main__": + run_lwopenpose_2d_osmr_pytorch() diff --git a/model_demos/cv_demos/openpose/pytorch_lwopenpose_3d_osmr.py b/model_demos/cv_demos/openpose/pytorch_lwopenpose_3d_osmr.py new file mode 100644 index 00000000..982d5f5c --- /dev/null +++ b/model_demos/cv_demos/openpose/pytorch_lwopenpose_3d_osmr.py @@ -0,0 +1,53 @@ +# LW-OpenPose 3D Demo Script + +import pybuda +import requests +from PIL import Image +from pytorchcv.model_provider import get_model as ptcv_get_model +from torchvision import transforms + + +def get_image_tensor(): + # Image processing + url = "https://raw.githubusercontent.com/axinc-ai/ailia-models/master/pose_estimation_3d/blazepose-fullbody/girl-5204299_640.jpg" + input_image = Image.open(requests.get(url, stream=True).raw) + preprocess = transforms.Compose( + [ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + return input_batch + + +def run_lwopenpose_3d_osmr_pytorch(): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "CNN" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + compiler_cfg.default_df_override = pybuda.DataFormat.Float16 + + # Create PyBuda module from PyTorch model + model = ptcv_get_model("lwopenpose3d_mobilenet_cmupan_coco", pretrained=True) + model.eval() + tt_model = pybuda.PyTorchModule("pt_lwopenpose_3d_osmr", model) + + # Get sample input + input_batch = get_image_tensor() + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([input_batch])) + output = output_q.get()[0].value() + + # Print output + print(output) + + +if __name__ == "__main__": + run_lwopenpose_3d_osmr_pytorch() diff --git a/model_demos/cv_demos/resnet/onnx_resnet.py b/model_demos/cv_demos/resnet/onnx_resnet.py new file mode 100644 index 00000000..867c0f6c --- /dev/null +++ b/model_demos/cv_demos/resnet/onnx_resnet.py @@ -0,0 +1,116 @@ +# ResNet Demo Script - ONNX +# Uses torch and torchvision for data pre- and post-processing; +# can use other frameworks such as MXNet, TensorFlow or NumPy + +import os +import 
urllib + +import onnx +import pybuda +import requests +import torch +from PIL import Image +from torchvision import transforms + + +def preprocess(image: Image) -> torch.tensor: + """Image preprocessing for ResNet50 + + Parameters + ---------- + image : PIL.Image + PIL Image sample + + Returns + ------- + torch.tensor + Preprocessed input tensor + """ + transform_fn = transforms.Compose( + [ + transforms.Resize([256, 256]), + transforms.RandomCrop(224), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + pixel_values = transform_fn(image).unsqueeze(0) + + return pixel_values + + +def postprocess(predictions: torch.tensor) -> tuple: + """Model prediction postprocessing for ResNet50 + + Parameters + ---------- + predictions : torch.tensor + Model predictions + + Returns + ------- + tuple + topk probability and category ID + """ + + # Get probabilities + probabilities = torch.nn.functional.softmax(predictions, dim=0) + + # Get top-k prediction + top1_prob, top1_catid = torch.topk(probabilities, 1) + + return top1_prob, top1_catid + + +def run_resnet_onnx(): + + # Download model weights + url = "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-v1-7.onnx?download=" + load_path = "cv_demos/resnet/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(load_path, "wb") as f: + f.write(response.content) + + # Load ResNet feature extractor and model checkpoint from HuggingFace + model = onnx.load(load_path) + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.balancer_policy = "CNN" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Load data sample + url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" + image = Image.open(requests.get(url, stream=True).raw) + label = "tiger" + + # Data preprocessing + pixel_values = preprocess(image) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference( + pybuda.OnnxModule("onnx_resnet50", model, load_path), + inputs=[(pixel_values,)], + ) + output = output_q.get() + + # Data postprocessing + top1_prob, top1_catid = postprocess(output[0].value()[0]) + + # Get ImageNet class mappings + url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + predicted_label = categories[top1_catid] + + # Results + print(f"True Label: {label} | Predicted Label: {predicted_label} | Predicted Probability: {top1_prob.item():.2f}") + + # Remove weight file + os.remove(load_path) + + +if __name__ == "__main__": + run_resnet_onnx() diff --git a/model_demos/cv_demos/retinanet/onnx_retinanet_r101.py b/model_demos/cv_demos/retinanet/onnx_retinanet_r101.py new file mode 100644 index 00000000..7e3e4945 --- /dev/null +++ b/model_demos/cv_demos/retinanet/onnx_retinanet_r101.py @@ -0,0 +1,81 @@ +# import PyBuda library + +import os + +import numpy as np +import onnx +import pybuda +import requests +import torch +from PIL import Image + + +def img_preprocess(scal_val=1): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + pil_img = Image.open(requests.get(url, stream=True).raw) + scale = scal_val + w, h = pil_img.size + print("----", w, h) + newW, 
newH = int(scale * w), int(scale * h) + newW, newH = 640, 480 + assert newW > 0 and newH > 0, "Scale is too small, resized images would have no pixel" + pil_img = pil_img.resize((newW, newH), resample=Image.BICUBIC) + img = np.asarray(pil_img, dtype=np.float32) + if img.ndim == 2: + img = img[np.newaxis, ...] + else: + img = img.transpose((2, 0, 1)) + if (img > 1).any(): + img = img / 255.0 + img = torch.from_numpy(img) + img = img.unsqueeze(0) + return img + + +def run_retinanet_r101_640x480_onnx(): + + # Set PyBuda configuration parameters + os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{75*1024}" + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] == pybuda.BackendDevice.Grayskull: + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_356"] = 3 + + # Download model weights + url = "https://github.com/onnx/models/raw/main/validated/vision/object_detection_segmentation/retinanet/model/retinanet-9.onnx?download=" + load_path = "cv_demos/retinanet/" + url.split("/")[-1] + response = requests.get(url, stream=True) + with open(load_path, "wb") as f: + f.write(response.content) + + # Create PyBuda module from PyTorch model + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule("onnx_retinanet", model, load_path) + + # Image preprocessing + img_tensor = img_preprocess() + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([img_tensor])) + output = output_q.get() + + # Print outputs + print(output) + + # Remove weight file + os.remove(load_path) + + +if __name__ == "__main__": + run_retinanet_r101_640x480_onnx() diff --git a/model_demos/cv_demos/stable_diffusion/pytorch_stable_diffusion.py b/model_demos/cv_demos/stable_diffusion/pytorch_stable_diffusion.py index 7f20b359..6f8ce01d 100644 --- a/model_demos/cv_demos/stable_diffusion/pytorch_stable_diffusion.py +++ b/model_demos/cv_demos/stable_diffusion/pytorch_stable_diffusion.py @@ -4,7 +4,6 @@ from typing import List, Optional, Union import pybuda -import pytest import torch from diffusers import StableDiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput @@ -236,8 +235,8 @@ def run_stable_diffusion_pytorch(variant="CompVis/stable-diffusion-v1-4"): available_devices = pybuda.detect_available_devices() if available_devices: if available_devices[0] == pybuda._C.backend_api.BackendDevice.Grayskull: - pytest.skip("Model not supported on Grayskull") raise NotImplementedError("Model not supported on Grayskull") + # Set inference steps num_inference_steps = 50 diff --git a/model_demos/cv_demos/unet/pytorch_unet_torchhub.py b/model_demos/cv_demos/unet/pytorch_unet_torchhub.py index 2375e3dd..0530e883 100644 --- a/model_demos/cv_demos/unet/pytorch_unet_torchhub.py +++ b/model_demos/cv_demos/unet/pytorch_unet_torchhub.py @@ -15,7 +15,7 @@ def run_unet_torchhub_pytorch(): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - 
compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.balancer_policy = "CNN" compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" diff --git a/model_demos/cv_demos/vgg/pytorch_vgg_hf.py b/model_demos/cv_demos/vgg/pytorch_vgg_hf.py index adf6e267..725ed201 100644 --- a/model_demos/cv_demos/vgg/pytorch_vgg_hf.py +++ b/model_demos/cv_demos/vgg/pytorch_vgg_hf.py @@ -25,6 +25,9 @@ def run_vgg_19_hf_pytorch(variant="vgg19"): if available_devices: if available_devices[0] == BackendDevice.Grayskull: os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + else: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" """ # https://pypi.org/project/vgg-pytorch/ diff --git a/model_demos/cv_demos/vgg/pytorch_vgg_osmr.py b/model_demos/cv_demos/vgg/pytorch_vgg_osmr.py index 5e879c96..d01cdf89 100644 --- a/model_demos/cv_demos/vgg/pytorch_vgg_osmr.py +++ b/model_demos/cv_demos/vgg/pytorch_vgg_osmr.py @@ -22,10 +22,13 @@ def run_vgg_osmr_pytorch(variant="vgg11"): # Device specific configurations available_devices = pybuda.detect_available_devices() - if variant in ["vgg11", "vgg13", "vgg16", "vgg19", "bn_vgg19", "bn_vgg19b"]: - if available_devices: - if available_devices[0] == BackendDevice.Grayskull: + if available_devices: + if available_devices[0] == BackendDevice.Grayskull: + if variant in ["vgg11", "vgg13", "vgg16", "vgg19", "bn_vgg19", "bn_vgg19b"]: os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + else: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" # Create PyBuda module from PyTorch model model_ckpt = variant diff --git a/model_demos/cv_demos/vgg/pytorch_vgg_timm.py b/model_demos/cv_demos/vgg/pytorch_vgg_timm.py index a64cc4ae..4ad29bb2 100644 --- a/model_demos/cv_demos/vgg/pytorch_vgg_timm.py +++ b/model_demos/cv_demos/vgg/pytorch_vgg_timm.py @@ -1,5 +1,6 @@ # VGG +import os import urllib import pybuda @@ -7,6 +8,7 @@ import timm import torch from PIL import Image +from pybuda._C.backend_api import BackendDevice from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform @@ -49,6 +51,13 @@ def run_vgg_bn19_timm_pytorch(variant="vgg19_bn"): compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + # Device specific configurations + available_devices = pybuda.detect_available_devices() + if available_devices: + if available_devices[0] == BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" + # Create PyBuda module from PyTorch model tt_model = pybuda.PyTorchModule(model_name + "_timm_pt", model) diff --git a/model_demos/cv_demos/vgg/pytorch_vgg_torchhub.py b/model_demos/cv_demos/vgg/pytorch_vgg_torchhub.py index 9a8aeb63..82de2740 100644 --- a/model_demos/cv_demos/vgg/pytorch_vgg_torchhub.py +++ b/model_demos/cv_demos/vgg/pytorch_vgg_torchhub.py @@ -24,6 +24,9 @@ def run_vgg_bn19_torchhub_pytorch(variant="vgg19_bn"): if available_devices: if available_devices[0] == BackendDevice.Grayskull: os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + else: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" # Create PyBuda module from PyTorch model model = torch.hub.load("pytorch/vision:v0.10.0", variant, pretrained=True) diff --git 
a/model_demos/cv_demos/vilt/pytorch_vilt_maskedlm.py b/model_demos/cv_demos/vilt/pytorch_vilt_maskedlm.py new file mode 100644 index 00000000..fa8228a1 --- /dev/null +++ b/model_demos/cv_demos/vilt/pytorch_vilt_maskedlm.py @@ -0,0 +1,73 @@ +import os + +import pybuda +import requests +import torch +from PIL import Image +from transformers import ViltConfig, ViltForMaskedLM, ViltProcessor + +from .vilt_model import ViLtEmbeddingWrapper, ViltModelWrapper + + +def run_vilt_maskedlm_pytorch(variant="dandelin/vilt-b32-mlm"): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.enable_t_streaming = True + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" + + # Sample Image + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + sample_image = Image.open(requests.get(url, stream=True).raw) + + # Sample text + sample_text = "a bunch of cats laying on a [MASK]." + + model_ckpt = variant + + # Set model configurations + config = ViltConfig.from_pretrained(model_ckpt) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config = ViltConfig(**config_dict) + + # Load model and processor from HuggingFace + processor = ViltProcessor.from_pretrained(model_ckpt) + model = ViltForMaskedLM.from_pretrained(model_ckpt, config=config) + model.eval() + + # prepare inputs + encoding = processor(sample_image, sample_text, return_tensors="pt") + + # Wrapper + text_vision_embedding_model = ViLtEmbeddingWrapper(model) + vilt_model = ViltModelWrapper(model=model, task="maskedlm", text_seq_len=encoding["input_ids"].shape[1]) + + embedding_output, attention_mask = text_vision_embedding_model(**encoding) + + tt0 = pybuda.TTDevice("tt0", module=pybuda.PyTorchModule("pt_vilt_maskedlm", vilt_model)) + tt0.push_to_inputs((embedding_output.detach().cpu(), attention_mask.detach().cpu().to(torch.float32))) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(_sequential=True) + mlm_logits = output_q.get()[0].value().detach().float() + + # PostProcessing + input_ids = encoding["input_ids"][0][1:-1] + mlm_logits = mlm_logits[0, 1 : encoding.input_ids.shape[1] - 1, :] + + mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1) + mlm_values[input_ids != 103] = 0 + select = mlm_values.argmax().item() + inferred_token = processor.decode(mlm_ids[select].item()) + + # Model Output (i.e Masked token: Couch) + print("Masked token: ", inferred_token) + + +if __name__ == "__main__": + run_vilt_maskedlm_pytorch() diff --git a/model_demos/cv_demos/vilt/pytorch_vilt_question_answering.py b/model_demos/cv_demos/vilt/pytorch_vilt_question_answering.py new file mode 100644 index 00000000..991b3a8e --- /dev/null +++ b/model_demos/cv_demos/vilt/pytorch_vilt_question_answering.py @@ -0,0 +1,66 @@ +import os + +import pybuda +import requests +import torch +from PIL import Image +from transformers import ViltConfig, ViltForQuestionAnswering, ViltProcessor + +from .vilt_model import ViLtEmbeddingWrapper, ViltModelWrapper + + +def run_vilt_for_question_answering_pytorch(variant="dandelin/vilt-b32-finetuned-vqa"): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.enable_t_streaming = True + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + + 
os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" + + # Sample Image + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + sample_image = Image.open(requests.get(url, stream=True).raw) + + # Sample text + sample_text = "How many cats are there?" + + model_ckpt = variant + + # Set model configurations + config = ViltConfig.from_pretrained(model_ckpt) # matmul_2008 + config_dict = config.to_dict() + config_dict["return_dict"] = False + config = ViltConfig(**config_dict) + + # Load model and processor from HuggingFace + processor = ViltProcessor.from_pretrained(model_ckpt) + model = ViltForQuestionAnswering.from_pretrained(model_ckpt, config=config) + model.eval() + + # Sample inputs + encoding = processor(sample_image, sample_text, return_tensors="pt") + + # Wrapper + text_vision_embedding_model = ViLtEmbeddingWrapper(model) + viltquestionanswering_model = ViltModelWrapper(model, task="qa") + + embedding_output, attention_mask = text_vision_embedding_model(**encoding) + + tt0 = pybuda.TTDevice("tt0", module=pybuda.PyTorchModule("pt_viltquestionanswering", viltquestionanswering_model)) + + tt0.push_to_inputs(embedding_output.detach().cpu(), attention_mask.detach().cpu().to(torch.float32)) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(_sequential=True) + + # Model output (i.e Predicted answer: 2) + output = output_q.get()[0].value().detach().float() + idx = output.argmax(-1).item() + print("Predicted answer: ", model.config.id2label[idx]) + + +if __name__ == "__main__": + run_vilt_for_question_answering_pytorch() diff --git a/model_demos/cv_demos/vilt/vilt_model.py b/model_demos/cv_demos/vilt/vilt_model.py new file mode 100644 index 00000000..d1ac342e --- /dev/null +++ b/model_demos/cv_demos/vilt/vilt_model.py @@ -0,0 +1,83 @@ +import torch + + +class ViLtEmbeddingWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.vilt_model = model + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + pixel_values=None, + pixel_mask=None, + inputs_embeds=None, + image_embeds=None, + image_token_type_idx=None, + ): + + embeddings, masks = self.vilt_model.vilt.embeddings( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + image_token_type_idx=image_token_type_idx, + ) + return embeddings, masks + + +class ViltModelWrapper(torch.nn.Module): + def __init__(self, model, task=None, text_seq_len=None): + super().__init__() + self.vilt_model = model + self.task = task + self.text_seq_len = text_seq_len + + def forward(self, embedding_output, attention_mask, head_mask=None): + + head_mask = self.vilt_model.vilt.get_head_mask(head_mask, self.vilt_model.vilt.config.num_hidden_layers) + + extended_attention_mask = attention_mask[:, None, None, :] + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(torch.float32).min + + encoder_outputs = self.vilt_model.vilt.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + return_dict=False, + ) + + sequence_output = encoder_outputs[0] + + sequence_output = self.vilt_model.vilt.layernorm(sequence_output) + pooled_output = ( + self.vilt_model.vilt.pooler(sequence_output) if self.vilt_model.vilt.pooler is not None else None + ) + + viltmodel_output = (sequence_output, pooled_output) + encoder_outputs[1:] + + 
sequence_output, pooled_output = viltmodel_output[:2] + + if self.task == "maskedlm": + + if self.text_seq_len is None: + raise ValueError("You must provide text sequence length") + + text_features, _ = (sequence_output[:, : self.text_seq_len], sequence_output[:, self.text_seq_len :]) + + mlm_logits = self.vilt_model.mlm_score(text_features) + + viltmodel_output = (mlm_logits,) + viltmodel_output[2:] + + if self.task == "qa": + + logits = self.vilt_model.classifier(pooled_output) + + viltmodel_output = (logits,) + viltmodel_output[2:] + + return viltmodel_output diff --git a/model_demos/cv_demos/wideresnet/pytorch_wideresnet_timm.py b/model_demos/cv_demos/wideresnet/pytorch_wideresnet_timm.py new file mode 100644 index 00000000..3fa3f2b4 --- /dev/null +++ b/model_demos/cv_demos/wideresnet/pytorch_wideresnet_timm.py @@ -0,0 +1,62 @@ +# WideResNet + +import os +import urllib + +import pybuda +import requests +import timm +import torch +from PIL import Image +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform + + +def run_wideresnet_timm_pytorch(variant="wide_resnet50_2"): + """ + Variants = { + 'wide_resnet50_2', + 'wide_resnet101_2' + } + """ + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + model_name = variant + model = timm.create_model(model_name, pretrained=True) + + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + img = Image.open(requests.get(url, stream=True).raw).convert("RGB") + tensor = transform(img).unsqueeze(0) # transform and add batch dimension + + # Get imagenet class mappings + url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule(model_name + "_timm_pt", model) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([tensor])) + output = output_q.get()[0].value() + + # Postprocessing + probabilities = torch.nn.functional.softmax(output[0], dim=0) + + # Print top categories per image + top5_prob, top5_catid = torch.topk(probabilities, 5) + for i in range(top5_prob.size(0)): + print(categories[top5_catid[i]], top5_prob[i].item()) + + +if __name__ == "__main__": + run_wideresnet_timm_pytorch() diff --git a/model_demos/cv_demos/wideresnet/pytorch_wideresnet_torchhub.py b/model_demos/cv_demos/wideresnet/pytorch_wideresnet_torchhub.py new file mode 100644 index 00000000..608d1203 --- /dev/null +++ b/model_demos/cv_demos/wideresnet/pytorch_wideresnet_torchhub.py @@ -0,0 +1,64 @@ +# WideResNet Demo Script + +import os +import urllib + +import pybuda +import requests +import torch +from PIL import Image +from torchvision import transforms + + +def run_wideresnet_torchhub_pytorch(variant="wide_resnet50_2"): + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = 
pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + # Create PyBuda module from PyTorch model + model = torch.hub.load("pytorch/vision:v0.10.0", variant, pretrained=True) + + model_name = f"pt_{variant}" + + tt_model = pybuda.PyTorchModule(model_name, model) + + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + input_image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + # preprocessing + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([input_batch])) + output = output_q.get()[0].value() + + # Data postprocessing + probabilities = torch.nn.functional.softmax(output[0], dim=0) + + # Get imagenet class mappings + url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + + # Print top categories per image + top5_prob, top5_catid = torch.topk(probabilities, 5) + result = {} + for i in range(top5_prob.size(0)): + result[categories[top5_catid[i]]] = top5_prob[i].item() + print(result) + + +if __name__ == "__main__": + run_wideresnet_torchhub_pytorch() diff --git a/model_demos/cv_demos/xception/timm_xception.py b/model_demos/cv_demos/xception/timm_xception.py new file mode 100644 index 00000000..956a7266 --- /dev/null +++ b/model_demos/cv_demos/xception/timm_xception.py @@ -0,0 +1,76 @@ +# Xception + +import os +import urllib + +import pybuda +import requests +import timm +import torch +from PIL import Image +from pybuda._C.backend_api import BackendDevice +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform + + +def run_xception_timm(variant="xception"): + """ + Variants = { + 'xception', + 'xception41', + 'xception65', + 'xception71' + } + """ + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.enable_t_streaming = True + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + available_devices = pybuda.detect_available_devices() + + if variant == "xception": + if available_devices[0] == BackendDevice.Wormhole_B0: + compiler_cfg.balancer_policy = "CNN" + elif available_devices[0] == BackendDevice.Grayskull: + compiler_cfg.amp_level = 1 + compiler_cfg.place_on_new_epoch("relu_74") + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" + os.environ["PYBUDA_PAD_SPARSE_MM"] = "{43:48}" + + model_name = variant + model = timm.create_model(model_name, pretrained=True) + + # preprocessing + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + img = Image.open(requests.get(url, stream=True).raw).convert("RGB") + tensor = transform(img).unsqueeze(0) # transform and add batch dimension + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule(f"{variant}_timm_pt", model) + + # Run inference on 
Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([tensor])) + output = output_q.get()[0].value() + + # postprocessing + probabilities = torch.nn.functional.softmax(output[0], dim=0) + + # Get imagenet class mappings + url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" + image_classes = urllib.request.urlopen(url) + categories = [s.decode("utf-8").strip() for s in image_classes.readlines()] + + # Print top categories per image + top5_prob, top5_catid = torch.topk(probabilities, 5) + for i in range(top5_prob.size(0)): + print(categories[top5_catid[i]], top5_prob[i].item()) + + +if __name__ == "__main__": + run_xception_timm() diff --git a/model_demos/cv_demos/yolo_v3/holli_src/utils.py b/model_demos/cv_demos/yolo_v3/holli_src/utils.py new file mode 100644 index 00000000..bfbcb5a2 --- /dev/null +++ b/model_demos/cv_demos/yolo_v3/holli_src/utils.py @@ -0,0 +1,256 @@ +import math + +import numpy as np +import PIL +import torch +from matplotlib import patches, patheffects +from matplotlib import pyplot as plt +from PIL import Image + + +def nms(boxes, nms_thresh): + if len(boxes) == 0: + return boxes + + confs = [(1 - b[4]) for b in boxes] + sorted_idx = np.argsort(confs) + out_boxes = [] + + for i in range(len(boxes)): + box_i = boxes[sorted_idx[i]] + if confs[i] > -1: + out_boxes.append(box_i) + for j in range(i + 1, len(boxes)): + if confs[j] > -1: + box_j = boxes[sorted_idx[j]] + if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: + confs[j] = -1 + return out_boxes + + +def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=True): + model.eval() + img = image2torch(img) + img = img.to(torch.device("cuda" if use_cuda else "cpu")) + all_boxes = model.predict_img(img)[0] + boxes = nms(all_boxes, nms_thresh) + return boxes + + +def image2torch(img): + if isinstance(img, Image.Image): + width = img.width + height = img.height + img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) + img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous() + img = img.view(1, 3, height, width) + img = img.float().div(255.0) + elif type(img) == np.ndarray: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + else: + print("unknown image type") + exit(-1) + return img + + +def bbox_iou(box1, box2, x1y1x2y2=True): + if x1y1x2y2: + x1_min = min(box1[0], box2[0]) + x2_max = max(box1[2], box2[2]) + y1_min = min(box1[1], box2[1]) + y2_max = max(box1[3], box2[3]) + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + else: + w1, h1 = box1[2], box1[3] + w2, h2 = box2[2], box2[3] + x1_min = min(box1[0] - w1 / 2.0, box2[0] - w2 / 2.0) + x2_max = max(box1[0] + w1 / 2.0, box2[0] + w2 / 2.0) + y1_min = min(box1[1] - h1 / 2.0, box2[1] - h2 / 2.0) + y2_max = max(box1[1] + h1 / 2.0, box2[1] + h2 / 2.0) + + w_union = x2_max - x1_min + h_union = y2_max - y1_min + w_cross = w1 + w2 - w_union + h_cross = h1 + h2 - h_union + carea = 0 + if w_cross <= 0 or h_cross <= 0: + return 0.0 + + area1 = w1 * h1 + area2 = w2 * h2 + carea = w_cross * h_cross + uarea = area1 + area2 - carea + return float(carea / uarea) + + +def multi_bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + x1_min = torch.min(boxes1[0], boxes2[0]) + x2_max = torch.max(boxes1[2], boxes2[2]) + y1_min = torch.min(boxes1[1], boxes2[1]) + y2_max = torch.max(boxes1[3], boxes2[3]) + w1, h1 = boxes1[2] - boxes1[0], boxes1[3] - boxes1[1] + w2, h2 = boxes2[2] - boxes2[0], boxes2[3] - boxes2[1] 
+ else: + w1, h1 = boxes1[2], boxes1[3] + w2, h2 = boxes2[2], boxes2[3] + x1_min = torch.min(boxes1[0] - w1 / 2.0, boxes2[0] - w2 / 2.0) + x2_max = torch.max(boxes1[0] + w1 / 2.0, boxes2[0] + w2 / 2.0) + y1_min = torch.min(boxes1[1] - h1 / 2.0, boxes2[1] - h2 / 2.0) + y2_max = torch.max(boxes1[1] + h1 / 2.0, boxes2[1] + h2 / 2.0) + + w_union = x2_max - x1_min + h_union = y2_max - y1_min + w_cross = w1 + w2 - w_union + h_cross = h1 + h2 - h_union + mask = ((w_cross <= 0) + (h_cross <= 0)) > 0 + area1 = w1 * h1 + area2 = w2 * h2 + carea = w_cross * h_cross + carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +# Plotting helpers + +# e.g. plot_multi_detections(img_tensor, model.predict_img(img_tensor)) +def plot_multi_detections(imgs, results, figsize=None, **kwargs): + if not figsize: + figsize = (12, min(math.ceil(len(imgs) / 3) * 4, 30)) + _, axes = plt.subplots(math.ceil(len(imgs) / 3), 3, figsize=figsize) + + if type(imgs) == np.ndarray and len(imgs.shape) == 4: + imgs = [imgs] + + classes = [] + boxes = [] + extras = [] + for r in results: + res = np.array([[float(b) for b in arr] for arr in r]) + if len(res) > 0: + cla = res[:, -1].astype(int) + b = res[:, 0:4] + e = ["{:.2f} ({:.2f})".format(float(y[4]), float(y[5])) for y in res] + else: + cla, b, e = [], [], [] + classes.append(cla) + boxes.append(b) + extras.append(e) + + for j, ax in enumerate(axes.flat): + if j >= len(imgs): + # break + plt.delaxes(ax) + else: + plot_img_boxes(imgs[j], boxes[j], classes[j], extras[j], plt_ax=ax, **kwargs) + + plt.tight_layout() + + +def plot_img_detections(img, result_boxes, **kwargs): + b = np.array(result_boxes) + if len(b) > 0: + classes = b[:, -1].astype(int) + boxes = b[:, 0:4] + else: + classes, boxes = [], [] + extras = ["{:.2f} ({:.2f})".format(b[4], b[5]) for b in result_boxes] + return plot_img_boxes(img, boxes, classes, extras=extras, **kwargs) + + +def plot_img_data(x, y, rows=2, figsize=(12, 8), **kwargs): + _, axes = plt.subplots(rows, 3, figsize=figsize) + + for j, ax in enumerate(axes.flat): + if j >= len(y): + break + targets = y[j] + if isinstance(targets, torch.Tensor): + targets = targets.clone().reshape(-1, 5) + classes = targets[:, 0].cpu().numpy().astype(int) + else: + classes = targets[:, 0].astype(int) + plot_img_boxes(x[j], targets[:, 1:], classes, plt_ax=ax, **kwargs) + + plt.tight_layout() + + +def plot_img_boxes( + img, + boxes, + classes, + extras=None, + plt_ax=None, + figsize=None, + class_names=None, + real_pixels=False, + box_centered=True, +): + if not plt_ax: + _, plt_ax = plt.subplots(figsize=figsize) + colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]) + + if type(img) == PIL.Image.Image: + width = img.width + height = img.height + elif type(img) in [torch.Tensor, np.ndarray]: + if type(img) == torch.Tensor: + img = img.clone().cpu().numpy() + width = img.shape[2] + height = img.shape[1] + img = img.transpose(1, 2, 0) + if (img < 1.01).all() and (img >= 0).all(): + img = img.clip(0, 1) # avoid "Clipping input data to the valid range" warning after tensor roundings + else: + raise (f"Unkown type for image: {type(img)}") + + if len(boxes) > 0 and not real_pixels: + boxes[:, 0] *= width + boxes[:, 2] *= width + boxes[:, 1] *= height + boxes[:, 3] *= height + + for i in range(len(boxes)): + b, class_id = boxes[i], classes[i] + if b[0] == 0: + break + + color = colors[class_id % len(colors)] + + if box_centered: + x, y = (b[0] - b[2] / 2, b[1] - b[3] / 2) + w, h = (b[2], b[3]) + else: + x, y = b[0], b[1] + 
w, h = b[2], b[3] + + patch = plt_ax.add_patch(patches.Rectangle([x, y], w, h, fill=False, edgecolor=color, lw=2)) + patch.set_path_effects( + [ + patheffects.Stroke(linewidth=3, foreground="black", alpha=0.5), + patheffects.Normal(), + ] + ) + + s = class_names[class_id] if class_names else str(class_id) + if extras: + s += "\n" + str(extras[i]) + patch = plt_ax.text( + x + 2, + y, + s, + verticalalignment="top", + color=color, + fontsize=16, + weight="bold", + ) + patch.set_path_effects( + [ + patheffects.Stroke(linewidth=1, foreground="black", alpha=0.5), + patheffects.Normal(), + ] + ) + + _ = plt_ax.imshow(img) diff --git a/model_demos/cv_demos/yolo_v3/holli_src/yolo_layer.py b/model_demos/cv_demos/yolo_v3/holli_src/yolo_layer.py new file mode 100644 index 00000000..ddd4a268 --- /dev/null +++ b/model_demos/cv_demos/yolo_v3/holli_src/yolo_layer.py @@ -0,0 +1,227 @@ +import math + +import numpy as np +import torch +import torch.nn as nn + +from .utils import multi_bbox_ious + + +class YoloLayer(nn.Module): + def __init__(self, anchors, stride, num_classes): + super().__init__() + self.anchors, self.stride = np.array(anchors), stride + self.num_classes = num_classes + + def get_masked_anchors(self): + return self.anchors / self.stride + + def get_region_boxes(self, output, conf_thresh): + if output.dim() == 3: + output = output.unsqueeze(0) + device = output.device # torch.device(torch_device) + anchors = torch.from_numpy(self.get_masked_anchors().astype(np.float32)).to(device) + + nB = output.size(0) + nA = len(anchors) + nC = self.num_classes + nH = output.size(2) + nW = output.size(3) + cls_anchor_dim = nB * nA * nH * nW + + assert output.size(1) == (5 + nC) * nA + + output = output.view(nB * nA, 5 + nC, nH * nW).transpose(0, 1).contiguous().view(5 + nC, cls_anchor_dim) + + grid_x = torch.linspace(0, nW - 1, nW).repeat(nB * nA, nH, 1).view(cls_anchor_dim).to(device) + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(cls_anchor_dim).to(device) + ix = torch.LongTensor(range(0, 2)).to(device) + anchor_w = anchors.index_select(1, ix[0]).repeat(1, nB, nH * nW).view(cls_anchor_dim) + anchor_h = anchors.index_select(1, ix[1]).repeat(1, nB, nH * nW).view(cls_anchor_dim) + + xs, ys = ( + torch.sigmoid(output[0]) + grid_x, + torch.sigmoid(output[1]) + grid_y, + ) + ws, hs = ( + torch.exp(output[2]) * anchor_w.detach(), + torch.exp(output[3]) * anchor_h.detach(), + ) + det_confs = torch.sigmoid(output[4]) + + cls_confs = torch.nn.Softmax(dim=1)(output[5 : 5 + nC].transpose(0, 1)).detach() + cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) + cls_max_confs = cls_max_confs.view(-1) + cls_max_ids = cls_max_ids.view(-1) + + det_confs = det_confs.to("cpu") # , non_blocking=True for torch 4.1? + cls_max_confs = cls_max_confs.to("cpu") + cls_max_ids = cls_max_ids.to("cpu") + xs, ys = xs.to("cpu"), ys.to("cpu") + ws, hs = ws.to("cpu"), hs.to("cpu") + + all_boxes = [[] for i in range(nB)] + + inds = torch.LongTensor(range(0, len(det_confs))) + for ind in inds[det_confs > conf_thresh]: + bcx = xs[ind] + bcy = ys[ind] + bw = ws[ind] + bh = hs[ind] + box = [ + bcx / nW, + bcy / nH, + bw / nW, + bh / nH, + det_confs[ind], + cls_max_confs[ind], + cls_max_ids[ind], + ] + box = [i.item() for i in box] + + batch = math.ceil(ind / (nA * nH * nW)) + all_boxes[batch].append(box) + + return all_boxes + + def build_targets(self, pred_boxes, target, anchors, nH, nW): + self.ignore_thresh = 0.5 + self.truth_thresh = 1.0 + + # Works faster on CPU than on GPU. 
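+        # Move predictions, targets, and anchors onto the CPU before building the training masks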
+ devi = torch.device("cpu") + pred_boxes = pred_boxes.to(devi) + target = target.to(devi) + anchors = anchors.to(devi) + + nB = target.size(0) + nA = len(anchors) + + anchor_step = anchors.size(1) # anchors[nA][anchor_step] + conf_mask = torch.ones(nB, nA, nH, nW) + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tcoord = torch.zeros(4, nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + twidth, theight = nW, nH + nAnchors = nA * nH * nW + + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors : (b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + tbox = target[b].view(-1, 5) + + # If the bounding box prior is not the best but does overlap a ground truth object by + # more than some threshold we ignore the prediction (conf_mask) + for t in range(tbox.size(0)): + if tbox[t][1] == 0: + break + gx, gy = tbox[t][1] * nW, tbox[t][2] * nH + gw, gh = tbox[t][3] * twidth, tbox[t][4] * theight + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max( + cur_ious, + multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False), + ) + ignore_ix = cur_ious > self.ignore_thresh + conf_mask[b][ignore_ix.view(nA, nH, nW)] = 0 + + for t in range(tbox.size(0)): + if tbox[t][1] == 0: + break + gx, gy = tbox[t][1] * nW, tbox[t][2] * nH + gw, gh = tbox[t][3] * twidth, tbox[t][4] * theight + gw, gh = gw.float(), gh.float() + gi, gj = int(gx), int(gy) + + tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA, 1).t() + anchor_boxes = torch.cat((torch.zeros(nA, anchor_step), anchors), 1).t() + _, best_n = torch.max( + multi_bbox_ious(tmp_gt_boxes, anchor_boxes, x1y1x2y2=False), + 0, + ) + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = 1 + tcoord[0][b][best_n][gj][gi] = gx - gi + tcoord[1][b][best_n][gj][gi] = gy - gj + tcoord[2][b][best_n][gj][gi] = math.log(gw / anchors[best_n][0]) + tcoord[3][b][best_n][gj][gi] = math.log(gh / anchors[best_n][1]) + tcls[b][best_n][gj][gi] = tbox[t][0] + tconf[b][best_n][gj][gi] = 1 # yolov1 would have used iou-value here + + return coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls + + def get_loss(self, output, target, return_single_value=True): + device = output.device + + anchors = torch.from_numpy(self.get_masked_anchors().astype(np.float32)).to(device) + + nB = output.data.size(0) # batch size + nA = len(anchors) + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + cls_anchor_dim = nB * nA * nH * nW + + output = output.view(nB, nA, (5 + nC), nH, nW) + + ix = torch.LongTensor(range(0, 5)).to(device) + coord = ( + output.index_select(2, ix[0:4]) + .view(nB * nA, -1, nH * nW) + .transpose(0, 1) + .contiguous() + .view(4, cls_anchor_dim) + ) # x, y, w, h + coord[0:2] = coord[0:2].sigmoid() # x, y: bx = σ(tx) (+ cx) + conf = output.index_select(2, ix[4]).view(nB, nA, nH, nW).sigmoid() + + grid_x = torch.linspace(0, nW - 1, nW).repeat(nB * nA, nH, 1).view(cls_anchor_dim).to(device) + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(cls_anchor_dim).to(device) + anchor_w = anchors.index_select(1, ix[0]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + anchor_h = anchors.index_select(1, ix[1]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + + pred_boxes = torch.FloatTensor(4, cls_anchor_dim).to(device) + pred_boxes[0] = coord[0] + grid_x # bx = σ(tx) + cx + pred_boxes[1] = coord[1] + grid_y + pred_boxes[2] = coord[2].exp() * 
anchor_w # pw*e(tw) + pred_boxes[3] = coord[3].exp() * anchor_h + pred_boxes = pred_boxes.transpose(0, 1).contiguous().view(-1, 4) + + ( + coord_mask, + conf_mask, + cls_mask, + tcoord, + tconf, + tcls, + ) = self.build_targets(pred_boxes.detach(), target.detach(), anchors.detach(), nH, nW) + + cls_grid = torch.linspace(5, 5 + nC - 1, nC).long().to(device) + cls = output.index_select(2, cls_grid) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(cls_anchor_dim, nC) + cls_mask = cls_mask == 1 + tcls = tcls[cls_mask].long().view(-1) + cls_mask = cls_mask.view(-1, 1).repeat(1, nC).to(device) + cls = cls[cls_mask].view(-1, nC) + + tcoord = tcoord.view(4, cls_anchor_dim).to(device) + tconf, tcls = tconf.to(device), tcls.to(device) + coord_mask, conf_mask = coord_mask.view(cls_anchor_dim).to(device), conf_mask.to(device) + + loss_coord = nn.MSELoss(size_average=False)(coord * coord_mask, tcoord * coord_mask) / 2 + loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) + loss_cls = nn.CrossEntropyLoss(size_average=False)(cls, tcls) if cls.size(0) > 0 else 0 + loss = loss_coord + loss_conf + loss_cls + + if math.isnan(loss.item()): + print(conf, tconf) + raise ValueError("YoloLayer has isnan in loss") + + if return_single_value: + return loss + else: + return [loss, loss_coord, loss_conf, loss_cls] diff --git a/model_demos/cv_demos/yolo_v3/holli_src/yolov3.py b/model_demos/cv_demos/yolo_v3/holli_src/yolov3.py new file mode 100644 index 00000000..10b303c9 --- /dev/null +++ b/model_demos/cv_demos/yolo_v3/holli_src/yolov3.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from .yolo_layer import * +from .yolov3_base import * + + +class Yolov3(Yolov3Base): + def __init__(self, num_classes=80): + super().__init__() + self.backbone = Darknet([1, 2, 8, 8, 4]) + + anchors_per_region = 3 + self.yolo_0_pre = Yolov3UpsamplePrep([512, 1024], 1024, anchors_per_region * (5 + num_classes)) + self.yolo_0 = YoloLayer( + anchors=[(116.0, 90.0), (156.0, 198.0), (373.0, 326.0)], + stride=32, + num_classes=num_classes, + ) + + self.yolo_1_c = ConvBN(512, 256, 1) + self.yolo_1_prep = Yolov3UpsamplePrep([256, 512], 512 + 256, anchors_per_region * (5 + num_classes)) + self.yolo_1 = YoloLayer( + anchors=[(30.0, 61.0), (62.0, 45.0), (59.0, 119.0)], + stride=16, + num_classes=num_classes, + ) + + self.yolo_2_c = ConvBN(256, 128, 1) + self.yolo_2_prep = Yolov3UpsamplePrep([128, 256], 256 + 128, anchors_per_region * (5 + num_classes)) + self.yolo_2 = YoloLayer( + anchors=[(10.0, 13.0), (16.0, 30.0), (33.0, 23.0)], + stride=8, + num_classes=num_classes, + ) + + def get_loss_layers(self): + return [self.yolo_0, self.yolo_1, self.yolo_2] + + def forward_yolo(self, xb): + x, y0 = self.yolo_0_pre(xb[-1]) + + x = self.yolo_1_c(x) + x = nn.Upsample(scale_factor=2, mode="nearest")(x) + x = torch.cat([x, xb[-2]], 1) + x, y1 = self.yolo_1_prep(x) + + x = self.yolo_2_c(x) + x = nn.Upsample(scale_factor=2, mode="nearest")(x) + x = torch.cat([x, xb[-3]], 1) + x, y2 = self.yolo_2_prep(x) + + return [y0, y1, y2] + + +# Backbone and helper modules + + +class DarknetBlock(nn.Module): + def __init__(self, ch_in): + super().__init__() + ch_hid = ch_in // 2 + self.conv1 = ConvBN(ch_in, ch_hid, kernel_size=1, stride=1, padding=0) + self.conv2 = ConvBN(ch_hid, ch_in, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + return self.conv2(self.conv1(x)) + x + + +class Darknet(nn.Module): + def __init__(self, num_blocks, start_nf=32): + super().__init__() + nf = start_nf + self.base 
= ConvBN(3, nf, kernel_size=3, stride=1) # , padding=1) + self.layers = [] + for i, nb in enumerate(num_blocks): + # dn_layer = make_group_layer(nf, nb, stride=(1 if i==-1 else 2)) + dn_layer = self.make_group_layer(nf, nb, stride=2) + self.add_module(f"darknet_{i}", dn_layer) + self.layers.append(dn_layer) + nf *= 2 + + def make_group_layer(self, ch_in, num_blocks, stride=2): + layers = [ConvBN(ch_in, ch_in * 2, stride=stride)] + for i in range(num_blocks): + layers.append(DarknetBlock(ch_in * 2)) + return nn.Sequential(*layers) + + def forward(self, x): + y = [self.base(x)] + for layer in self.layers: + y.append(layer(y[-1])) + return y + + +class Yolov3UpsamplePrep(nn.Module): + def __init__(self, filters_list, in_filters, out_filters): + super().__init__() + self.branch = nn.ModuleList( + [ + ConvBN(in_filters, filters_list[0], 1), + ConvBN(filters_list[0], filters_list[1], kernel_size=3), + ConvBN(filters_list[1], filters_list[0], kernel_size=1), + ConvBN(filters_list[0], filters_list[1], kernel_size=3), + ConvBN(filters_list[1], filters_list[0], kernel_size=1), + ] + ) + self.for_yolo = nn.ModuleList( + [ + ConvBN(filters_list[0], filters_list[1], kernel_size=3), + nn.Conv2d( + filters_list[1], + out_filters, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), + ] + ) + + def forward(self, x): + for m in self.branch: + x = m(x) + branch_out = x + for m in self.for_yolo: + x = m(x) + return branch_out, x diff --git a/model_demos/cv_demos/yolo_v3/holli_src/yolov3_base.py b/model_demos/cv_demos/yolo_v3/holli_src/yolov3_base.py new file mode 100644 index 00000000..565374ff --- /dev/null +++ b/model_demos/cv_demos/yolo_v3/holli_src/yolov3_base.py @@ -0,0 +1,113 @@ +import importlib +from abc import ABCMeta, abstractmethod +from collections import Iterable, OrderedDict, defaultdict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .yolo_layer import * + + +class Yolov3Base(nn.Module, metaclass=ABCMeta): + def __init__(self): + super().__init__() + + @abstractmethod + def get_loss_layers(self): + return [self.yolo_0, self.yolo_1] + + def forward_backbone(self, x): + return self.backbone(x) + + def forward(self, x): + shape = x.shape + assert ( + shape[1] == 3 and shape[2] % 32 == 0 and shape[3] % 32 == 0 + ), f"Tensor shape should be [bs, 3, x*32, y*32], was {shape}" + xb = self.forward_backbone(x) + return self.forward_yolo(xb) + + def boxes_from_output(self, outputs, conf_thresh=0.25): + all_boxes = [[] for j in range(outputs[0].size(0))] + for i, layer in enumerate(self.get_loss_layers()): + layer_boxes = layer.get_region_boxes(outputs[i], conf_thresh=conf_thresh) + for j, layer_box in enumerate(layer_boxes): + all_boxes[j] += layer_box + + return all_boxes + + def predict_img(self, imgs, conf_thresh=0.25): + self.eval() + if len(imgs.shape) == 3: + imgs = imgs.unsqueeze(-1) + + outputs = self.forward(imgs) + return self.boxes_from_output(outputs, conf_thresh) + + def freeze_backbone(self, requires_grad=False): + for _, p in self.backbone.named_parameters(): + p.requires_grad = requires_grad + + def unfreeze(self): + for _, p in self.named_parameters(): + p.requires_grad = True + + def freeze_info(self, print_all=False): + d = defaultdict(set) + print("Layer: param.requires_grad") + for name, param in self.named_parameters(): + if print_all: + print(f"{name}: {param.requires_grad}") + else: + d[name.split(".")[0]].add(param.requires_grad) + if not print_all: + for k, v in d.items(): + print(k, ": ", v) + + def load_backbone(self, h5_path): + 
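+        # Copy only the checkpoint weights whose shapes match this model; mismatched layers are skipped and reported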
state_old = self.state_dict() + state_new = torch.load(h5_path) + + skipped_layers = [] + for k in list(state_new.keys()): + if state_old[k].shape != state_new[k].shape: + skipped_layers.append(k) + del state_new[k] + + return self.load_state_dict(state_new, strict=False), skipped_layers + + +# Common helper modules + + +class ConvBN(nn.Module): + "convolutional layer then batchnorm" + + def __init__(self, ch_in, ch_out, kernel_size=3, stride=1, padding=None): + super().__init__() + if padding is None: + padding = (kernel_size - 1) // 2 # we should never need to set padding + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False, + ) + self.bn = nn.BatchNorm2d(ch_out, momentum=0.01) + self.relu = nn.LeakyReLU(0.1, inplace=True) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class Upsample(nn.Module): + def __init__(self, stride=2): + super().__init__() + self.stride = stride + + def forward(self, x): + assert x.data.dim() == 4 + return nn.Upsample(scale_factor=self.stride, mode="nearest")(x) diff --git a/model_demos/cv_demos/yolo_v3/holli_src/yolov3_tiny.py b/model_demos/cv_demos/yolo_v3/holli_src/yolov3_tiny.py new file mode 100644 index 00000000..7e87202f --- /dev/null +++ b/model_demos/cv_demos/yolo_v3/holli_src/yolov3_tiny.py @@ -0,0 +1,134 @@ +import torch +import torch.nn as nn + +from .yolo_layer import * +from .yolov3_base import * + + +class Yolov3Tiny(Yolov3Base): + def __init__(self, num_classes, use_wrong_previous_anchors=False): + super().__init__() + + self.num_classes = num_classes + self.return_out_boxes = False + self.skip_backbone = False + + self.backbone = Yolov3TinyBackbone() + + anchors_per_region = 3 + self.yolo_0_pre = nn.Sequential( + OrderedDict( + [ + ("14_convbatch", ConvBN(256, 512, 3, 1, 1)), + ( + "15_conv", + nn.Conv2d( + 512, + anchors_per_region * (5 + self.num_classes), + 1, + 1, + 0, + ), + ), + ] + ) + ) + self.yolo_0 = YoloLayer( + anchors=[(81.0, 82.0), (135.0, 169.0), (344.0, 319.0)], + stride=32, + num_classes=num_classes, + ) + + self.up_1 = nn.Sequential( + OrderedDict( + [ + ("17_convbatch", ConvBN(256, 128, 1, 1, 0)), + ("18_upsample", Upsample(2)), + ] + ) + ) + + self.yolo_1_pre = nn.Sequential( + OrderedDict( + [ + ("19_convbatch", ConvBN(128 + 256, 256, 3, 1, 1)), + ( + "20_conv", + nn.Conv2d( + 256, + anchors_per_region * (5 + self.num_classes), + 1, + 1, + 0, + ), + ), + ] + ) + ) + + # Tiny yolo weights were originally trained using wrong anchor mask + # https://github.com/pjreddie/darknet/commit/f86901f6177dfc6116360a13cc06ab680e0c86b0#diff-2b0e16f442a744897f1606ff1a0f99d3L175 + if use_wrong_previous_anchors: + yolo_1_anchors = [(23.0, 27.0), (37.0, 58.0), (81.0, 82.0)] + else: + yolo_1_anchors = [(10.0, 14.0), (23.0, 27.0), (37.0, 58.0)] + + self.yolo_1 = YoloLayer(anchors=yolo_1_anchors, stride=16.0, num_classes=num_classes) + + def get_loss_layers(self): + return [self.yolo_0, self.yolo_1] + + def forward_yolo(self, xb): + x_b_0, x_b_full = xb[0], xb[1] + y0 = self.yolo_0_pre(x_b_full) + + x_up = self.up_1(x_b_full) + x_up = torch.cat((x_up, x_b_0), 1) + y1 = self.yolo_1_pre(x_up) + + return [y0, y1] + + +# Backbone and helper modules + + +class MaxPoolStride1(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = F.max_pool2d(F.pad(x, (0, 1, 0, 1), mode="replicate"), 2, stride=1) + return x + + +class Yolov3TinyBackbone(nn.Module): + def __init__(self, input_channels=3): + super().__init__() + self.layers_list = 
OrderedDict(
+            [
+                ("0_convbatch", ConvBN(input_channels, 16, 3, 1, 1)),
+                ("1_max", nn.MaxPool2d(2, 2)),
+                ("2_convbatch", ConvBN(16, 32, 3, 1, 1)),
+                ("3_max", nn.MaxPool2d(2, 2)),
+                ("4_convbatch", ConvBN(32, 64, 3, 1, 1)),
+                ("5_max", nn.MaxPool2d(2, 2)),
+                ("6_convbatch", ConvBN(64, 128, 3, 1, 1)),
+                ("7_max", nn.MaxPool2d(2, 2)),
+                ("8_convbatch", ConvBN(128, 256, 3, 1, 1)),
+                ("9_max", nn.MaxPool2d(2, 2)),
+                ("10_convbatch", ConvBN(256, 512, 3, 1, 1)),
+                ("11_max", MaxPoolStride1()),
+                ("12_convbatch", ConvBN(512, 1024, 3, 1, 1)),
+                (
+                    "13_convbatch",
+                    ConvBN(1024, 256, 1, 1, 0),
+                ),  # padding = kernel_size-1//2
+            ]
+        )
+        self.layers = nn.Sequential(self.layers_list)
+        self.idx = 9
+
+    def forward(self, x):
+        x_b_0 = self.layers[: self.idx](x)
+        x_b_full = self.layers[self.idx :](x_b_0)
+        return x_b_0, x_b_full
diff --git a/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli.py b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli.py
new file mode 100644
index 00000000..baa6c01c
--- /dev/null
+++ b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli.py
@@ -0,0 +1,63 @@
+import os
+
+import pybuda
+import requests
+from PIL import Image
+from pybuda._C.backend_api import BackendDevice
+
+from cv_demos.yolo_v3.holli_src import utils
+from cv_demos.yolo_v3.holli_src.yolov3 import *
+
+
+def run_yolov3_holli_pytorch():
+
+    # Set PyBuda configuration parameters
+    compiler_cfg = pybuda.config._get_global_compiler_config()  # load global compiler config object
+    compiler_cfg.balancer_policy = "Ribbon"
+    compiler_cfg.enable_t_streaming = True
+    compiler_cfg.default_df_override = pybuda._C.Float16_b
+    os.environ["PYBUDA_RIBBON2"] = "1"
+
+    # Device specific configurations
+    available_devices = pybuda.detect_available_devices()
+    if available_devices:
+        if available_devices[0] == BackendDevice.Grayskull:
+            os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1"
+
+    # Download model weights
+    url = "https://www.ollihuotari.com/data/yolov3_pytorch/yolov3_coco_01.h5"
+    load_path = "cv_demos/yolo_v3/" + url.split("/")[-1]
+    response = requests.get(url, stream=True)
+    with open(load_path, "wb") as f:
+        f.write(response.content)
+
+    # Load model
+    model = Yolov3(num_classes=80)
+    model.load_state_dict(
+        torch.load(
+            load_path,
+            map_location=torch.device("cpu"),
+        )
+    )
+    model.eval()
+
+    # Create PyBuda module from PyTorch model
+    tt_model = pybuda.PyTorchModule("pytorch_yolov3_holli", model)
+
+    sz = 512
+    image_url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg"
+    img_org = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+    img_resized = img_org.resize((sz, sz))
+    img_tensor = utils.image2torch(img_resized)
+
+    # Run inference on Tenstorrent device
+    output_q = pybuda.run_inference(tt_model, inputs=([img_tensor]))
+    output = output_q.get()
+    print(output)
+
+    # Remove weight file
+    os.remove(load_path)
+
+
+if __name__ == "__main__":
+    run_yolov3_holli_pytorch()
diff --git a/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli_1x1.py b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli_1x1.py
new file mode 100644
index 00000000..84928790
--- /dev/null
+++ b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_holli_1x1.py
@@ -0,0 +1,74 @@
+import os
+
+import pybuda
+import requests
+from PIL import Image
+from pybuda._C.backend_api import BackendDevice
+
+from cv_demos.yolo_v3.holli_src import utils
+from cv_demos.yolo_v3.holli_src.yolov3 import *
+
+
+def run_yolov3_holli_pytorch_1x1():
+
+    # Check device compatibility (this variant only runs on Wormhole B0)
+    available_devices = 
pybuda.detect_available_devices()
+    if available_devices:
+        if available_devices[0] != BackendDevice.Wormhole_B0:
+            raise NotImplementedError("Model not supported on Grayskull")
+
+    # Set PyBuda configuration parameters
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.balancer_policy = "Ribbon"
+    compiler_cfg.enable_t_streaming = True
+    compiler_cfg.default_df_override = pybuda._C.Float16_b
+
+    # Device specific configurations
+    available_devices = pybuda.detect_available_devices()
+    if available_devices:
+        if available_devices[0] == BackendDevice.Grayskull:
+            os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1"
+
+    # Set PyBUDA environment variables
+    os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml"
+    os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1"
+    os.environ["PYBUDA_RIBBON2"] = "1"
+
+    # Download model weights
+    url = "https://www.ollihuotari.com/data/yolov3_pytorch/yolov3_coco_01.h5"
+    load_path = "cv_demos/yolo_v3/" + url.split("/")[-1]
+    response = requests.get(url, stream=True)
+    with open(load_path, "wb") as f:
+        f.write(response.content)
+
+    # Load model
+    model = Yolov3(num_classes=80)
+    model.load_state_dict(
+        torch.load(
+            load_path,
+            map_location=torch.device("cpu"),
+        )
+    )
+    model.eval()
+
+    # Create PyBuda module from PyTorch model
+    tt_model = pybuda.PyTorchModule("pytorch_yolov3_holli_1x1", model)
+
+    # Load sample image
+    sz = 512
+    image_url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg"
+    img_org = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+    img_resized = img_org.resize((sz, sz))
+    img_tensor = utils.image2torch(img_resized)
+
+    # Run inference on Tenstorrent device
+    output_q = pybuda.run_inference(tt_model, inputs=([img_tensor]))
+    output = output_q.get()
+    print(output)
+
+    # Remove weight file
+    os.remove(load_path)
+
+
+if __name__ == "__main__":
+    run_yolov3_holli_pytorch_1x1()
diff --git a/model_demos/cv_demos/yolo_v3/pytorch_yolov3_tiny_holli.py b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_tiny_holli.py
new file mode 100644
index 00000000..c0330c7e
--- /dev/null
+++ b/model_demos/cv_demos/yolo_v3/pytorch_yolov3_tiny_holli.py
@@ -0,0 +1,57 @@
+import os
+
+import pybuda
+import requests
+from PIL import Image
+
+from cv_demos.yolo_v3.holli_src import utils
+from cv_demos.yolo_v3.holli_src.yolo_layer import *
+from cv_demos.yolo_v3.holli_src.yolov3_tiny import *
+
+
+def run_yolov3_tiny_holli_pytorch():
+
+    # Set PyBuda configuration parameters
+    compiler_cfg = pybuda.config._get_global_compiler_config()
+    compiler_cfg.balancer_policy = "Ribbon"
+    compiler_cfg.enable_t_streaming = True
+    compiler_cfg.enable_auto_fusing = False
+    compiler_cfg.default_df_override = pybuda._C.Float16_b
+
+    # Download model weights
+    url = "https://www.ollihuotari.com/data/yolov3_pytorch/yolov3_tiny_coco_01.h5"
+    load_path = "cv_demos/yolo_v3/" + url.split("/")[-1]
+    response = requests.get(url, stream=True)
+    with open(load_path, "wb") as f:
+        f.write(response.content)
+
+    # Load model
+    model = Yolov3Tiny(num_classes=80, use_wrong_previous_anchors=True)
+    model.load_state_dict(
+        torch.load(
+            load_path,
+            map_location=torch.device("cpu"),
+        )
+    )
+    model.eval()
+
+    # Create PyBuda module from PyTorch model
+    tt_model = pybuda.PyTorchModule("pytorch_yolov3_tiny_holli", model)
+
+    sz = 512
+    image_url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg"
+    img_org = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
+    img_resized = 
img_org.resize((sz, sz)) + img_tensor = utils.image2torch(img_resized) + + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([img_tensor])) + output = output_q.get() + print(output) + + # Remove weight file + os.remove(load_path) + + +if __name__ == "__main__": + run_yolov3_tiny_holli_pytorch() diff --git a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_320.py b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_320.py index 899a0b8c..17bbc3ea 100644 --- a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_320.py +++ b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_320.py @@ -20,9 +20,7 @@ def run_pytorch_yolov5_320(variant="yolov5s"): compiler_cfg.enable_conv_prestride = True compiler_cfg.enable_tvm_constant_prop = True os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" - - if variant == "yolov5m": - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Load YOLOv5 model # Variants: yolov5n, yolov5s, yolov5m, yolov5l, yolov5x @@ -49,7 +47,7 @@ def run_pytorch_yolov5_320(variant="yolov5s"): os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" compiler_cfg.enable_auto_fusing = False - elif model_ckpt in ["yolov5n", "yolov5s"]: + elif model_ckpt in ["yolov5n", "yolov5s", "yolov5m"]: compiler_cfg.enable_auto_fusing = False else: print("not a supported device!") diff --git a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_480.py b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_480.py index aed0d84b..485574b9 100644 --- a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_480.py +++ b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_480.py @@ -20,39 +20,52 @@ def run_pytorch_yolov5_480(variant="yolov5s"): compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Device specific configurations available_devices = pybuda.detect_available_devices() if available_devices: if available_devices[0] == BackendDevice.Grayskull: - compiler_cfg.default_dram_parameters = True # Set PyBUDA environment variables os.environ["PYBUDA_PAD_SPARSE_MM"] = "{113:128}" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{16*1024}" - - if variant in ["yolov5s"]: - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" - if variant in ["yolov5m", "yolov5l"]: - compiler_cfg.enable_auto_fusing = False - compiler_cfg.enable_enumerate_u_kt = False - - if variant in ["yolov5n", "yolov5x"]: + if variant == "yolov5m": + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6, 1) + ) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{32*1024}" + if variant == "yolov5l": os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + if variant == "yolov5x": + os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + compiler_cfg.balancer_op_override( + "concatenate_40.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6, 1) + ) + compiler_cfg.balancer_op_override("conv2d_41.dc.matmul.8", "grid_shape", (5, 5)) elif available_devices[0] == BackendDevice.Wormhole_B0: # Set PyBUDA environment variables compiler_cfg.enable_auto_fusing = False compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b 
compiler_cfg.default_dram_parameters = True + os.environ["PYBUDA_RIBBON2"] = "1" os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{64*1024}" if variant == "yolov5m": - os.environ["PYBUDA_RIBBON2"] = "1" - if variant == "yolov5x": + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6, 1) + ) + elif variant == "yolov5l": + compiler_cfg.place_on_new_epoch("concatenate_208.dc.concatenate.0") + elif variant == "yolov5x": + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - if variant == "yolov5n" or variant == "yolov5l" or variant == "yolov5x": - if variant == "yolov5l": - compiler_cfg.place_on_new_epoch("concatenate_208.dc.concatenate.0") - elif variant == "yolov5x": - os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" else: print("not a supported device!") sys.exit() diff --git a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_640.py b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_640.py index f021d8a9..b53b2412 100644 --- a/model_demos/cv_demos/yolo_v5/pytorch_yolov5_640.py +++ b/model_demos/cv_demos/yolo_v5/pytorch_yolov5_640.py @@ -24,24 +24,39 @@ def run_pytorch_yolov5_640(variant="yolov5s"): compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" - - # Model specific configurations - if model_ckpt == "yolov5l": - compiler_cfg.enable_auto_transposing_placement = True + os.environ["PYBUDA_DISABLE_CAP_SPARSE_MM_FIDELITY"] = "1" + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Device specific configurations available_devices = pybuda.detect_available_devices() if available_devices: if available_devices[0] == BackendDevice.Grayskull: # Set PyBUDA environment variables - if model_ckpt in ["yolov5s", "yolov5m", "yolov5l", "yolov5x"]: - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{65*1024}" - os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + compiler_cfg.enable_enumerate_u_kt = False + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" compiler_cfg.enable_tm_cpu_fallback = True compiler_cfg.enable_conv_prestride = True - compiler_cfg.enable_enumerate_u_kt = False os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + if model_ckpt in ["yolov5s", "yolov5m", "yolov5l", "yolov5x"]: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{65*1024}" + os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + if model_ckpt in ["yolov5l", "yolov5x"]: + os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" + os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.balancer_op_override("conv2d_41.dc.matmul.8", "grid_shape", (5, 5)) + if model_ckpt == "yolov5x": + compiler_cfg.enable_enumerate_u_kt = True + compiler_cfg.place_on_new_epoch("concatenate_40.dc.select.28") + compiler_cfg.place_on_new_epoch("conv2d_210.dc.matmul.11") + if model_ckpt in ["yolov5m"]: + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + if variant == "yolov5n": + 
os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" elif available_devices[0] == BackendDevice.Wormhole_B0: os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" os.environ["PYBUDA_MAX_GRAPH_CUT_RETRY"] = "100" @@ -56,10 +71,16 @@ def run_pytorch_yolov5_640(variant="yolov5s"): compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b if model_ckpt in ["yolov5n", "yolov5m"]: compiler_cfg.enable_tm_cpu_fallback = False + if model_ckpt in ["yolov5s", "yolov5n", "yolov5l"]: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{64*1024}" + if model_ckpt == "yolov5n": + compiler_cfg.balancer_op_override( + "concatenate_19.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (3, 1) + ) if model_ckpt == "yolov5m": compiler_cfg.balancer_op_override("concatenate_260.dc.concatenate.0", "grid_shape", (1, 1)) os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" if model_ckpt == "yolov5l": compiler_cfg.enable_auto_transposing_placement = True compiler_cfg.enable_tm_cpu_fallback = True @@ -67,8 +88,10 @@ def run_pytorch_yolov5_640(variant="yolov5s"): os.environ["PYBUDA_RIBBON2"] = "1" if model_ckpt == "yolov5x": compiler_cfg.balancer_op_override("concatenate_363.dc.concatenate.0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_41.dc.matmul.8", "t_stream_shape", (1, 1)) os.environ["PYBUDA_RIBBON2"] = "1" compiler_cfg.enable_tm_cpu_fallback = True + os.environ["PYBUDA_DISABLE_CAP_SPARSE_MM_FIDELITY"] = "0" else: print("not a supported device!") sys.exit() diff --git a/model_demos/nlp_demos/codegen/pytorch_codegen_causal_lm.py b/model_demos/nlp_demos/codegen/pytorch_codegen_causal_lm.py index 84c078d2..7b44a52c 100644 --- a/model_demos/nlp_demos/codegen/pytorch_codegen_causal_lm.py +++ b/model_demos/nlp_demos/codegen/pytorch_codegen_causal_lm.py @@ -24,6 +24,7 @@ def run_codegen_causal_lm(variant="Salesforce/codegen-350M-mono"): if available_devices: if available_devices[0] == BackendDevice.Grayskull: compiler_cfg.default_dram_parameters = False + compiler_cfg.balancer_policy = "Ribbon" # DRAM stream limit compiler_cfg.balancer_op_override("matmul_1829", "grid_shape", (2, 8)) diff --git a/model_demos/nlp_demos/falcon/pytorch_falcon.py b/model_demos/nlp_demos/falcon/pytorch_falcon.py index 4633a287..9831582c 100644 --- a/model_demos/nlp_demos/falcon/pytorch_falcon.py +++ b/model_demos/nlp_demos/falcon/pytorch_falcon.py @@ -1,7 +1,6 @@ # Falcon-7B Demo Script import pybuda -import pytest from pybuda._C.backend_api import BackendDevice from nlp_demos.falcon.utils.model import Falcon @@ -11,7 +10,6 @@ def run_falcon_pytorch(): available_devices = pybuda.detect_available_devices() if available_devices: if available_devices[0] == BackendDevice.Grayskull: - pytest.skip("Model not supported on Grayskull") raise NotImplementedError("Model not supported on Grayskull") # Load model from HuggingFace diff --git a/model_demos/nlp_demos/fuyu8b/pytorch_fuyu8b_past_cache.py b/model_demos/nlp_demos/fuyu8b/pytorch_fuyu8b_past_cache.py new file mode 100644 index 00000000..ea6d7054 --- /dev/null +++ b/model_demos/nlp_demos/fuyu8b/pytorch_fuyu8b_past_cache.py @@ -0,0 +1,282 @@ +# Fuyu8b Demo - Conditional Generation + +import os + +import pybuda +import requests +import torch +import torch.nn as nn +from PIL import Image +from pybuda._C.backend_api import BackendDevice, BackendType +from pybuda.pybudaglobal import TILE_DIM +from pybuda.utils import align_up_tile 
+from transformers import ( + AutoTokenizer, + FuyuConfig, + FuyuForCausalLM, + FuyuImageProcessor, + FuyuProcessor, + LogitsProcessorList, +) +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask + + +def generate_fuyu_embedding(model, input_ids, image_patches, image_patches_indices): + inputs_embeds = model.language_model.get_input_embeddings()(input_ids) + patch_embeddings = model.vision_embed_tokens(image_patches.to(model.vision_embed_tokens.weight.dtype)) + inputs_embeds = model.gather_continuous_embeddings( + word_embeddings=inputs_embeds, + continuous_embeddings=patch_embeddings, + image_patch_input_indices=image_patches_indices, + ) + return inputs_embeds + + +class FuyuModelImgDecoderWrapper(nn.Module): + def __init__(self, model): + super().__init__() + self.fuyu_model = model + self.fuyu_config = model.config + + def forward(self, inputs_embeds, attention_mask): + batch_size, seq_length, hidden_dim = inputs_embeds.shape + position_ids = torch.arange(seq_length, dtype=torch.long) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + hidden_states = inputs_embeds + + presents = [] + for idx, decoder_layer in enumerate(self.fuyu_model.language_model.model.layers): + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=False, + use_cache=True, + ) + + hidden_states = layer_outputs[0] + presents.append(layer_outputs[1]) + + hidden_states = self.fuyu_model.language_model.model.final_layernorm(hidden_states) + return hidden_states, *presents + + +class FuyuModelTxtDecoderWrapper(nn.Module): + def __init__(self, model): + super().__init__() + self.fuyu_model = model + self.fuyu_config = model.config + + def forward(self, inputs_embeds, attention_mask, position_ids, *past_key_values): + batch_size, seq_length, _ = inputs_embeds.shape + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values[0].shape[-2] + ) + + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + hidden_states = inputs_embeds + + presents = [] + for idx, decoder_layer in enumerate(self.fuyu_model.language_model.model.layers): + pkv = tuple([past_key_values[(idx * 2) + j] for j in range(2)]) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=pkv, + output_attentions=False, + use_cache=True, + ) + + hidden_states = layer_outputs[0] + presents.append(layer_outputs[1]) + + hidden_states = self.fuyu_model.language_model.model.final_layernorm(hidden_states) + return hidden_states, *presents + + +def run_fuyu8b_past_cache(): + # Skip tests + available_devices = pybuda.detect_available_devices() + if available_devices[0] == BackendDevice.Grayskull or available_devices[0] == BackendDevice.Wormhole_B0: + raise NotImplementedError("Model not supported.") + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.enable_tvm_cpu_fallback = False + compiler_cfg.compile_subgraphs = True + compiler_cfg.convert_framework_params_to_tvm = False + compiler_cfg.enable_link_past_cache_ios = True + compiler_cfg.amp_level = 2 + compiler_cfg.default_dram_parameters = True + os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "FastCut" + os.environ["PYBUDA_RIBBON2"] 
= "1" + os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{84*1024}" + for i in range(0, 36): + compiler_cfg.balancer_op_override(f"matmul_{i*80+68}", "grid_shape", (1, 8)) + compiler_cfg.balancer_op_override( + f"pt_fuyu8b_past_cache_img.output_concatenate_{i*80+41}_stack", "grid_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + f"pt_fuyu8b_past_cache_img.output_transpose_{i*80+53}_stack", "grid_shape", (1, 1) + ) + compiler_cfg.balancer_op_override(f"transpose_{i*80+91}.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + compiler_cfg.balancer_op_override(f"transpose_{i*80+111}.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + compiler_cfg.balancer_op_override(f"transpose_{(i-1)*160+281}.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + for i in range(69): + compiler_cfg.balancer_op_override(f"transpose_{i*80+262}.dc.sparse_matmul.4.lc2", "grid_shape", (2, 1)) + for i in range(17): + compiler_cfg.balancer_op_override(f"transpose_{i*160+3081}.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + + # Setup Fuyu8b config + config = FuyuConfig.from_pretrained("adept/fuyu-8b") + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config_dict["text_config"]["max_position_embeddings"] = 448 # 512 + config_dict["text_config"][ + "pad_token_id" + ] = 0 # set '' equivalent id as pad-token-id of persimmon model (no default value is set) + config = FuyuConfig(**config_dict) + + # Load post-processing modules (run on CPU) + tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") + image_processor = FuyuImageProcessor() + processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # Create PyBuda module from PyTorch model + fuyu_model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b", config=config) + + # Prepare inputs + text_prompt = "Generate a coco-style caption. 
" + url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png" + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") + model_inputs = processor(text=text_prompt, images=[image_pil], device="cpu", return_tensor="pt") + + # Retrieve config numbers and logit function + persimmon_config = fuyu_model.language_model.model.config + max_length = persimmon_config.max_position_embeddings + _, emb_seq_length = model_inputs["input_ids"].shape + + # Pad input_ids and image_patches_indices + pad_inputs = True + if pad_inputs: + tmp_padding_token = 71128 # set \n as temporary padding string (does not matter) + target_length = max_length - TILE_DIM + org_length = model_inputs["input_ids"].shape[-1] + model_inputs["input_ids"] = torch.nn.functional.pad( + model_inputs["input_ids"], (0, target_length - org_length), "constant", tmp_padding_token + ) + model_inputs["input_ids"][:, org_length - 1] = tmp_padding_token + model_inputs["input_ids"][:, -1] = 71122 + model_inputs["image_patches_indices"] = torch.nn.functional.pad( + model_inputs["image_patches_indices"], + (0, target_length + 10 - model_inputs["image_patches_indices"].shape[-1]), + "constant", + -1, + ) + + # Generate input embedding for the 1st iteration + inputs_embeds = generate_fuyu_embedding( + fuyu_model, model_inputs["input_ids"], model_inputs["image_patches"][0], model_inputs["image_patches_indices"] + ) + inputs_embeds = inputs_embeds.clone().detach() + + # Obtain logit function + logits_processor = fuyu_model._get_logits_processor( + fuyu_model.generation_config, TILE_DIM, inputs_embeds, None, LogitsProcessorList() + ) + + # Prepare compile-inputs for img-decoder + attention_mask = torch.zeros((1, max_length)) + attention_mask[0, :emb_seq_length] = 1 + img_attention_mask = torch.zeros((1, max_length - TILE_DIM), dtype=torch.bool) + img_attention_mask[0, :emb_seq_length] = 1 + img_attention_mask = _prepare_4d_causal_attention_mask( + img_attention_mask, (1, max_length - TILE_DIM), inputs_embeds, 0 + ) + img_decoder_inputs = [inputs_embeds, img_attention_mask] + + # Prepare compile-inputs for txt-decoder + input_ids = torch.zeros((1, TILE_DIM), dtype=torch.int) # 0 (corresponds to '') + inputs_embeds_dummy = torch.zeros((1, TILE_DIM, 4096)) # 4096 is hidden-state dim + position_ids = torch.arange(TILE_DIM, dtype=torch.int).reshape(1, TILE_DIM) + align_up_tile(emb_seq_length) + first_current_index = max_length - TILE_DIM + past_cache_self_shape = ( + 1, + persimmon_config.num_attention_heads, + max_length - TILE_DIM, + persimmon_config.hidden_size // persimmon_config.num_attention_heads, + ) + txt_decoder_inputs = [inputs_embeds_dummy, attention_mask, position_ids.long()] + for _ in range(len(fuyu_model.language_model.model.layers)): + txt_decoder_inputs += [ + torch.zeros(past_cache_self_shape), + torch.zeros(past_cache_self_shape), + ] + + # Instantiate modules + img_decoder = pybuda.PyTorchModule( + "pt_fuyu8b_past_cache_img", FuyuModelImgDecoderWrapper(fuyu_model) + ) # feed inputs_embeds + txt_decoder = pybuda.PyTorchModule( + "pt_fuyu8b_past_cache_txt", FuyuModelTxtDecoderWrapper(fuyu_model) + ) # feed inputs_embeds + + # Place modules + tt0 = pybuda.TTDevice("tt0", module=[img_decoder, txt_decoder]) + + output_q = pybuda.initialize_pipeline(training=False, sample_inputs=((img_decoder_inputs), (txt_decoder_inputs))) + + generated_tokens = [] + current_token_index = align_up_tile(emb_seq_length) + tokens_to_generate = 7 + for idx in range(tokens_to_generate): + if idx == 0: + tt0.set_active_subgraph(0) + 
tt0.push_to_inputs([inputs_embeds, img_attention_mask]) + pybuda.run_generate(input_count=1, write_index=0) + ans = output_q.get() + tt0.set_active_subgraph(1) + else: + tt0.push_to_inputs([inputs_embeds, attention_mask, position_ids]) + pybuda.run_generate( + input_count=1, + write_index=current_token_index // TILE_DIM, + ) + ans = output_q.get() + + hidden_states = ans[0].value().detach() + lm_head = fuyu_model.language_model.lm_head(hidden_states.float()).detach() + _input_ids = torch.cat([torch.tensor([[1]]), input_ids[:, : current_token_index % TILE_DIM]], dim=-1) + if idx == 0: + tokens_scores = logits_processor(_input_ids, lm_head[:, current_token_index - 1, :]) + else: + tokens_scores = logits_processor(_input_ids, lm_head[:, (current_token_index - 1) % TILE_DIM, :]) + next_token = torch.argmax(tokens_scores, dim=-1).item() + generated_tokens.append(next_token) + + current_token_index += 1 + if current_token_index % TILE_DIM == 0: + attention_mask[0, :current_token_index] = 1 + attention_mask[0, first_current_index:] = 0 + position_ids = position_ids + TILE_DIM + input_ids[0, :] = 0 + + input_ids[0, (current_token_index - 1) % TILE_DIM] = next_token + attention_mask[0, first_current_index + ((current_token_index - 1) % TILE_DIM)] = 1 + inputs_embeds = fuyu_model.language_model.model.embed_tokens(input_ids).detach() + + # Post-process + print("generated-tokens = ", generated_tokens) + generated_text = processor.batch_decode(torch.tensor([generated_tokens]), skip_special_tokens=True) + print("generated-text = ", generated_text) + + +if __name__ == "__main__": + run_fuyu8b_past_cache() diff --git a/model_demos/nlp_demos/opt/pytorch_opt_causal_lm.py b/model_demos/nlp_demos/opt/pytorch_opt_causal_lm.py index 621ae6f6..42a9317b 100644 --- a/model_demos/nlp_demos/opt/pytorch_opt_causal_lm.py +++ b/model_demos/nlp_demos/opt/pytorch_opt_causal_lm.py @@ -23,6 +23,8 @@ def run_opt_casual_lm(variant="facebook/opt-350m"): # Disable expanding output buffer of fork nodes - causes out of memory issue in blobgen. 
os.environ["PYBUDA_FORK_JOIN_EXPAND_FORK_OUTPUT_BUF"] = "0" + if variant == "facebook/opt-350m": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" # Set model configurations config = OPTConfig.from_pretrained(model_ckpt) diff --git a/model_demos/nlp_demos/t5/pytorch_t5_generation.py b/model_demos/nlp_demos/t5/pytorch_t5_generation.py index 8ac77a40..ba8aac04 100644 --- a/model_demos/nlp_demos/t5/pytorch_t5_generation.py +++ b/model_demos/nlp_demos/t5/pytorch_t5_generation.py @@ -3,11 +3,13 @@ import os import pybuda +from pybuda._C.backend_api import BackendDevice from pybuda.transformers.pipeline import pipeline as pybuda_pipeline from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer def run_t5_pybuda_pipeline(variant="t5-small"): + available_devices = pybuda.detect_available_devices() # Add PyBUDA configurations os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" @@ -26,6 +28,11 @@ def run_t5_pybuda_pipeline(variant="t5-small"): compiler_cfg.enable_auto_fusing = False compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.enable_amp_light() + if "large" in variant: + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" + if available_devices[0] == BackendDevice.Grayskull: + if "base" in variant: + os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Variants: t5-small, t5-base, t5-large model_ckpt = variant diff --git a/model_demos/nlp_demos/xglm/pytorch_xglm_causal_lm.py b/model_demos/nlp_demos/xglm/pytorch_xglm_causal_lm.py index 3c9469b5..5278d61c 100644 --- a/model_demos/nlp_demos/xglm/pytorch_xglm_causal_lm.py +++ b/model_demos/nlp_demos/xglm/pytorch_xglm_causal_lm.py @@ -2,6 +2,7 @@ import os import pybuda +from pybuda import BackendDevice from pybuda.transformers.pipeline import pipeline as pybuda_pipeline from transformers import AutoTokenizer, XGLMConfig, XGLMForCausalLM @@ -17,9 +18,18 @@ def run_xglm_causal_lm(variant="facebook/xglm-564M"): # Variants: "facebook/xglm-564M", "facebook/xglm-1.7B" model_ckpt = variant - if model_ckpt == "facebook/xglm-1.7B": - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" - compiler_cfg.amp_level = 1 + available_devices = pybuda.detect_available_devices() + if available_devices: + if model_ckpt == "facebook/xglm-1.7B": + compiler_cfg.amp_level = 1 + if available_devices[0] == BackendDevice.Grayskull: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{16*1024}" + if (available_devices[0] == BackendDevice.Grayskull and model_ckpt == "facebook/xglm-564M") or ( + available_devices[0] == BackendDevice.Wormhole_B0 + ): + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + if available_devices[0] == BackendDevice.Grayskull and model_ckpt == "facebook/xglm-564M": + compiler_cfg.default_dram_parameters = True # set model configurations config = XGLMConfig.from_pretrained(model_ckpt) diff --git a/model_demos/pyproject.toml b/model_demos/pyproject.toml index f615178f..c682d7eb 100644 --- a/model_demos/pyproject.toml +++ b/model_demos/pyproject.toml @@ -67,4 +67,17 @@ markers = [ "unet: tests that involve U-Net", "falcon: tests that involve Falcon", "stablediffusion: tests that involve Stable Diffusion", + "retinanet: tests that involve RetinaNet", + "beit: tests that involve BeiT", + "fuyu8b: tests that involve Fuyu-8B", + "mlpmixer: tests that involve MLP-Mixer", + "openpose: tests that involve OpenPose", + "vilt: tests that involve ViLT", + "landmark: tests that involve Landmark", + "yolov3: tests that involve YOLOv3", + "efficientnetlite: tests that involve EfficientNet-Lite", + "mobilenetssd: 
tests that involve MobileNet-SSD", + "wideresnet: tests that involve WideResNet", + "xception: tests that involve Xception", + "ghostnet: tests that involve GhostNet", ] diff --git a/model_demos/requirements.txt b/model_demos/requirements.txt index f554d4d6..9c2942a0 100644 --- a/model_demos/requirements.txt +++ b/model_demos/requirements.txt @@ -11,3 +11,4 @@ numba==0.53.1 # For Whisper segmentation-models-pytorch==0.3.3 # For U-Net pylocron==0.2.1 # For U-Net diffusers==0.14.0 # For Stable Diffusion +transformers==4.35.2 # For Fuyu8B diff --git a/model_demos/tests/conftest.py b/model_demos/tests/conftest.py index efbaac7e..affe165b 100644 --- a/model_demos/tests/conftest.py +++ b/model_demos/tests/conftest.py @@ -65,15 +65,9 @@ def archive_files(src_directory=Path("./"), dest_directory=Path("archive")): def pytest_addoption(parser): parser.addoption( - "--silicon-only", - action="store_true", - default=False, - help="run silicon tests only, skip golden/model", + "--silicon-only", action="store_true", default=False, help="run silicon tests only, skip golden/model" ) parser.addoption("--no-silicon", action="store_true", default=False, help="skip silicon tests") parser.addoption( - "--no-skips", - action="store_true", - default=False, - help="ignore pytest.skip() calls, and continue on with test", + "--no-skips", action="store_true", default=False, help="ignore pytest.skip() calls, and continue on with test" ) diff --git a/model_demos/tests/test_onnx_resnet.py b/model_demos/tests/test_onnx_resnet.py new file mode 100644 index 00000000..9cb2fac8 --- /dev/null +++ b/model_demos/tests/test_onnx_resnet.py @@ -0,0 +1,8 @@ +import pytest + +from cv_demos.resnet.onnx_resnet import run_resnet_onnx + + +@pytest.mark.resnet +def test_resnet_onnx(clear_pybuda): + run_resnet_onnx() diff --git a/model_demos/tests/test_onnx_retinanet.py b/model_demos/tests/test_onnx_retinanet.py new file mode 100644 index 00000000..79911b7f --- /dev/null +++ b/model_demos/tests/test_onnx_retinanet.py @@ -0,0 +1,8 @@ +import pytest + +from cv_demos.retinanet.onnx_retinanet_r101 import run_retinanet_r101_640x480_onnx + + +@pytest.mark.retinanet +def test_retinanet_onnx(clear_pybuda): + run_retinanet_r101_640x480_onnx() diff --git a/model_demos/tests/test_pytorch_beit.py b/model_demos/tests/test_pytorch_beit.py new file mode 100644 index 00000000..c8c8e25b --- /dev/null +++ b/model_demos/tests/test_pytorch_beit.py @@ -0,0 +1,11 @@ +import pytest + +from cv_demos.beit.pytorch_beit_classify_16_224_hf import run_beit_classify_224_hf_pytorch + +variants = ["microsoft/beit-base-patch16-224", "microsoft/beit-large-patch16-224"] + + +@pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.beit +def test_beit_classify_224_hf_pytorch(clear_pybuda, variant): + run_beit_classify_224_hf_pytorch(variant) diff --git a/model_demos/tests/test_pytorch_distilbert.py b/model_demos/tests/test_pytorch_distilbert.py index 8b22fe5a..9ec60eb5 100644 --- a/model_demos/tests/test_pytorch_distilbert.py +++ b/model_demos/tests/test_pytorch_distilbert.py @@ -7,11 +7,7 @@ ) from nlp_demos.distilbert.pytorch_distilbert_token_classification import run_distilbert_token_classification_pytorch -variants = [ - "distilbert-base-uncased", - "distilbert-base-cased", - "distilbert-base-multilingual-cased", -] +variants = ["distilbert-base-uncased", "distilbert-base-cased", "distilbert-base-multilingual-cased"] @pytest.mark.parametrize("variant", variants, ids=variants) diff --git a/model_demos/tests/test_pytorch_dpr.py 
b/model_demos/tests/test_pytorch_dpr.py index 1f7e14b9..98079889 100644 --- a/model_demos/tests/test_pytorch_dpr.py +++ b/model_demos/tests/test_pytorch_dpr.py @@ -4,18 +4,9 @@ from nlp_demos.dpr.pytorch_dpr_question_encoder import run_dpr_question_encoder_pytorch from nlp_demos.dpr.pytorch_dpr_reader import run_dpr_reader_pytorch -variants_ctx = [ - "facebook/dpr-ctx_encoder-single-nq-base", - "facebook/dpr-ctx_encoder-multiset-base", -] -variants_qe = [ - "facebook/dpr-question_encoder-single-nq-base", - "facebook/dpr-question_encoder-multiset-base", -] -variants_reader = [ - "facebook/dpr-reader-single-nq-base", - "facebook/dpr-reader-multiset-base", -] +variants_ctx = ["facebook/dpr-ctx_encoder-single-nq-base", "facebook/dpr-ctx_encoder-multiset-base"] +variants_qe = ["facebook/dpr-question_encoder-single-nq-base", "facebook/dpr-question_encoder-multiset-base"] +variants_reader = ["facebook/dpr-reader-single-nq-base", "facebook/dpr-reader-multiset-base"] @pytest.mark.parametrize("variant", variants_ctx, ids=variants_ctx) diff --git a/model_demos/tests/test_pytorch_fuyu8b.py b/model_demos/tests/test_pytorch_fuyu8b.py new file mode 100644 index 00000000..0320fed0 --- /dev/null +++ b/model_demos/tests/test_pytorch_fuyu8b.py @@ -0,0 +1,8 @@ +import pytest + +from nlp_demos.fuyu8b.pytorch_fuyu8b_past_cache import run_fuyu8b_past_cache + + +@pytest.mark.fuyu8b +def test_fuyu8b_past_cache_pytorch(clear_pybuda): + run_fuyu8b_past_cache() diff --git a/model_demos/tests/test_pytorch_ghostnet.py b/model_demos/tests/test_pytorch_ghostnet.py new file mode 100644 index 00000000..aa980e32 --- /dev/null +++ b/model_demos/tests/test_pytorch_ghostnet.py @@ -0,0 +1,8 @@ +import pytest + +from cv_demos.ghostnet.timm_ghostnet import run_ghostnet_timm + + +@pytest.mark.ghostnet +def test_ghostnet_timm_pytorch(clear_pybuda): + run_ghostnet_timm() diff --git a/model_demos/tests/test_pytorch_mlpmixer.py b/model_demos/tests/test_pytorch_mlpmixer.py new file mode 100644 index 00000000..3030eebe --- /dev/null +++ b/model_demos/tests/test_pytorch_mlpmixer.py @@ -0,0 +1,8 @@ +import pytest + +from cv_demos.mlpmixer.timm_mlpmixer import run_mlpmixer_timm + + +@pytest.mark.mlpmixer +def test_mlpmixer_timm(clear_pybuda): + run_mlpmixer_timm() diff --git a/model_demos/tests/test_pytorch_openpose.py b/model_demos/tests/test_pytorch_openpose.py new file mode 100644 index 00000000..d097e014 --- /dev/null +++ b/model_demos/tests/test_pytorch_openpose.py @@ -0,0 +1,14 @@ +import pytest + +from cv_demos.openpose.pytorch_lwopenpose_2d_osmr import run_lwopenpose_2d_osmr_pytorch +from cv_demos.openpose.pytorch_lwopenpose_3d_osmr import run_lwopenpose_3d_osmr_pytorch + + +@pytest.mark.openpose +def test_openpose_2d_osmr(clear_pybuda): + run_lwopenpose_2d_osmr_pytorch() + + +@pytest.mark.openpose +def test_openpose_3d_osmr(clear_pybuda): + run_lwopenpose_3d_osmr_pytorch() diff --git a/model_demos/tests/test_pytorch_vgg.py b/model_demos/tests/test_pytorch_vgg.py index 415e123f..0442fb91 100644 --- a/model_demos/tests/test_pytorch_vgg.py +++ b/model_demos/tests/test_pytorch_vgg.py @@ -5,16 +5,7 @@ from cv_demos.vgg.pytorch_vgg_timm import run_vgg_bn19_timm_pytorch from cv_demos.vgg.pytorch_vgg_torchhub import run_vgg_bn19_torchhub_pytorch -variants1 = [ - "vgg11", - "vgg13", - "vgg16", - "vgg19", - "vgg11_bn", - "vgg13_bn", - "vgg16_bn", - "vgg19_bn", -] +variants1 = ["vgg11", "vgg13", "vgg16", "vgg19", "vgg11_bn", "vgg13_bn", "vgg16_bn", "vgg19_bn"] variants2 = ["vgg11", "vgg13", "vgg16", "vgg19", "bn_vgg19", "bn_vgg19b"] diff 
--git a/model_demos/tests/test_pytorch_vilt.py b/model_demos/tests/test_pytorch_vilt.py new file mode 100644 index 00000000..cddea4f9 --- /dev/null +++ b/model_demos/tests/test_pytorch_vilt.py @@ -0,0 +1,14 @@ +import pytest + +from cv_demos.vilt.pytorch_vilt_maskedlm import run_vilt_maskedlm_pytorch +from cv_demos.vilt.pytorch_vilt_question_answering import run_vilt_for_question_answering_pytorch + + +@pytest.mark.vilt +def test_vilt_for_question_answering_pytorch(clear_pybuda): + run_vilt_for_question_answering_pytorch() + + +@pytest.mark.vilt +def test_vilt_maskedlm_pytorch(clear_pybuda): + run_vilt_maskedlm_pytorch() diff --git a/model_demos/tests/test_pytorch_wideresnet.py b/model_demos/tests/test_pytorch_wideresnet.py new file mode 100644 index 00000000..db3251cd --- /dev/null +++ b/model_demos/tests/test_pytorch_wideresnet.py @@ -0,0 +1,18 @@ +import pytest + +from cv_demos.wideresnet.pytorch_wideresnet_timm import run_wideresnet_timm_pytorch +from cv_demos.wideresnet.pytorch_wideresnet_torchhub import run_wideresnet_torchhub_pytorch + +variants = ["wide_resnet50_2", "wide_resnet101_2"] + + +@pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.wideresnet +def test_wideresnet_torchhub_pytorch(clear_pybuda, variant): + run_wideresnet_torchhub_pytorch(variant) + + +@pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.wideresnet +def test_wideresnet_timm_pytorch(clear_pybuda, variant): + run_wideresnet_timm_pytorch(variant) diff --git a/model_demos/tests/test_pytorch_xception.py b/model_demos/tests/test_pytorch_xception.py new file mode 100644 index 00000000..d4aa3206 --- /dev/null +++ b/model_demos/tests/test_pytorch_xception.py @@ -0,0 +1,11 @@ +import pytest + +from cv_demos.xception.timm_xception import run_xception_timm + +variants = ["xception", "xception41", "xception65", "xception71"] + + +@pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.xception +def test_xception_timm_pytorch(clear_pybuda, variant): + run_xception_timm(variant) diff --git a/model_demos/tests/test_pytorch_yolov3.py b/model_demos/tests/test_pytorch_yolov3.py new file mode 100644 index 00000000..745183c3 --- /dev/null +++ b/model_demos/tests/test_pytorch_yolov3.py @@ -0,0 +1,20 @@ +import pytest + +from cv_demos.yolo_v3.pytorch_yolov3_holli import run_yolov3_holli_pytorch +from cv_demos.yolo_v3.pytorch_yolov3_holli_1x1 import run_yolov3_holli_pytorch_1x1 +from cv_demos.yolo_v3.pytorch_yolov3_tiny_holli import run_yolov3_tiny_holli_pytorch + + +@pytest.mark.yolov3 +def test_yolov3_holli(clear_pybuda): + run_yolov3_holli_pytorch() + + +@pytest.mark.yolov3 +def test_yolov3_holli_tiny(clear_pybuda): + run_yolov3_tiny_holli_pytorch() + + +@pytest.mark.yolov3 +def test_yolov3_holli_1x1(clear_pybuda): + run_yolov3_holli_pytorch_1x1() diff --git a/model_demos/tests/test_tflite_efficientnet_lite.py b/model_demos/tests/test_tflite_efficientnet_lite.py new file mode 100644 index 00000000..64cb61d0 --- /dev/null +++ b/model_demos/tests/test_tflite_efficientnet_lite.py @@ -0,0 +1,14 @@ +import pytest + +from cv_demos.efficientnet_lite.tflite_efficientnet_lite0_1x1 import run_efficientnet_lite0_1x1 +from cv_demos.efficientnet_lite.tflite_efficientnet_lite4_1x1 import run_efficientnet_lite4_1x1 + + +@pytest.mark.efficientnetlite +def test_efficientnet_lite0_1x1(clear_pybuda): + run_efficientnet_lite0_1x1() + + +@pytest.mark.efficientnetlite +def test_efficientnet_lite4_1x1(clear_pybuda): + run_efficientnet_lite4_1x1() diff --git 
a/model_demos/tests/test_tflite_landmark.py b/model_demos/tests/test_tflite_landmark.py new file mode 100644 index 00000000..b3f14632 --- /dev/null +++ b/model_demos/tests/test_tflite_landmark.py @@ -0,0 +1,20 @@ +import pytest + +from cv_demos.landmark.hand_landmark_lite_1x1 import run_hand_landmark_lite_1x1 +from cv_demos.landmark.palm_detection_lite_1x1 import run_palm_detection_lite_1x1 +from cv_demos.landmark.pose_landmark_lite_1x1 import run_pose_landmark_lite_1x1 + + +@pytest.mark.landmark +def test_hand_landmark_lite_1x1(): + run_hand_landmark_lite_1x1() + + +@pytest.mark.landmark +def test_palm_detection_lite_1x1(): + run_palm_detection_lite_1x1() + + +@pytest.mark.landmark +def test_pose_landmark_lite_1x1(): + run_pose_landmark_lite_1x1() diff --git a/model_demos/tests/test_tflite_mobilenet_ssd.py b/model_demos/tests/test_tflite_mobilenet_ssd.py new file mode 100644 index 00000000..baf99e5b --- /dev/null +++ b/model_demos/tests/test_tflite_mobilenet_ssd.py @@ -0,0 +1,8 @@ +import pytest + +from cv_demos.mobilenet_ssd.tflite_mobilenet_v2_ssd_1x1 import run_mobilenetv2_ssd_1x1_tflite + + +@pytest.mark.mobilenetssd +def test_mobilenetv2_ssd_1x1_tflite(clear_pybuda): + run_mobilenetv2_ssd_1x1_tflite()
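
Note on the Fuyu-8B past-cache hunk earlier in this patch: the generation loop steers pybuda.run_generate with tile-aligned indices, where write_index selects which tile-wide slice of the past cache is written and (current_token_index - 1) % TILE_DIM is the slot inside the current tile that receives the new token. The sketch below reproduces only that index arithmetic so it can be sanity-checked in isolation; TILE_DIM = 32, the align_up_tile definition, and the example prompt length are assumptions, not taken verbatim from the patch.

    TILE_DIM = 32  # assumed pybuda tile width


    def align_up_tile(n: int, tile_dim: int = TILE_DIM) -> int:
        # Assumed helper: round n up to the next multiple of the tile width.
        return ((n + tile_dim - 1) // tile_dim) * tile_dim


    def decode_step_indices(emb_seq_length: int, steps: int):
        # Mirrors the bookkeeping in run_fuyu8b_past_cache(): for each generated token,
        # yield the past-cache bank passed as write_index to run_generate, the slot in
        # input_ids that receives the new token, and whether the step just filled a tile
        # (the point where attention_mask and position_ids are rolled forward).
        # For the very first step the demo instead runs the image-decoder subgraph
        # with write_index=0; this sketch shows only the text-decoder arithmetic.
        current_token_index = align_up_tile(emb_seq_length)
        for _ in range(steps):
            write_index = current_token_index // TILE_DIM
            current_token_index += 1
            new_tile = current_token_index % TILE_DIM == 0
            slot_in_tile = (current_token_index - 1) % TILE_DIM
            yield write_index, slot_in_tile, new_tile


    if __name__ == "__main__":
        # Example only: a prompt that packs into 11 tiles (334 -> 352), generating
        # 7 tokens as in the demo.
        for step in decode_step_indices(emb_seq_length=334, steps=7):
            print(step)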
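
The markers registered in pyproject.toml and the options kept in conftest.py are what the new test modules above hook into; as a minimal usage sketch, a marker can be used to select one demo family through pytest's public API. The marker names (ghostnet, wideresnet, xception, yolov3, ...), the --no-silicon option, and the tests/ directory come from this patch; invoking this from the model_demos directory is an assumption.

    import sys

    import pytest

    if __name__ == "__main__":
        # Equivalent to `pytest -m ghostnet --no-silicon tests` run from model_demos:
        # select only the GhostNet demo tests and skip silicon tests, per the
        # --no-silicon option registered in conftest.py.
        marker = sys.argv[1] if len(sys.argv) > 1 else "ghostnet"
        sys.exit(pytest.main(["-m", marker, "--no-silicon", "tests"]))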