diff --git a/src/SSCMA_Micro_Core.cpp b/src/SSCMA_Micro_Core.cpp
index 64a9318..a809840 100644
--- a/src/SSCMA_Micro_Core.cpp
+++ b/src/SSCMA_Micro_Core.cpp
@@ -440,9 +440,14 @@ SSCMAMicroCore::Expected SSCMAMicroCore::invoke(const Frame& frame, const Invoke
             }
             auto results = algorithm->getResults();
             if (_config.invoke_config && _config.invoke_config->top_k > 0) {
-                std::sort(results.begin(), results.end(), [](const ma_keypoint3f_t& a, const ma_keypoint3f_t& b) { return a.box.score > b.box.score; });
-                results.resize(std::min(results.size(), static_cast<size_t>(_config.invoke_config->top_k)));
-                results.shrink_to_fit();
+                results.sort([](const ma_keypoint3f_t& a, const ma_keypoint3f_t& b) { return a.box.score > b.box.score; });
+                // Keep only the top_k best entries: the list has no size(), so walk to the last kept node and erase what follows.
+                auto remaining = static_cast<size_t>(_config.invoke_config->top_k);
+                auto tail      = results.before_begin();
+                for (auto it = results.begin(); it != results.end() && remaining > 0; ++it, --remaining) {
+                    tail = it;
+                }
+                results.erase_after(tail, results.end());
             }
             std::vector<SSCMAMicroCore::Keypoints> keypoints;
             for (const auto& result : results) {
diff --git a/src/components/sscma-micro/sscma/core/cv/ma_cv.h b/src/components/sscma-micro/sscma/core/cv/ma_cv.h
index 93ee012..316797a 100644
--- a/src/components/sscma-micro/sscma/core/cv/ma_cv.h
+++ b/src/components/sscma-micro/sscma/core/cv/ma_cv.h
@@ -13,7 +13,7 @@ ma_err_t convert(const ma_img_t* src, ma_img_t* dst);
 
 #if MA_USE_LIB_JPEGENC
 ma_err_t rgb_to_jpeg(const ma_img_t* src, ma_img_t* dst);
-#endif 
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp b/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp
index 89dbddb..3a2bbb3 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp
@@ -48,13 +48,29 @@ ma_err_t EngineHalio::run() {
         return MA_FAILED;
     }
 
-    auto job = _configured_model->run_async(*_bindings, [](const AsyncInferCompletionInfo& info) {});
+    auto job = _configured_model->run_async(*_bindings, [&](const AsyncInferCompletionInfo& info) { sta = info.status; });
+    if (!job) {
+        return MA_FAILED;
+    }
 
+    // Keep polling only while the wait itself times out; any other status is treated as a hard failure.
+    hailo_status wait_sta = HAILO_SUCCESS;
     do {
         this_thread::yield();
-    } while (job->wait(1000ms) != HAILO_SUCCESS);
+        wait_sta = job->wait(50ms);
+    } while (wait_sta == HAILO_TIMEOUT);
+    if (wait_sta != HAILO_SUCCESS) {
+        return MA_FAILED;
+    }
 
-    return MA_OK;
+    switch (sta) {
+        case HAILO_SUCCESS:
+            return MA_OK;
+        case HAILO_TIMEOUT:
+            return MA_ETIMEOUT;
+        default:
+            return MA_FAILED;
+    }
 }
 
 #if MA_USE_FILESYSTEM
@@ -121,11 +128,16 @@ ma_err_t EngineHalio::load(const string& model_path) {
 
     {
 
-        auto create_internal_bindings = [&](const string& name, const InferModel::InferStream& tsr, shared_ptr<ma_tensor_t>& tensor) {
+        auto create_internal_bindings =
+            [&](const string& name, const InferModel::InferStream& tsr, shared_ptr<ma_tensor_t>& tensor, hailort::ConfiguredInferModel::Bindings::InferStream* cis, bool is_input) -> ma_err_t {
             auto shape  = tsr.shape();
             auto size   = tsr.get_frame_size();
             auto format = tsr.format();
 
+            if (!cis) {
+                return MA_FAILED;
+            }
+
             void* buffer = aligned_alloc(4096, size);
             if (!buffer) {
                 return MA_ENOMEM;
@@ -145,27 +157,30 @@ ma_err_t EngineHalio::load(const string& model_path) {
                 return MA_ENOMEM;
             }
 
+            cis->set_buffer(MemoryView(buffer, size));
+
             tensor->data.data = buffer;
             tensor->size      = size;
 
-            tensor->shape.size = 3;
+            tensor->shape.size    = 4;
+            tensor->shape.dims[0] = 1;
             switch (format.order) {
                 case HAILO_FORMAT_ORDER_NCHW:
-                    tensor->shape.dims[0] = shape.features;
-                    tensor->shape.dims[1] = shape.height;
-                    tensor->shape.dims[2] = shape.width;
+                    tensor->shape.dims[1] = shape.features;
+                    tensor->shape.dims[2] = shape.height;
+                    tensor->shape.dims[3] = shape.width;
                     break;
                 case HAILO_FORMAT_ORDER_NHWC:
                 case HAILO_FORMAT_ORDER_FCR:
                 case HAILO_FORMAT_ORDER_HAILO_NMS:
-                    tensor->shape.dims[0] = shape.height;
-                    tensor->shape.dims[1] = shape.width;
-                    tensor->shape.dims[2] = shape.features;
+                    tensor->shape.dims[1] = shape.height;
+                    tensor->shape.dims[2] = shape.width;
+                    tensor->shape.dims[3] = shape.features;
                     break;
                 case HAILO_FORMAT_ORDER_NHCW:
-                    tensor->shape.dims[0] = shape.height;
-                    tensor->shape.dims[1] = shape.features;
-                    tensor->shape.dims[2] = shape.width;
+                    tensor->shape.dims[1] = shape.height;
+                    tensor->shape.dims[2] = shape.features;
+                    tensor->shape.dims[3] = shape.width;
                     break;
                 default:
                     break;
@@ -192,44 +207,76 @@ ma_err_t EngineHalio::load(const string& model_path) {
                     break;
                 case HAILO_FORMAT_TYPE_FLOAT32:
                     tensor->type = MA_TENSOR_TYPE_F32;
-                    if (format.order == HAILO_FORMAT_ORDER_HAILO_NMS) {
+                    break;
+                default:
+                    tensor->type = MA_TENSOR_TYPE_NONE;
+                    break;
+            }
+
+            if (format.order == HAILO_FORMAT_ORDER_HAILO_NMS) {
+                switch (format.type) {
+                    case HAILO_FORMAT_TYPE_UINT16:
+                        tensor->type = MA_TENSOR_TYPE_NMS_BBOX_U16;
+                        break;
+                    case HAILO_FORMAT_TYPE_FLOAT32:
                         tensor->type = MA_TENSOR_TYPE_NMS_BBOX_F32;
+                        break;
+                    default:
+                        tensor->type = MA_TENSOR_TYPE_NONE;
+                        break;
+                }
 
-                        function<ma_err_t(int, void*, size_t)> f = [this_ptr = this, name](int flag, void* data, size_t size) -> ma_err_t {
-                            if (!data || sizeof(float) != size) {
+                auto fp = make_shared<ExternalHandler>([this_ptr = this, name, is_input](int flag, void* data, size_t size) -> ma_err_t {
+                    if (!data) {
+                        return MA_EINVAL;
+                    }
+                    auto tsr = is_input ? this_ptr->_model->input(name) : this_ptr->_model->output(name);
+                    if (!tsr) {
+                        return MA_FAILED;
+                    }
+                    switch (flag) {
+                        case 0:  // get score threshold
+                            return MA_ENOTSUP;
+                        case 1:  // set score threshold
+                        {
+                            if (sizeof(float) != size) {
+                                return MA_EINVAL;
+                            }
+                            float threshold = *static_cast<float*>(data);
+                            tsr->set_nms_score_threshold(threshold);
+                            return MA_OK;
+                        }
+                        case 2:  // get iou threshold
+                            return MA_ENOTSUP;
+                        case 3:  // set iou threshold
+                        {
+                            if (sizeof(float) != size) {
                                 return MA_EINVAL;
                             }
                             float threshold = *static_cast<float*>(data);
-                            auto tsr        = this_ptr->_model->input(name);
-                            if (!tsr) {
+                            tsr->set_nms_iou_threshold(threshold);
+                            return MA_OK;
+                        }
+                        case 4:  // get nms shape
+                        {
+                            auto nms_shape = tsr->get_nms_shape();
+                            if (!nms_shape) {
                                 return MA_FAILED;
                             }
-                            switch (flag) {
-                                case 0:  // get score threshold
-                                    return MA_ENOTSUP;
-                                case 1:  // set score threshold
-                                    tsr->set_nms_score_threshold(threshold);
-                                    return MA_OK;
-                                case 2:  // get iou threshold
-                                    return MA_ENOTSUP;
-                                case 3:  // set iou threshold
-                                    tsr->set_nms_iou_threshold(threshold);
-                                    return MA_OK;
-                                default:
-                                    return MA_ENOTSUP;
+                            auto shape = nms_shape.value();
+                            if (sizeof(hailo_nms_shape_t) != size) {
+                                return MA_EINVAL;
                             }
-                        };
-
-                        _external_handlers[name] = f;
-                        if (!_external_handlers[name]) {
-                            break;
+                            *static_cast<hailo_nms_shape_t*>(data) = shape;
+                            return MA_OK;
                         }
-                        tensor->external_handler = reinterpret_cast<void*>(&_external_handlers[name]);
+                        default:
+                            return MA_ENOTSUP;
                     }
-                    break;
-                default:
-                    tensor->type = MA_TENSOR_TYPE_NONE;
-                    break;
+                });
+
+                _external_handlers[name] = fp;
+                tensor->external_handler = reinterpret_cast<void*>(fp.get());
             }
 
             _io_buffers[name] = tensor;
@@ -243,14 +290,15 @@ ma_err_t EngineHalio::load(const string& model_path) {
             if (_io_buffers.find(name) != _io_buffers.end()) {
                 continue;
             }
-
             shared_ptr<ma_tensor_t> tensor = nullptr;
-
-            auto ret = create_internal_bindings(name, tsr, tensor);
+            auto bindings_input            = _bindings->input(name);
+            if (!bindings_input) {
+                return MA_FAILED;
+            }
+            auto ret = create_internal_bindings(name, tsr, tensor, &bindings_input.value(), true);
             if (ret != MA_OK) {
                 return ret;
             }
-
             _input_tensors.push_back(tensor);
         }
 
@@ -260,19 +308,19 @@ ma_err_t EngineHalio::load(const string& model_path) {
             if (_io_buffers.find(name) != _io_buffers.end()) {
                 continue;
             }
-
             shared_ptr<ma_tensor_t> tensor = nullptr;
-
-            auto ret = create_internal_bindings(name, tsr, tensor);
+            auto bindings_output           = _bindings->output(name);
+            if (!bindings_output) {
+                return MA_FAILED;
+            }
+            auto ret = create_internal_bindings(name, tsr, tensor, &bindings_output.value(), false);
             if (ret != MA_OK) {
                 return ret;
             }
-
             _output_tensors.push_back(tensor);
         }
     }
 
-
     return MA_OK;
 }
 
@@ -348,7 +396,7 @@ ma_quant_param_t EngineHalio::getOutputQuantParam(int32_t index) {
 
 
 ma_err_t EngineHalio::setInput(int32_t index, const ma_tensor_t& tensor) {
-    return MA_ENOTSUP;  
+    return MA_ENOTSUP;
 }
 
 }  // namespace ma::engine
diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h b/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h
index ad7e16e..2175767 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h
@@ -23,6 +23,8 @@ using namespace hailort;
 
 class EngineHalio final : public Engine {
 public:
+    using ExternalHandler = function<ma_err_t(int, void*, size_t)>;
+
     EngineHalio();
     ~EngineHalio() override;
 
@@ -56,7 +58,7 @@ class EngineHalio final : public Engine {
     shared_ptr<ConfiguredInferModel::Bindings> _bindings;
 
     unordered_map<string, shared_ptr<ma_tensor_t>> _io_buffers;
-    unordered_map<string, function<ma_err_t(int, void*, size_t)>> _external_handlers;
+    unordered_map<string, shared_ptr<ExternalHandler>> _external_handlers;
 
     vector<shared_ptr<ma_tensor_t>> _input_tensors;
     vector<shared_ptr<ma_tensor_t>> _output_tensors;
@@ -66,4 +68,4 @@ class EngineHalio final : public Engine {
 
 #endif
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/ma_types.h b/src/components/sscma-micro/sscma/core/ma_types.h
index b062d6f..984c65e 100644
--- a/src/components/sscma-micro/sscma/core/ma_types.h
+++ b/src/components/sscma-micro/sscma/core/ma_types.h
@@ -67,7 +67,8 @@ typedef enum {
     MA_TENSOR_TYPE_STR  = 12,
     MA_TENSOR_TYPE_BOOL = 13,
     MA_TENSOR_TYPE_BF16 = 14,
-    MA_TENSOR_TYPE_NMS_BBOX_F32 = 15,
+    MA_TENSOR_TYPE_NMS_BBOX_U16 = 15,
+    MA_TENSOR_TYPE_NMS_BBOX_F32 = 16,
 } ma_tensor_type_t;
 
 typedef struct {
@@ -212,6 +213,17 @@ struct ma_keypoint4f_t {
     ma_bbox_t box;
     std::vector<ma_pt4f_t> pts;
 };
+
+struct ma_segm2f_t {
+    ma_bbox_t box;
+    struct {
+        uint16_t width;
+        uint16_t height;
+        std::vector<uint8_t> data;
+    } mask;
+};
+
+
 #endif
 
 typedef enum {
@@ -239,6 +251,26 @@ typedef enum {
 
 typedef enum { MA_MSG_TYPE_RESP = 0, MA_MSG_TYPE_EVT = 1, MA_MSG_TYPE_LOG = 2, MA_MSG_TYPE_REQ = 3, MA_MSG_TYPE_HB = 4 } ma_msg_type_t;
 
+#define MA_INPUT_TYPE_MASK  0xF000
+#define MA_OUTPUT_TYPE_MASK 0x0F00
+#define MA_MODEL_TYPE_MASK  0x00FF
+
+typedef enum {
+    MA_INPUT_TYPE_TENSOR = 0x0000,
+    MA_INPUT_TYPE_IMAGE  = 0x1000,
+    MA_INPUT_TYPE_AUDIO  = 0x2000,
+} ma_input_type_t;
+
+typedef enum {
+    MA_OUTPUT_TYPE_TENSOR       = 0x0000,
+    MA_OUTPUT_TYPE_CLASS        = 0x0100,
+    MA_OUTPUT_TYPE_POINT        = 0x0200,
+    MA_OUTPUT_TYPE_BBOX         = 0x0300,
+    MA_OUTPUT_TYPE_KEYPOINT     = 0x0400,
+    MA_OUTPUT_TYPE_SEGMENTATION = 0x0500,
+} ma_output_type_t;
+
+
 typedef enum {
     MA_MODEL_TYPE_UNDEFINED   = 0u,
     MA_MODEL_TYPE_FOMO        = 1u,
@@ -249,7 +281,9 @@ typedef enum {
     MA_MODEL_TYPE_YOLOV8      = 6u,
     MA_MODEL_TYPE_NVIDIA_DET  = 7u,
     MA_MODEL_TYPE_YOLO_WORLD  = 8u,
-    MA_MODEL_TYPE_YOLO11     = 9u,
+    MA_MODEL_TYPE_YOLO11      = 9u,
+    MA_MODEL_TYPE_YOLO11_POSE = 10u,
+    MA_MODEL_TYPE_YOLO11_SEG  = 11u,
 } ma_model_type_t;
 
 typedef struct {
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_base.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_base.cpp
index be60c24..9257d46 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_base.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_base.cpp
@@ -4,15 +4,11 @@ namespace ma {
 
 constexpr char TAG[] = "ma::model";
 
-Model::Model(Engine* engine, const char* name, ma_model_type_t type)
-    
-      
-      // Initialize performance metrics to 0 using initializer list
-{
-
-    p_engine_ = engine;
-    p_name_   = name;
-    m_type_   = type;
+Model::Model(Engine* engine, const char* name, uint16_t type) {
+
+    p_engine_   = engine;
+    p_name_     = name;
+    m_type_     = type;
     p_user_ctx_ = nullptr;
 
     p_preprocess_done_     = nullptr;
@@ -75,7 +71,14 @@ const char* Model::getName() const {
 }
 
 ma_model_type_t Model::getType() const {
-    return m_type_;
+    return static_cast<ma_model_type_t>(m_type_ & MA_MODEL_TYPE_MASK);
+}
+
+ma_input_type_t Model::getInputType() const {
+    return static_cast<ma_input_type_t>(m_type_ & MA_INPUT_TYPE_MASK);
+}
+ma_output_type_t Model::getOutputType() const {
+    return static_cast<ma_output_type_t>(m_type_ & MA_OUTPUT_TYPE_MASK);
 }
 
 void Model::setPreprocessDone(std::function<void(void*)> func) {
@@ -94,11 +97,4 @@ void Model::setUserCtx(void* ctx) {
     p_user_ctx_ = ctx;
 }
 
-class ModelFactory {
-public:
-    static Model* create(Engine* engine);
-    static ma_err_t remove(const std::string& name);
-};
-
-
-}  // namespace ma::model
\ No newline at end of file
+}  // namespace ma
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_base.h b/src/components/sscma-micro/sscma/core/model/ma_model_base.h
index 258ca93..a7604c8 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_base.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_base.h
@@ -2,8 +2,8 @@
 #define _MA_MODEL_BASE_H_
 
 #include <cstdarg>
-#include <string>
 #include <functional>
+#include <string>
 
 #include "../engine/ma_engine.h"
 #include "../ma_common.h"
@@ -12,33 +12,36 @@ namespace ma {
 
 using namespace ma::engine;
 class Model {
-   private:
+private:
     ma_perf_t perf_;
-    std::function<void(void*)> p_preprocess_done_     ;
-    std::function<void(void*)> p_postprocess_done_    ;
-    std::function<void(void*)> p_underlying_run_done_ ;
-    void*           p_user_ctx_;
-    ma_model_type_t m_type_;
+    std::function<void(void*)> p_preprocess_done_;
+    std::function<void(void*)> p_postprocess_done_;
+    std::function<void(void*)> p_underlying_run_done_;
+    void* p_user_ctx_;
+    uint16_t m_type_;
 
-   protected:
-    Engine*          p_engine_;
-    const char*      p_name_;
+protected:
+    Engine* p_engine_;
+    const char* p_name_;
     virtual ma_err_t preprocess()  = 0;
     virtual ma_err_t postprocess() = 0;
-    ma_err_t         underlyingRun();
+    ma_err_t underlyingRun();
 
-   public:
-    Model(Engine* engine, const char* name, ma_model_type_t type);
+public:
+    Model(Engine* engine, const char* name, uint16_t type);
     virtual ~Model();
-    const ma_perf_t  getPerf() const;
-    const char*      getName() const;
-    ma_model_type_t  getType() const;
+    const ma_perf_t getPerf() const;
+    const char* getName() const;
+    ma_model_type_t getType() const;
+    ma_input_type_t getInputType() const;
+    ma_output_type_t getOutputType() const;
+    virtual const void* getInput()                          = 0;
     virtual ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) = 0;
     virtual ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) = 0;
-    void             setPreprocessDone  (std::function<void(void*)> func);
-    void             setPostprocessDone (std::function<void(void*)> func);
-    void             setRunDone         (std::function<void(void*)> func);
-    void             setUserCtx(void* ctx);
+    void setPreprocessDone(std::function<void(void*)> func);
+    void setPostprocessDone(std::function<void(void*)> func);
+    void setRunDone(std::function<void(void*)> func);
+    void setUserCtx(void* ctx);
 };
 }  // namespace ma
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
index 8cbae3f..9b51db1 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
@@ -6,7 +6,7 @@ namespace ma::model {
 
 constexpr char TAG[] = "ma::model::classifier";
 
-Classifier::Classifier(Engine* p_engine) : Model(p_engine, "IMCLS", MA_MODEL_TYPE_IMCLS) {
+Classifier::Classifier(Engine* p_engine) : Model(p_engine, "IMCLS", MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_CLASS | MA_MODEL_TYPE_IMCLS) {
     input_           = p_engine_->getInput(0);
     output_          = p_engine_->getOutput(0);
     threshold_score_ = 0.5f;
@@ -33,38 +33,36 @@ Classifier::~Classifier() {}
 
 bool Classifier::isValid(Engine* engine) {
 
-    const auto& input_shape = engine->getInputShape(0);
-    auto is_nhwc{input_shape.dims[3] == 3 || input_shape.dims[3] == 1};
-
-    if (is_nhwc) {
-        if (input_shape.size != 4 ||      // N, H, W, C
-            input_shape.dims[0] != 1 ||   // N = 1
-            input_shape.dims[1] < 16 ||   // H >= 16
-            input_shape.dims[2] < 16 ||   // W >= 16
-            (input_shape.dims[3] != 3 &&  // C = RGB or Gray
-             input_shape.dims[3] != 1))
-            return false;
-    } else {
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
 
-        if (input_shape.size != 4 ||      // N, C, H, W
-            input_shape.dims[0] != 1 ||   // N = 1
-            input_shape.dims[2] < 16 ||   // H >= 16
-            input_shape.dims[3] < 16 ||   // W >= 16
-            (input_shape.dims[1] != 3 &&  // C = RGB or Gray
-             input_shape.dims[1] != 1))
-            return false;
+    if (inputs_count != 1 || outputs_count != 1) {
+        return false;
     }
 
-
+    const auto& input_shape = engine->getInputShape(0);
     const auto& output_shape{engine->getOutputShape(0)};
 
-    if (output_shape.size != 2 ||     // N, C
-        output_shape.dims[0] != 1 ||  // N = 1
+    // The dims below are only meaningful for a rank-4 input; the rank is verified in the combined check further down.
+    int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2], c = input_shape.dims[3];
+    bool is_nhwc = c == 3 || c == 1;
+    if (!is_nhwc)
+        std::swap(h, c);  // last dim is not a channel count: take it from dims[1] instead
+
+    if (input_shape.size != 4 || n != 1 || h < 32 || h % 32 != 0 || (c != 3 && c != 1))
+        return false;
+
+
+    if (output_shape.dims[0] != 1 ||  // N = 1
         output_shape.dims[1] < 2      // C >= 2
     ) {
         return false;
     }
 
+    if (output_shape.size >= 3) {
+        return false;
+    }
+
     return true;
 }
 
@@ -107,6 +105,16 @@ ma_err_t Classifier::postprocess() {
             if (score > threshold_score_)
                 results_.emplace_front(ma_class_t{score, i});
         }
+    } else if (output_.type == MA_TENSOR_TYPE_F32) {
+        // Float output: each element is compared against the score threshold as-is.
+        auto* data = output_.data.f32;
+        auto pred_l{output_.shape.dims[1]};
+        for (decltype(pred_l) i{0}; i < pred_l; ++i) {
+            auto score{data[i]};
+            if (score > threshold_score_)
+                results_.emplace_front(ma_class_t{score, i});
+        }
+
     } else {
         return MA_ENOTSUP;
     }
@@ -121,7 +129,7 @@ const std::forward_list<ma_class_t>& Classifier::getResults() {
     return results_;
 }
 
-const ma_img_t* Classifier::getInputImg() {
+const void* Classifier::getInput() {
     return &img_;
 }
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.h b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.h
index 163af9f..703206d 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.h
@@ -28,7 +28,7 @@ class Classifier : public Model {
     virtual ~Classifier();
     static bool isValid(Engine* engine);
     const std::forward_list<ma_class_t>& getResults();
-    const ma_img_t* getInputImg();
+    const void* getInput() override;
     ma_err_t run(const ma_img_t* img);
     ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
     ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
index 3f40d35..52a0b53 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
@@ -9,7 +9,7 @@ namespace ma::model {
 constexpr char TAG[] = "ma::model::detecor";
 
 Detector::Detector(Engine* p_engine, const char* name, ma_model_type_t type)
-    : Model(p_engine, name, type),
+    : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_BBOX | type),
       input_(p_engine->getInput(0)),  // Use direct method call instead of p_engine_->
       threshold_nms_(0.45),
       threshold_score_(0.25) {
@@ -58,10 +58,10 @@ const std::forward_list<ma_bbox_t>& Detector::getResults() {
     return results_;
 }
 
-
-const ma_img_t* Detector::getInputImg() {
-    return &img_;
+const void* Detector::getInput() {
+    return static_cast<const void*>(&img_);
 }
+
 ma_err_t Detector::run(const ma_img_t* img) {
     // MA_ASSERT(img != nullptr);
     input_img_ = img;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_detector.h b/src/components/sscma-micro/sscma/core/model/ma_model_detector.h
index 784216b..04f02c7 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_detector.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_detector.h
@@ -26,7 +26,7 @@ class Detector : public Model {
     Detector(Engine* engine, const char* name, ma_model_type_t type);
     virtual ~Detector();
     const std::forward_list<ma_bbox_t>& getResults();
-    const ma_img_t* getInputImg();
+    const void* getInput() override;
     ma_err_t run(const ma_img_t* img);
     ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
     ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
index 3d02d89..551d9f3 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
@@ -55,6 +55,14 @@ Model* ModelFactory::create(Engine* engine, size_t algorithm_id) {
         if (Yolo11::isValid(engine)) {
             return new Yolo11(engine);
         }
+    case MA_MODEL_TYPE_YOLO11_POSE:
+        if (Yolo11Pose::isValid(engine)) {
+            return new Yolo11Pose(engine);
+        }
+    case MA_MODEL_TYPE_YOLO11_SEG:
+        if (Yolo11Seg::isValid(engine)) {
+            return new Yolo11Seg(engine);
+        }
     }
 
     return nullptr;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
index cf32a3f..c462610 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
@@ -7,17 +7,19 @@
 
 #include "ma_model_classifier.h"
 #include "ma_model_detector.h"
-#include "ma_model_pose_detector.h"
 #include "ma_model_point_detector.h"
+#include "ma_model_pose_detector.h"
 
-#include "ma_model_yolov5.h"
-#include "ma_model_yolov8.h"
-#include "ma_model_yolov8_pose.h"
-#include "ma_model_nvidia_det.h"
 #include "ma_model_fomo.h"
+#include "ma_model_nvidia_det.h"
 #include "ma_model_pfld.h"
-#include "ma_model_yolo_world.h"
 #include "ma_model_yolo11.h"
+#include "ma_model_yolo11_pose.h"
+#include "ma_model_yolo11_seg.h"
+#include "ma_model_yolo_world.h"
+#include "ma_model_yolov5.h"
+#include "ma_model_yolov8.h"
+#include "ma_model_yolov8_pose.h"
 
 namespace ma {
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
index dec3c28..f4c78c4 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
@@ -6,7 +6,7 @@ namespace ma::model {
 
 constexpr char TAG[] = "ma::model::point_detecor";
 
-PointDetector::PointDetector(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, type) {
+PointDetector::PointDetector(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_POINT | type) {
     input_           = p_engine_->getInput(0);
     threshold_score_ = 0.25;
 
@@ -30,7 +30,9 @@ PointDetector::PointDetector(Engine* p_engine, const char* name, ma_model_type_t
 
 PointDetector::~PointDetector() {}
 
-const std::vector<ma_point_t>& PointDetector::getResults() const { return results_; }
+const std::vector<ma_point_t>& PointDetector::getResults() const {
+    return results_;
+}
 
 ma_err_t PointDetector::preprocess() {
     ma_err_t ret = MA_OK;
@@ -56,19 +58,23 @@ ma_err_t PointDetector::run(const ma_img_t* img) {
     return underlyingRun();
 }
 
+const void* PointDetector::getInput() {
+    return static_cast<const void*>(&img_);
+}
+
 ma_err_t PointDetector::setConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
-    va_list  args;
+    va_list args;
     va_start(args, opt);
     switch (opt) {
-    case MA_MODEL_CFG_OPT_THRESHOLD:
-        threshold_score_ = va_arg(args, double);
-        ret              = MA_OK;
-        break;
-
-    default:
-        ret = MA_EINVAL;
-        break;
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            threshold_score_ = va_arg(args, double);
+            ret              = MA_OK;
+            break;
+
+        default:
+            ret = MA_EINVAL;
+            break;
     }
     va_end(args);
     return ret;
@@ -76,18 +82,18 @@ ma_err_t PointDetector::setConfig(ma_model_cfg_opt_t opt, ...) {
 
 ma_err_t PointDetector::getConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
-    va_list  args;
-    void*    p_arg = nullptr;
+    va_list args;
+    void* p_arg = nullptr;
     va_start(args, opt);
     switch (opt) {
-    case MA_MODEL_CFG_OPT_THRESHOLD:
-        p_arg                          = va_arg(args, void*);
-        *(static_cast<double*>(p_arg)) = threshold_score_;
-        break;
-
-    default:
-        ret = MA_EINVAL;
-        break;
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            p_arg                          = va_arg(args, void*);
+            *(static_cast<double*>(p_arg)) = threshold_score_;
+            break;
+
+        default:
+            ret = MA_EINVAL;
+            break;
     }
     va_end(args);
     return ret;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
index 9796c61..80761b6 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
@@ -8,9 +8,9 @@
 namespace ma::model {
 
 class PointDetector : public Model {
-   protected:
-    ma_tensor_t     input_;
-    ma_img_t        img_;
+protected:
+    ma_tensor_t input_;
+    ma_img_t img_;
     const ma_img_t* input_img_;
 
     float threshold_score_;
@@ -19,10 +19,10 @@ class PointDetector : public Model {
 
     std::vector<ma_point_t> results_;
 
-   protected:
+protected:
     ma_err_t preprocess() override;
 
-   public:
+public:
     PointDetector(Engine* engine, const char* name, ma_model_type_t type);
     virtual ~PointDetector();
 
@@ -30,6 +30,8 @@ class PointDetector : public Model {
 
     ma_err_t run(const ma_img_t* img);
 
+    const void* getInput() override;
+
     ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
 
     ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
index f1cde82..f0efde5 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
@@ -6,7 +6,7 @@ namespace ma::model {
 
 constexpr char TAG[] = "ma::model::pose_detecor";
 
-PoseDetector::PoseDetector(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, type) {
+PoseDetector::PoseDetector(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_KEYPOINT | type) {
     input_           = p_engine_->getInput(0);
     threshold_nms_   = 0.45;
     threshold_score_ = 0.25;
@@ -31,7 +31,13 @@ PoseDetector::PoseDetector(Engine* p_engine, const char* name, ma_model_type_t t
 
 PoseDetector::~PoseDetector() {}
 
-const std::vector<ma_keypoint3f_t>& PoseDetector::getResults() const { return results_; }
+const std::forward_list<ma_keypoint3f_t>& PoseDetector::getResults() const {
+    return results_;  // const reference to the member list; nothing is copied here
+}
+
+const void* PoseDetector::getInput() {
+    return static_cast<const void*>(&img_);  // type-erased address of the img_ member
+}
 
 ma_err_t PoseDetector::preprocess() {
     ma_err_t ret = MA_OK;
@@ -59,20 +65,20 @@ ma_err_t PoseDetector::run(const ma_img_t* img) {
 
 ma_err_t PoseDetector::setConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
-    va_list  args;
+    va_list args;
     va_start(args, opt);
     switch (opt) {
-    case MA_MODEL_CFG_OPT_THRESHOLD:
-        threshold_score_ = va_arg(args, double);
-        ret              = MA_OK;
-        break;
-    case MA_MODEL_CFG_OPT_NMS:
-        threshold_nms_ = va_arg(args, double);
-        ret            = MA_OK;
-        break;
-    default:
-        ret = MA_EINVAL;
-        break;
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            threshold_score_ = va_arg(args, double);
+            ret              = MA_OK;
+            break;
+        case MA_MODEL_CFG_OPT_NMS:
+            threshold_nms_ = va_arg(args, double);
+            ret            = MA_OK;
+            break;
+        default:
+            ret = MA_EINVAL;
+            break;
     }
     va_end(args);
     return ret;
@@ -80,21 +86,21 @@ ma_err_t PoseDetector::setConfig(ma_model_cfg_opt_t opt, ...) {
 
 ma_err_t PoseDetector::getConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
-    va_list  args;
-    void*    p_arg = nullptr;
+    va_list args;
+    void* p_arg = nullptr;
     va_start(args, opt);
     switch (opt) {
-    case MA_MODEL_CFG_OPT_THRESHOLD:
-        p_arg                          = va_arg(args, void*);
-        *(static_cast<double*>(p_arg)) = threshold_score_;
-        break;
-    case MA_MODEL_CFG_OPT_NMS:
-        p_arg                          = va_arg(args, void*);
-        *(static_cast<double*>(p_arg)) = threshold_nms_;
-        break;
-    default:
-        ret = MA_EINVAL;
-        break;
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            p_arg                          = va_arg(args, void*);
+            *(static_cast<double*>(p_arg)) = threshold_score_;
+            break;
+        case MA_MODEL_CFG_OPT_NMS:
+            p_arg                          = va_arg(args, void*);
+            *(static_cast<double*>(p_arg)) = threshold_nms_;
+            break;
+        default:
+            ret = MA_EINVAL;
+            break;
     }
     va_end(args);
     return ret;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.h b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.h
index fcdb274..38ab2d9 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.h
@@ -8,9 +8,9 @@
 namespace ma::model {
 
 class PoseDetector : public Model {
-   protected:
-    ma_tensor_t     input_;
-    ma_img_t        img_;
+protected:
+    ma_tensor_t input_;
+    ma_img_t img_;
     const ma_img_t* input_img_;
 
     float threshold_nms_;
@@ -18,21 +18,23 @@ class PoseDetector : public Model {
 
     bool is_nhwc_;
 
-    std::vector<ma_keypoint3f_t> results_;
+    std::forward_list<ma_keypoint3f_t> results_;
 
-   protected:
+protected:
     ma_err_t preprocess() override;
 
-   public:
+public:
     PoseDetector(Engine* engine, const char* name, ma_model_type_t type);
     virtual ~PoseDetector();
 
-    const std::vector<ma_keypoint3f_t>& getResults() const;
+    const std::forward_list<ma_keypoint3f_t>& getResults() const;
 
     ma_err_t run(const ma_img_t* img);
 
+    const void* getInput() override;
+
     ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
-    
+
     ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
 };
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp
new file mode 100644
index 0000000..e6c54ef
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp
@@ -0,0 +1,108 @@
+#include "ma_model_segmenter.h"
+
+#include "../cv/ma_cv.h"
+
+namespace ma::model {
+
+constexpr char TAG[] = "ma::model::segmenter";
+
+Segmenter::Segmenter(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_SEGMENTATION | type) {
+    input_           = p_engine_->getInput(0);
+    threshold_nms_   = 0.45;
+    threshold_score_ = 0.25;
+    // Layout heuristic: a last dimension of 3 or 1 is taken to be the channel axis (NHWC).
+    is_nhwc_ = input_.shape.dims[3] == 3 || input_.shape.dims[3] == 1;
+
+    if (is_nhwc_) {
+        img_.height = input_.shape.dims[1];
+        img_.width  = input_.shape.dims[2];
+        img_.size   = input_.shape.dims[1] * input_.shape.dims[2] * input_.shape.dims[3];
+        img_.format = input_.shape.dims[3] == 3 ? MA_PIXEL_FORMAT_RGB888 : MA_PIXEL_FORMAT_GRAYSCALE;
+        // Otherwise dims[1] is read as the channel count and dims[2]/dims[3] as height/width.
+    } else {
+        img_.height = input_.shape.dims[2];
+        img_.width  = input_.shape.dims[3];
+        img_.size   = input_.shape.dims[3] * input_.shape.dims[2] * input_.shape.dims[1];
+        img_.format = input_.shape.dims[1] == 3 ? MA_PIXEL_FORMAT_RGB888 : MA_PIXEL_FORMAT_GRAYSCALE;
+    }
+    // img_.data points at the input tensor's buffer; no separate image buffer is allocated here.
+    img_.data = input_.data.u8;
+}
+
+Segmenter::~Segmenter() {}  // empty body: no explicit cleanup is performed here
+ma_err_t Segmenter::preprocess() {
+    ma_err_t ret = MA_OK;
+    // Convert the frame stored by run() (input_img_) into img_; a failure is returned as-is.
+    ret = ma::cv::convert(input_img_, &img_);
+    if (ret != MA_OK) {
+        return ret;
+    }
+    if (input_.type == MA_TENSOR_TYPE_S8) {
+        for (int i = 0; i < input_.size; i++) {
+            input_.data.u8[i] -= 128;  // subtract 128 from every byte when the tensor type is S8
+        }
+    }
+
+    return ret;
+}
+
+const void* Segmenter::getInput() {
+    return static_cast<const void*>(&img_);  // type-erased address of the img_ member
+}
+
+const std::forward_list<ma_segm2f_t>& Segmenter::getResults() const {
+    return results_;  // const reference to the member list; nothing is copied here
+}
+
+ma_err_t Segmenter::run(const ma_img_t* img) {
+    MA_ASSERT(img != nullptr);
+    // Only the pointer is stored; preprocess() reads it later through input_img_.
+    input_img_ = img;
+
+    return underlyingRun();
+}
+
+ma_err_t Segmenter::setConfig(ma_model_cfg_opt_t opt, ...) {
+    ma_err_t ret = MA_OK;
+    va_list args;
+    va_start(args, opt);
+    switch (opt) {
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            threshold_score_ = va_arg(args, double);  // the variadic value is read as a double
+            ret              = MA_OK;
+            break;
+        case MA_MODEL_CFG_OPT_NMS:
+            threshold_nms_ = va_arg(args, double);  // the variadic value is read as a double
+            ret            = MA_OK;
+            break;
+        default:
+            ret = MA_EINVAL;  // any other option is rejected
+            break;
+    }
+    va_end(args);
+    return ret;
+}
+
+ma_err_t Segmenter::getConfig(ma_model_cfg_opt_t opt, ...) {
+    ma_err_t ret = MA_OK;
+    va_list args;
+    void* p_arg = nullptr;
+    va_start(args, opt);
+    switch (opt) {
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            p_arg                          = va_arg(args, void*);
+            *(static_cast<double*>(p_arg)) = threshold_score_;  // written through a double*, so the caller must pass one
+            break;
+        case MA_MODEL_CFG_OPT_NMS:
+            p_arg                          = va_arg(args, void*);
+            *(static_cast<double*>(p_arg)) = threshold_nms_;  // written through a double*, so the caller must pass one
+            break;
+        default:
+            ret = MA_EINVAL;  // any other option is rejected
+            break;
+    }
+    va_end(args);
+    return ret;
+}
+
+}  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h b/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h
new file mode 100644
index 0000000..258b340
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h
@@ -0,0 +1,43 @@
+#ifndef _MA_MODEL_SEGMENTER_H_
+#define _MA_MODEL_SEGMENTER_H_
+
+#include <vector>
+
+#include "ma_model_base.h"
+
+namespace ma::model {
+
+class Segmenter : public Model {
+protected:
+    ma_tensor_t input_;  // engine input tensor 0, fetched in the constructor
+    ma_img_t img_;  // image descriptor whose data pointer is bound to input_ in the constructor
+    const ma_img_t* input_img_;  // frame pointer stored by run() and read by preprocess()
+
+    float threshold_nms_;  // changed through MA_MODEL_CFG_OPT_NMS
+    float threshold_score_;  // changed through MA_MODEL_CFG_OPT_THRESHOLD
+
+    bool is_nhwc_;  // true when the input tensor's last dimension is 1 or 3
+
+    std::forward_list<ma_segm2f_t> results_;  // exposed read-only through getResults()
+
+protected:
+    ma_err_t preprocess() override;
+
+public:
+    Segmenter(Engine* engine, const char* name, ma_model_type_t type);
+    virtual ~Segmenter();
+
+    const std::forward_list<ma_segm2f_t>& getResults() const;
+
+    ma_err_t run(const ma_img_t* img);  // img must be non-null (asserted)
+
+    const void* getInput() override;
+
+    ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
+
+    ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
+};
+
+}  // namespace ma::model
+
+#endif  // _MA_MODEL_SEGMENTER_H_
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.cpp
new file mode 100644
index 0000000..e004525
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.cpp
@@ -0,0 +1,197 @@
+#include "ma_model_yolo11_pose.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "../math/ma_math.h"
+#include "../utils/ma_anchors.h"
+#include "../utils/ma_nms.h"
+
+namespace ma::model {
+
+Yolo11Pose::Yolo11Pose(Engine* p_engine_) : PoseDetector(p_engine_, "yolo11_pose", MA_MODEL_TYPE_YOLO11_POSE) {
+    MA_ASSERT(p_engine_ != nullptr);
+    // Single output tensor; dims[1] is the per-record element count, dims[2] the record count.
+    outputs_ = p_engine_->getOutput(0);
+
+    num_class_     = 1;  // only one class supported
+    num_record_    = outputs_.shape.dims[2];
+    num_keypoints_ = (outputs_.shape.dims[1] - 5) / 3;  // 5 = 4 box values + 1 score; 3 values per keypoint
+    num_element_   = outputs_.shape.dims[1];
+}
+
+Yolo11Pose::~Yolo11Pose() {}  // empty body: no explicit cleanup is performed here
+
+bool Yolo11Pose::isValid(Engine* engine) {
+    // Shape-only check: one 4-D input and one output whose dims match the expected box/keypoint layout.
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
+
+    if (inputs_count != 1 || outputs_count != 1) {
+        return false;
+    }
+    const auto& input_shape  = engine->getInputShape(0);
+    const auto& output_shape = engine->getOutputShape(0);
+
+    // Validate input shape
+    if (input_shape.size != 4)
+        return false;
+
+    int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2], c = input_shape.dims[3];
+    bool is_nhwc = c == 3 || c == 1;
+
+    if (!is_nhwc)
+        std::swap(h, c);
+    // After the swap, c holds dims[1] and h holds dims[3]; w is left as dims[2].
+
+    if (n != 1 || h < 32 || h % 32 != 0 || (c != 3 && c != 1))
+        return false;
+
+    // Calculate expected output size based on input
+    int s = w >> 5, m = w >> 4, l = w >> 3;
+    int ibox_len = (s * s + m * m + l * l);
+
+    // Validate output shape
+    if (output_shape.size != 3 && output_shape.size != 4)
+        return false;
+
+    if (output_shape.dims[0] != 1 || output_shape.dims[2] != ibox_len || output_shape.dims[1] < 6)
+        return false;
+    // The element count minus 5 must split evenly into 3-value keypoints.
+    if ((output_shape.dims[1] - 5) % 3 != 0)
+        return false;
+
+    return true;
+}
+
+ma_err_t Yolo11Pose::postprocess() {
+    results_.clear();  // drop the previous results before decoding
+    if (outputs_.type == MA_TENSOR_TYPE_F32) {
+        return postProcessF32();
+    } else if (outputs_.type == MA_TENSOR_TYPE_S8) {
+        return postProcessI8();
+    }
+    return MA_ENOTSUP;  // any other output tensor type is rejected
+}
+
+ma_err_t Yolo11Pose::postProcessI8() {
+    // Decode the int8 output: dequantize, threshold, run NMS, then gather keypoints for each kept box.
+    const auto scale      = outputs_.quant_param.scale;
+    const auto zero_point = outputs_.quant_param.zero_point;
+    std::forward_list<ma_bbox_ext_t> multi_level_bboxes;
+
+    auto* data = outputs_.data.s8;  // S8 tensor: index int8 elements, not the f32 view of the buffer
+    for (decltype(num_record_) i = 0; i < num_record_; ++i) {
+        // Dequantize before thresholding so the score is compared in the same space as postProcessF32().
+        const float score = ma::math::dequantizeValue(data[i + num_record_ * 4], scale, zero_point);
+        if (score <= threshold_score_)
+            continue;
+
+        float x = ma::math::dequantizeValue(data[i], scale, zero_point);
+        float y = ma::math::dequantizeValue(data[i + num_record_], scale, zero_point);
+        float w = ma::math::dequantizeValue(data[i + num_record_ * 2], scale, zero_point);
+        float h = ma::math::dequantizeValue(data[i + num_record_ * 3], scale, zero_point);
+
+        ma_bbox_ext_t bbox;
+        bbox.level  = 0;
+        bbox.index  = i;
+        bbox.x      = x / img_.width;
+        bbox.y      = y / img_.height;
+        bbox.w      = w / img_.width;
+        bbox.h      = h / img_.height;
+        bbox.score  = score;
+        bbox.target = 0;
+
+        multi_level_bboxes.emplace_front(std::move(bbox));
+    }
+
+    ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
+
+    if (multi_level_bboxes.empty()) {
+        return MA_OK;
+    }
+
+    std::vector<ma_pt3f_t> n_keypoint(num_keypoints_);
+
+    for (auto& bbox : multi_level_bboxes) {
+        // Keypoint k of record r starts at element 5 + 3 * k (x, y, z), each num_record_ apart.
+        for (int i = 0; i < num_keypoints_; ++i) {
+            auto index      = bbox.index + num_record_ * (5 + i * 3);
+            n_keypoint[i].x = ma::math::dequantizeValue(data[index], scale, zero_point) / img_.width;
+            n_keypoint[i].y = ma::math::dequantizeValue(data[index + num_record_], scale, zero_point) / img_.height;
+            n_keypoint[i].z = ma::math::dequantizeValue(data[index + num_record_ * 2], scale, zero_point);
+        }
+
+        ma_keypoint3f_t keypoint;
+        keypoint.box = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
+        keypoint.pts = n_keypoint;
+
+
+        results_.emplace_front(std::move(keypoint));
+    }
+
+    return MA_OK;
+}
+ma_err_t Yolo11Pose::postProcessF32() {
+    // Decode the float output: threshold, run NMS, then gather keypoints for each kept box.
+    std::forward_list<ma_bbox_ext_t> multi_level_bboxes;
+
+    auto* data = outputs_.data.f32;
+    for (decltype(num_record_) i = 0; i < num_record_; ++i) {
+        auto score = data[i + num_record_ * 4];  // element k of record i is at data[i + num_record_ * k]
+
+        if (score <= threshold_score_)
+            continue;
+
+        float x = data[i];
+        float y = data[i + num_record_];
+        float w = data[i + num_record_ * 2];
+        float h = data[i + num_record_ * 3];
+
+        ma_bbox_ext_t bbox;
+        bbox.level  = 0;
+        bbox.index  = i;  // kept so the keypoints of this record can be looked up after NMS
+        bbox.x      = x / img_.width;
+        bbox.y      = y / img_.height;
+        bbox.w      = w / img_.width;
+        bbox.h      = h / img_.height;
+        bbox.score  = score;
+        bbox.target = 0;
+
+        multi_level_bboxes.emplace_front(std::move(bbox));
+    }
+
+    ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
+
+    if (multi_level_bboxes.empty()) {
+        return MA_OK;
+    }
+
+    std::vector<ma_pt3f_t> n_keypoint(num_keypoints_);
+
+    for (auto& bbox : multi_level_bboxes) {
+        // Keypoint k of record r starts at element 5 + 3 * k (x, y, z), each num_record_ apart.
+        for (int i = 0; i < num_keypoints_; ++i) {
+            auto index      = bbox.index + num_record_ * (5 + i * 3);
+            n_keypoint[i].x = data[index] / img_.width;
+            n_keypoint[i].y = data[index + num_record_] / img_.height;
+            n_keypoint[i].z = data[index + num_record_ * 2];
+        }
+
+        ma_keypoint3f_t keypoint;
+        keypoint.box = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
+        keypoint.pts = n_keypoint;  // copy: n_keypoint is reused for the next box
+
+
+        results_.emplace_front(std::move(keypoint));
+    }
+
+    return MA_OK;
+}
+
+}  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.h
new file mode 100644
index 0000000..288b34a
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_pose.h
@@ -0,0 +1,36 @@
+#ifndef _MA_MODEL_YOLO11_POSE_H_
+#define _MA_MODEL_YOLO11_POSE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "ma_model_pose_detector.h"
+
+namespace ma::model {
+
+class Yolo11Pose : public PoseDetector {
+private:
+    ma_tensor_t outputs_;  // engine output tensor 0
+    int32_t num_record_;  // outputs_.shape.dims[2]
+    int32_t num_element_;  // outputs_.shape.dims[1]
+    int32_t num_class_;  // fixed to 1 by the constructor
+    int32_t num_keypoints_;  // (dims[1] - 5) / 3
+
+protected:
+    ma_err_t postprocess() override;  // dispatches on the output tensor type
+
+    ma_err_t postProcessI8();
+    ma_err_t postProcessF32();
+
+public:
+    Yolo11Pose(Engine* engine);
+    ~Yolo11Pose();
+
+    static bool isValid(Engine* engine);  // shape-based compatibility check
+};
+
+}  // namespace ma::model
+
+#endif  // _MA_MODEL_YOLO11_POSE_H_
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp
new file mode 100644
index 0000000..a9ccfe8
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp
@@ -0,0 +1,174 @@
+#include "ma_model_yolo11_seg.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "../math/ma_math.h"
+#include "../utils/ma_nms.h"
+
+constexpr char TAG[] = "ma::model::yolo11_seg";
+
+namespace ma::model {
+
+Yolo11Seg::Yolo11Seg(Engine* p_engine_) : Segmenter(p_engine_, "yolo11_seg", MA_MODEL_TYPE_YOLO11_SEG) {
+    MA_ASSERT(p_engine_ != nullptr);
+    // Output 0 holds the box records, output 1 the mask prototypes.
+    bboxes_ = p_engine_->getOutput(0);
+    protos_ = p_engine_->getOutput(1);
+
+    num_class_  = bboxes_.shape.dims[1] - 36;  // 36 = 4 box values + 32 mask coefficients
+    num_record_ = bboxes_.shape.dims[2];
+}
+
+Yolo11Seg::~Yolo11Seg() {}  // empty body: no explicit cleanup is performed here
+
+bool Yolo11Seg::isValid(Engine* engine) {
+    // Shape-only check: one 4-D input, a box output and a 32-channel mask-prototype output.
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
+
+    if (inputs_count != 1 || outputs_count != 2) {
+        return false;
+    }
+    const auto& input_shape  = engine->getInputShape(0);
+    const auto& output_shape = engine->getOutputShape(0);
+    const auto& mask_shape   = engine->getOutputShape(1);
+
+    // Validate input shape
+    if (input_shape.size != 4) {
+        return false;
+    }
+
+    int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2], c = input_shape.dims[3];
+    bool is_nhwc = c == 3 || c == 1;
+
+    if (!is_nhwc)
+        std::swap(h, c);
+    // After the swap, c holds dims[1] and h holds dims[3]; w is left as dims[2].
+
+    if (n != 1 || h < 32 || h % 32 != 0 || (c != 3 && c != 1)) {
+        return false;
+    }
+
+    // Calculate expected output size based on input
+    int s = w >> 5, m = w >> 4, l = w >> 3;
+    int ibox_len = (s * s + m * m + l * l);
+
+    // Validate output shape
+    if ((output_shape.size != 3 && output_shape.size != 4) || mask_shape.size != 4) {
+        return false;
+    }
+    // At least 37 elements per record: 4 box values + 32 mask coefficients + one class or more.
+    if (output_shape.dims[0] != 1 || output_shape.dims[2] != ibox_len || output_shape.dims[1] < 37) {
+        return false;
+    }
+    // The prototype tensor must be 1 x 32 x (w / 4) x (w / 4), i.e. square.
+    if (mask_shape.dims[0] != 1 || mask_shape.dims[1] != 32 || mask_shape.dims[2] != w >> 2 || mask_shape.dims[3] != w >> 2) {
+        return false;
+    }
+
+    return true;
+}
+
+ma_err_t Yolo11Seg::postprocess() {
+    results_.clear();  // drop the previous results before decoding
+    if (bboxes_.type == MA_TENSOR_TYPE_F32) {
+        return postProcessF32();
+    }
+    return MA_ENOTSUP;  // only float box tensors are handled
+}
+
+ma_err_t Yolo11Seg::postProcessF32() {
+    // Pick the best class per record, run NMS, then build a one-bit-per-pixel mask for each kept box.
+    std::forward_list<ma_bbox_ext_t> multi_level_bboxes;
+    auto* data = bboxes_.data.f32;
+    for (decltype(num_record_) i = 0; i < num_record_; ++i) {
+
+        float max  = threshold_score_;  // a class must reach the threshold to be selected
+        int target = -1;
+
+        for (int c = 0; c < num_class_; c++) {
+            float score = data[i + num_record_ * (4 + c)];
+            if (score < max) [[likely]] {
+                continue;
+            }
+            max    = score;
+            target = c;
+        }
+
+        if (target < 0)
+            continue;
+
+        float x = data[i];
+        float y = data[i + num_record_];
+        float w = data[i + num_record_ * 2];
+        float h = data[i + num_record_ * 3];
+
+
+        ma_bbox_ext_t bbox;
+        bbox.level  = 0;
+        bbox.index  = i;  // kept so the mask coefficients of this record can be looked up after NMS
+        bbox.x      = x / img_.width;
+        bbox.y      = y / img_.height;
+        bbox.w      = w / img_.width;
+        bbox.h      = h / img_.height;
+        bbox.score  = max;
+        bbox.target = target;
+
+        multi_level_bboxes.emplace_front(std::move(bbox));
+    }
+
+    ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
+
+    if (multi_level_bboxes.empty())
+        return MA_OK;
+
+    // fetch mask
+    for (auto& bbox : multi_level_bboxes) {
+        ma_segm2f_t seg;
+        seg.box         = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
+        seg.mask.width  = protos_.shape.dims[2];
+        seg.mask.height = protos_.shape.dims[3];
+        seg.mask.data.resize(protos_.shape.dims[2] * protos_.shape.dims[3] / 8, 0);  // bitwise
+
+        const int mask_size = protos_.shape.dims[2] * protos_.shape.dims[3];
+
+        std::vector<float> masks(mask_size, 0.0f);
+
+        // TODO: parallel for
+        for (int j = 0; j < protos_.shape.dims[1]; ++j) {
+            float mask_in = bboxes_.data.f32[bbox.index + num_record_ * (4 + num_class_ + j)];
+            for (int i = 0; i < mask_size; ++i) {
+                masks[i] += mask_in * protos_.data.f32[j * mask_size + i];  // weighted sum of prototype j
+            }
+        }
+        // NOTE(review): x is scaled by dims[2] and y by dims[3]; this only lines up for square prototypes, which isValid() requires.
+        int x1 = (bbox.x - bbox.w / 2) * protos_.shape.dims[2];
+        int y1 = (bbox.y - bbox.h / 2) * protos_.shape.dims[3];
+        int x2 = (bbox.x + bbox.w / 2) * protos_.shape.dims[2];
+        int y2 = (bbox.y + bbox.h / 2) * protos_.shape.dims[3];
+
+        for (int i = 0; i < protos_.shape.dims[2]; i++) {
+            for (int j = 0; j < protos_.shape.dims[3]; j++) {
+                if (i < y1 || i >= y2 || j < x1 || j >= x2) [[likely]] {
+                    continue;  // pixels outside the box stay 0
+                }
+                if (masks[i * protos_.shape.dims[3] + j] > 0.5) {  // NOTE(review): raw sum compared with 0.5, no sigmoid applied — confirm intended
+                    seg.mask.data[i * protos_.shape.dims[3] / 8 + j / 8] |= (1 << (j % 8));
+                }
+            }
+        }
+
+        results_.emplace_front(std::move(seg));
+    }
+
+
+    return MA_OK;
+}
+
+}  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h
new file mode 100644
index 0000000..f7025e3
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h
@@ -0,0 +1,34 @@
+#ifndef _MA_MODEL_YOLO11_SEG_H_
+#define _MA_MODEL_YOLO11_SEG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "ma_model_segmenter.h"
+
+namespace ma::model {
+
+class Yolo11Seg : public Segmenter {
+private:
+    ma_tensor_t bboxes_;  // engine output tensor 0 (box records)
+    ma_tensor_t protos_;  // engine output tensor 1 (mask prototypes)
+    int32_t num_record_;  // bboxes_.shape.dims[2]
+    int32_t num_class_;  // bboxes_.shape.dims[1] - 36
+
+protected:
+    ma_err_t postprocess() override;
+    // Decoder for float box tensors; the only tensor type postprocess() dispatches to.
+    ma_err_t postProcessF32();
+
+public:
+    Yolo11Seg(Engine* engine);
+    ~Yolo11Seg();
+
+    static bool isValid(Engine* engine);  // shape-based compatibility check
+};
+
+}  // namespace ma::model
+
+#endif  // _MA_MODEL_YOLO11_SEG_H_
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
index 7472726..07619f2 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
@@ -1,6 +1,7 @@
 #include <algorithm>
 #include <forward_list>
-#include <vector>
+#include <vector> 
+#include <utility>
 
 #include "../utils/ma_nms.h"
 
@@ -22,8 +23,7 @@ YoloV5::YoloV5(Engine* p_engine_) : Detector(p_engine_, "yolov5", MA_MODEL_TYPE_
 
 YoloV5::~YoloV5() {}
 
-bool YoloV5::isValid(Engine* engine) {
-
+static bool generalValid(Engine* engine) {
     const auto inputs_count = engine->getInputSize();
     const auto outputs_count = engine->getOutputSize();
 
@@ -62,9 +62,54 @@ bool YoloV5::isValid(Engine* engine) {
     return true;
 }
 
-ma_err_t YoloV5::postprocess() {
-    results_.clear();
+static bool nmsValid(Engine* engine) {
+#if MA_USE_ENGINE_HALIO
+    if (engine->getInputSize() != 1 || engine->getOutputSize() != 1)
+        return false;
+
+    auto input = engine->getInput(0);
+    auto output = engine->getOutput(0);
+
+    if (input.shape.size != 4 || output.shape.size != 4)
+        return false;
+
+    auto n = input.shape.dims[0];
+    auto h = input.shape.dims[1];
+    auto w = input.shape.dims[2];  // read but not used in the checks below
+    auto c = input.shape.dims[3];
+
+    if (n != 1 || h < 32 || h % 32 != 0 || (c != 3 && c != 1))
+        return false;
+
+    auto b  = output.shape.dims[0];
+    auto cs = output.shape.dims[1];
+    auto mb = output.shape.dims[2];
+    auto f  = output.shape.dims[3];
+
+    if (b != 1 || cs <= 0 || mb <= 1 || f != 0)  // NOTE(review): requires dims[3] == 0 — confirm against the engine's NMS output shape
+        return false;
 
+    return true;
+#else
+    return false;  // built without the Halio engine: NMS outputs are never accepted
+#endif
+}
+
+bool YoloV5::isValid(Engine* engine) {
+    if (!engine || engine->getOutputSize() != 1)
+        return false;
+    auto output = engine->getOutput(0);
+    // NMS-typed outputs go through nmsValid(); every other type goes through generalValid().
+    switch (output.type) {
+        case MA_TENSOR_TYPE_NMS_BBOX_U16:
+        case MA_TENSOR_TYPE_NMS_BBOX_F32:
+            return nmsValid(engine);
+        default:
+            return generalValid(engine);
+    }
+}
+
+ma_err_t YoloV5::generalPostProcess() {
     if (output_.type == MA_TENSOR_TYPE_S8) {
         auto* data      = output_.data.s8;
         auto scale      = output_.quant_param.scale;
@@ -161,4 +206,155 @@ ma_err_t YoloV5::postprocess() {
 
     return MA_OK;
 }
+
+ma_err_t YoloV5::nmsPostProcess() {
+#if MA_USE_ENGINE_HALIO
+    // Walks the output buffer as, per class, a box count followed by that many box records.
+    auto& output = output_;
+
+    if (output.shape.size < 4) {
+        return MA_FAILED;
+    }
+
+    size_t w = output.shape.dims[1];
+    size_t h = output.shape.dims[2];
+    size_t c = output.shape.dims[3];  // assigned here and below, but not read afterwards
+
+    hailo_nms_shape_t nms_shape;
+    if (output.external_handler) {
+        auto rc = (*reinterpret_cast<ma::engine::EngineHalio::ExternalHandler*>(output.external_handler))(4, &nms_shape, sizeof(hailo_nms_shape_t));
+        if (rc == MA_OK) {
+            w = nms_shape.number_of_classes;
+            h = nms_shape.max_bboxes_per_class;
+            c = nms_shape.max_accumulated_mask_size;
+        }
+    }
+
+    switch (output.type) {
+        case MA_TENSOR_TYPE_NMS_BBOX_U16: {
+            using T = uint16_t;
+            using P = hailo_bbox_t;
+
+            const auto zp    = output.quant_param.zero_point;
+            const auto scale = output.quant_param.scale;
+
+            auto ptr = output.data.u8;
+            for (size_t i = 0; i < w; ++i) {
+                auto bc = *reinterpret_cast<T*>(ptr);  // box count for class i
+                ptr += sizeof(T);
+
+                if (bc <= 0) {
+                    continue;
+                } else if (bc > h) {
+                    break;  // a count above the per-class maximum stops decoding
+                }
+
+                for (size_t j = 0; j < static_cast<size_t>(bc); ++j) {
+                    auto bbox = *reinterpret_cast<P*>(ptr);
+                    ptr += sizeof(P);
+
+                    ma_bbox_t res;
+                    // Dequantize the corners, then convert them to centre/size form.
+                    auto x_min = static_cast<float>(bbox.x_min - zp) * scale;
+                    auto y_min = static_cast<float>(bbox.y_min - zp) * scale;
+                    auto x_max = static_cast<float>(bbox.x_max - zp) * scale;
+                    auto y_max = static_cast<float>(bbox.y_max - zp) * scale;
+                    res.w      = x_max - x_min;
+                    res.h      = y_max - y_min;
+                    res.x      = x_min + res.w * 0.5;
+                    res.y      = y_min + res.h * 0.5;
+                    res.score  = static_cast<float>(bbox.score - zp) * scale;
+                    // The class index is the position in the outer per-class loop.
+                    res.target = static_cast<int>(i);
+
+                    res.x = MA_CLIP(res.x, 0, 1.0f);
+                    res.y = MA_CLIP(res.y, 0, 1.0f);
+                    res.w = MA_CLIP(res.w, 0, 1.0f);
+                    res.h = MA_CLIP(res.h, 0, 1.0f);
+
+                    results_.emplace_front(res);
+                }
+            }
+        } break;
+            // Same traversal, but the count and the box fields are read as float32.
+        case MA_TENSOR_TYPE_NMS_BBOX_F32: {
+            using T = float32_t;
+            using P = hailo_bbox_float32_t;
+
+            auto ptr = output.data.u8;
+            for (size_t i = 0; i < w; ++i) {
+                auto bc = *reinterpret_cast<T*>(ptr);  // box count for class i
+                ptr += sizeof(T);
+
+                if (bc <= 0) {
+                    continue;
+                } else if (bc > h) {
+                    break;  // a count above the per-class maximum stops decoding
+                }
+
+                for (size_t j = 0; j < static_cast<size_t>(bc); ++j) {
+                    auto bbox = *reinterpret_cast<P*>(ptr);
+                    ptr += sizeof(P);
+
+                    ma_bbox_t res;
+                    // Convert the corners to centre/size form.
+                    res.w     = bbox.x_max - bbox.x_min;
+                    res.h     = bbox.y_max - bbox.y_min;
+                    res.x     = bbox.x_min + res.w * 0.5;
+                    res.y     = bbox.y_min + res.h * 0.5;
+                    res.score = bbox.score;
+                    // The class index is the position in the outer per-class loop.
+                    res.target = static_cast<int>(i);
+
+                    res.x = MA_CLIP(res.x, 0, 1.0f);
+                    res.y = MA_CLIP(res.y, 0, 1.0f);
+                    res.w = MA_CLIP(res.w, 0, 1.0f);
+                    res.h = MA_CLIP(res.h, 0, 1.0f);
+
+                    results_.emplace_front(res);
+                }
+            }
+        } break;
+            // Any other tensor type is not decoded here.
+        default:
+            return MA_ENOTSUP;
+    }
+
+    return MA_OK;
+#else
+    return MA_FAILED;
+#endif
+}
+
+ma_err_t YoloV5::postprocess() {
+    results_.clear();
+    // NMS-typed tensors are decoded by nmsPostProcess(); every other type takes the generic path.
+    switch (output_.type) {
+        case MA_TENSOR_TYPE_NMS_BBOX_U16:
+        case MA_TENSOR_TYPE_NMS_BBOX_F32: {
+#if MA_USE_ENGINE_HALIO
+            // TODO: can be optimized by not calling this handler for each frame
+            if (output_.external_handler) {
+                auto ph   = reinterpret_cast<ma::engine::EngineHalio::ExternalHandler*>(output_.external_handler);
+                float thr = threshold_score_;
+                auto rc   = (*ph)(1, &thr, sizeof(float));  // NOTE(review): opcode 1 is used with the score threshold — confirm against EngineHalio
+                if (rc == MA_OK) {
+                    threshold_score_ = thr;
+                }
+                thr = threshold_nms_;
+                rc  = (*ph)(3, &thr, sizeof(float));  // NOTE(review): opcode 3 is used with the NMS threshold — confirm against EngineHalio
+                if (rc == MA_OK) {
+                    threshold_nms_ = thr;
+                }
+            }
+#endif
+            return nmsPostProcess();
+        }
+
+        default:
+            return generalPostProcess();
+    }
+
+    return MA_ENOTSUP;
+}
 }  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.h
index 002855b..6941d79 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.h
@@ -24,6 +24,8 @@ class YoloV5 : public Detector {
 
 protected:
     ma_err_t postprocess() override;
+    ma_err_t generalPostProcess();
+    ma_err_t nmsPostProcess();
 
 public:
     YoloV5(Engine* engine);
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
index 0b8165a..5fbbc38 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
@@ -41,26 +41,26 @@ YoloV8Pose::YoloV8Pose(Engine* p_engine_) : PoseDetector(p_engine_, "yolo_world"
         const auto dim_2 = outputs_[i].shape.dims[2];
 
         switch (dim_2) {
-        case 1:
-            for (size_t j = 0; j < anchor_variants_; ++j) {
-                if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
-                    output_scores_ids_[j] = i;
-                    break;
+            case 1:
+                for (size_t j = 0; j < anchor_variants_; ++j) {
+                    if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
+                        output_scores_ids_[j] = i;
+                        break;
+                    }
                 }
-            }
-            break;
-        case 64:
-            for (size_t j = 0; j < anchor_variants_; ++j) {
-                if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
-                    output_bboxes_ids_[j] = i;
-                    break;
+                break;
+            case 64:
+                for (size_t j = 0; j < anchor_variants_; ++j) {
+                    if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
+                        output_bboxes_ids_[j] = i;
+                        break;
+                    }
+                }
+                break;
+            default:
+                if (dim_2 % 3 == 0) {
+                    output_keypoints_id_ = i;
                 }
-            }
-            break;
-        default:
-            if (dim_2 % 3 == 0) {
-                output_keypoints_id_ = i;
-            }
         }
     }
 }
@@ -103,10 +103,7 @@ bool YoloV8Pose::isValid(Engine* engine) {
 
     auto anchor_strides_1 = ma::utils::generateAnchorStrides(std::min(h, w));
     auto anchor_strides_2 = anchor_strides_1;
-    auto sum =
-      std::accumulate(anchor_strides_1.begin(), anchor_strides_1.end(), 0u, [](auto sum, const auto& anchor_stride) {
-          return sum + anchor_stride.size;
-      });
+    auto sum              = std::accumulate(anchor_strides_1.begin(), anchor_strides_1.end(), 0u, [](auto sum, const auto& anchor_stride) { return sum + anchor_stride.size; });
 
     // Note: would fail if the model has 64 classes
     for (size_t i = 0; i < num_outputs_; ++i) {
@@ -116,39 +113,35 @@ bool YoloV8Pose::isValid(Engine* engine) {
         }
 
         switch (output_shape.dims[2]) {
-        case 1: {
-            auto it = std::find_if(anchor_strides_1.begin(),
-                                   anchor_strides_1.end(),
-                                   [&output_shape](const ma_anchor_stride_t& anchor_stride) {
-                                       return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
-                                   });
-            if (it == anchor_strides_1.end()) {
-                return false;
-            } else {
-                anchor_strides_1.erase(it);
-            }
-        } break;
-
-        case 64: {
-            auto it = std::find_if(anchor_strides_2.begin(),
-                                   anchor_strides_2.end(),
-                                   [&output_shape](const ma_anchor_stride_t& anchor_stride) {
-                                       return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
-                                   });
-            if (it == anchor_strides_2.end()) {
-                return false;
-            } else {
-                anchor_strides_2.erase(it);
-            }
-        } break;
+            case 1: {
+                auto it = std::find_if(anchor_strides_1.begin(), anchor_strides_1.end(), [&output_shape](const ma_anchor_stride_t& anchor_stride) {
+                    return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
+                });
+                if (it == anchor_strides_1.end()) {
+                    return false;
+                } else {
+                    anchor_strides_1.erase(it);
+                }
+            } break;
+
+            case 64: {
+                auto it = std::find_if(anchor_strides_2.begin(), anchor_strides_2.end(), [&output_shape](const ma_anchor_stride_t& anchor_stride) {
+                    return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
+                });
+                if (it == anchor_strides_2.end()) {
+                    return false;
+                } else {
+                    anchor_strides_2.erase(it);
+                }
+            } break;
 
-        default:
-            if (output_shape.dims[2] % 3 != 0) {
-                return false;
-            }
-            if (output_shape.dims[1] != static_cast<int>(sum)) {
-                return false;
-            }
+            default:
+                if (output_shape.dims[2] % 3 != 0) {
+                    return false;
+                }
+                if (output_shape.dims[1] != static_cast<int>(sum)) {
+                    return false;
+                }
         }
     }
 
@@ -159,36 +152,38 @@ bool YoloV8Pose::isValid(Engine* engine) {
     return true;
 }
 
-const char* YoloV8Pose::getTag() { return "ma::model::yolo_world"; }
+const char* YoloV8Pose::getTag() {
+    return "ma::model::yolo_world";
+}
 
 ma_err_t YoloV8Pose::postprocess() {
     uint8_t check = 0;
 
     for (size_t i = 0; i < num_outputs_; ++i) {
         switch (outputs_[i].type) {
-        case MA_TENSOR_TYPE_S8:
-            break;
+            case MA_TENSOR_TYPE_S8:
+                break;
 
-        case MA_TENSOR_TYPE_F32:
-            check |= 1 << i;
-            break;
+            case MA_TENSOR_TYPE_F32:
+                check |= 1 << i;
+                break;
 
-        default:
-            return MA_ENOTSUP;
+            default:
+                return MA_ENOTSUP;
         }
     }
 
     switch (check) {
-    case 0:
-        return postProcessI8();
+        case 0:
+            return postProcessI8();
 
 #ifdef MA_MODEL_POSTPROCESS_FP32_VARIANT
-    case 0b1111111:
-        return postProcessF32();
+        case 0b1111111:
+            return postProcessF32();
 #endif
 
-    default:
-        return MA_ENOTSUP;
+        default:
+            return MA_ENOTSUP;
     }
 
     return MA_ENOTSUP;
@@ -213,27 +208,26 @@ ma_err_t YoloV8Pose::postProcessI8() {
     const auto anchor_matrix_size = anchor_matrix_.size();
 
     for (size_t i = 0; i < anchor_matrix_size; ++i) {
-        const auto   output_scores_id           = output_scores_ids_[i];
-        const auto*  output_scores              = output_data[output_scores_id];
+        const auto output_scores_id             = output_scores_ids_[i];
+        const auto* output_scores               = output_data[output_scores_id];
         const size_t output_scores_shape_dims_2 = outputs_[output_scores_id].shape.dims[2];
-        const auto   output_scores_quant_parm   = outputs_[output_scores_id].quant_param;
+        const auto output_scores_quant_parm     = outputs_[output_scores_id].quant_param;
 
-        const auto   output_bboxes_id           = output_bboxes_ids_[i];
-        const auto*  output_bboxes              = output_data[output_bboxes_id];
+        const auto output_bboxes_id             = output_bboxes_ids_[i];
+        const auto* output_bboxes               = output_data[output_bboxes_id];
         const size_t output_bboxes_shape_dims_2 = outputs_[output_bboxes_id].shape.dims[2];
-        const auto   output_bboxes_quant_parm   = outputs_[output_bboxes_id].quant_param;
+        const auto output_bboxes_quant_parm     = outputs_[output_bboxes_id].quant_param;
 
-        const auto& anchor_array      = anchor_matrix_[i];
-        const auto  anchor_array_size = anchor_array.size();
+        const auto& anchor_array     = anchor_matrix_[i];
+        const auto anchor_array_size = anchor_array.size();
 
-        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(
-          score_threshold_non_sigmoid, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale);
+        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(score_threshold_non_sigmoid, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale);
 
         for (size_t j = 0; j < anchor_array_size; ++j) {
             const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
 
-            auto    max_score_raw = score_threshold_quan_non_sigmoid;
-            int32_t target        = -1;
+            auto max_score_raw = score_threshold_quan_non_sigmoid;
+            int32_t target     = -1;
 
             for (size_t k = 0; k < output_scores_shape_dims_2; ++k) {
                 int8_t score = output_scores[j_mul_output_scores_shape_dims_2 + k];
@@ -245,10 +239,10 @@ ma_err_t YoloV8Pose::postProcessI8() {
                 target        = k;
             }
 
-            if (target < 0) continue;
+            if (target < 0)
+                continue;
 
-            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(
-              max_score_raw, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale));
+            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(max_score_raw, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale));
 
             // DFL
             float dist[4];
@@ -258,9 +252,7 @@ ma_err_t YoloV8Pose::postProcessI8() {
             for (size_t m = 0; m < 4; ++m) {
                 const size_t offset = pre + m * 16;
                 for (size_t n = 0; n < 16; ++n) {
-                    matrix[n] = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset + n]),
-                                                          output_bboxes_quant_parm.zero_point,
-                                                          output_bboxes_quant_parm.scale);
+                    matrix[n] = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset + n]), output_bboxes_quant_parm.zero_point, output_bboxes_quant_parm.scale);
                 }
 
                 ma::math::softmax(matrix, 16);
@@ -296,15 +288,13 @@ ma_err_t YoloV8Pose::postProcessI8() {
     ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
 
     if (multi_level_bboxes.empty()) {
-        results_.shrink_to_fit();
-
         return MA_OK;
     }
 
-    const auto*  output_keypoints            = output_data[output_keypoints_id_];
-    const auto   output_keypoints_dims_2     = outputs_[output_keypoints_id_].shape.dims[2];
-    const auto   output_keypoints_quant_parm = outputs_[output_keypoints_id_].quant_param;
-    const size_t keypoint_nums               = output_keypoints_dims_2 / 3;
+    const auto* output_keypoints           = output_data[output_keypoints_id_];
+    const auto output_keypoints_dims_2     = outputs_[output_keypoints_id_].shape.dims[2];
+    const auto output_keypoints_quant_parm = outputs_[output_keypoints_id_].quant_param;
+    const size_t keypoint_nums             = output_keypoints_dims_2 / 3;
 
     std::vector<ma_pt3f_t> n_keypoint(keypoint_nums);
 
@@ -314,18 +304,11 @@ ma_err_t YoloV8Pose::postProcessI8() {
         for (size_t i = 0; i < keypoint_nums; ++i) {
             const auto offset = pre + i * 3;
 
-            const float x = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset]),
-                                                      output_keypoints_quant_parm.zero_point,
-                                                      output_keypoints_quant_parm.scale);
+            const float x = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale);
 
-            const float y = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 1]),
-                                                      output_keypoints_quant_parm.zero_point,
-                                                      output_keypoints_quant_parm.scale);
+            const float y = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 1]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale);
 
-            const float z =
-              ma::math::sigmoid(ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 2]),
-                                                          output_keypoints_quant_parm.zero_point,
-                                                          output_keypoints_quant_parm.scale));
+            const float z = ma::math::sigmoid(ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 2]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale));
 
             n_keypoint[i] = {x, y, z};
         }
@@ -334,11 +317,9 @@ ma_err_t YoloV8Pose::postProcessI8() {
         keypoint.box = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
         keypoint.pts = n_keypoint;
 
-        results_.push_back(std::move(keypoint));
+        results_.emplace_front(std::move(keypoint));
     }
 
-    results_.shrink_to_fit();
-
     return MA_OK;
 }
 
@@ -362,22 +343,22 @@ ma_err_t YoloV8Pose::postProcessF32() {
     const auto anchor_matrix_size = anchor_matrix_.size();
 
     for (size_t i = 0; i < anchor_matrix_size; ++i) {
-        const auto   output_scores_id           = output_scores_ids_[i];
-        const auto*  output_scores              = output_data[output_scores_id];
+        const auto output_scores_id             = output_scores_ids_[i];
+        const auto* output_scores               = output_data[output_scores_id];
         const size_t output_scores_shape_dims_2 = outputs_[output_scores_id].shape.dims[2];
 
-        const auto   output_bboxes_id           = output_bboxes_ids_[i];
-        const auto*  output_bboxes              = output_data[output_bboxes_id];
+        const auto output_bboxes_id             = output_bboxes_ids_[i];
+        const auto* output_bboxes               = output_data[output_bboxes_id];
         const size_t output_bboxes_shape_dims_2 = outputs_[output_bboxes_id].shape.dims[2];
 
-        const auto& anchor_array      = anchor_matrix_[i];
-        const auto  anchor_array_size = anchor_array.size();
+        const auto& anchor_array     = anchor_matrix_[i];
+        const auto anchor_array_size = anchor_array.size();
 
         for (size_t j = 0; j < anchor_array_size; ++j) {
             const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
 
-            auto    max_score_raw = score_threshold_non_sigmoid;
-            int32_t target        = -1;
+            auto max_score_raw = score_threshold_non_sigmoid;
+            int32_t target     = -1;
 
             for (size_t k = 0; k < output_scores_shape_dims_2; ++k) {
                 int8_t score = output_scores[j_mul_output_scores_shape_dims_2 + k];
@@ -389,7 +370,8 @@ ma_err_t YoloV8Pose::postProcessF32() {
                 target        = k;
             }
 
-            if (target < 0) continue;
+            if (target < 0)
+                continue;
 
             const float real_score = ma::math::sigmoid(max_score_raw);
 
@@ -437,15 +419,13 @@ ma_err_t YoloV8Pose::postProcessF32() {
     ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
 
     if (multi_level_bboxes.empty()) {
-        results_.shrink_to_fit();
-
         return MA_OK;
     }
 
-    const auto*  output_keypoints            = output_data[output_keypoints_id_];
-    const auto   output_keypoints_dims_2     = outputs_[output_keypoints_id_].shape.dims[2];
-    const auto   output_keypoints_quant_parm = outputs_[output_keypoints_id_].quant_param;
-    const size_t keypoint_nums               = output_keypoints_dims_2 / 3;
+    const auto* output_keypoints           = output_data[output_keypoints_id_];
+    const auto output_keypoints_dims_2     = outputs_[output_keypoints_id_].shape.dims[2];
+    const auto output_keypoints_quant_parm = outputs_[output_keypoints_id_].quant_param;
+    const size_t keypoint_nums             = output_keypoints_dims_2 / 3;
 
     std::vector<ma_pt3f_t> n_keypoint(keypoint_nums);
 
@@ -468,11 +448,9 @@ ma_err_t YoloV8Pose::postProcessF32() {
         keypoint.box = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
         keypoint.pts = n_keypoint;
 
-        results_.push_back(std::move(keypoint));
+        results_.emplace_front(std::move(keypoint));
     }
 
-    results_.shrink_to_fit();
-
     return MA_OK;
 }
 #endif