foundation-model-stack · evaline-ju · Jan 21, 2025 · Jan 13, 2025 · Jan 14, 2025 · Jan 14, 2025
diff --git a/protos/caikit_data_model_nlp.proto b/protos/caikit_data_model_nlp.proto
@@ -89,8 +89,9 @@ message TokenClassificationResult {
   string word = 3;
   string entity = 4;
   string entity_group = 5;
-  double score = 6;
-  int64 token_count = 7;
+  string detector_id = 6;
+  double score = 7;
+  int64 token_count = 8;
 }
 
 message TokenClassificationResults {

@@ -132,22 +132,26 @@ pub struct ContentAnalysisResponse {
     pub detection: String,
     /// Detection type or aggregate detection label
     pub detection_type: String,
+    /// ID of Detector
+    pub detector_id: Option<String>,
     /// Score of detection
     pub score: f64,
     /// Optional, any applicable evidence for detection
     #[serde(skip_serializing_if = "Option::is_none")]
     pub evidence: Option<Vec<EvidenceObj>>,
 }
 
-impl From<ContentAnalysisResponse> for crate::models::TokenClassificationResult {
-    fn from(value: ContentAnalysisResponse) -> Self {
+impl From<(String, ContentAnalysisResponse)> for crate::models::TokenClassificationResult {
+    fn from(value: (String, ContentAnalysisResponse)) -> Self {
+        let (detector_id, response) = value;
         Self {
-            start: value.start as u32,
-            end: value.end as u32,
-            word: value.text,
-            entity: value.detection,
-            entity_group: value.detection_type,
-            score: value.score,
+            start: response.start as u32,
+            end: response.end as u32,
+            word: response.text,
+            entity: response.detection,
+            entity_group: response.detection_type,
+            detector_id,
+            score: response.score,
             token_count: None,
         }
     }

@@ -459,6 +459,9 @@ pub struct TokenClassificationResult {
     /// Aggregate label, if applicable
     pub entity_group: String,
 
+    /// id of detector (model) responsible for result(s)
+    pub detector_id: String,
+
     /// Confidence-like score of this classification prediction in [0, 1]
     pub score: f64,
 
@@ -894,6 +897,9 @@ pub struct DetectionResult {
     // The detection class
     pub detection: String,
 
+    // The id of the detector
+    pub detector_id: Option<String>,
+
     // The confidence level in the detection class
     pub score: f64,
 

@@ -427,7 +427,7 @@ async fn detection_task(
                                             .into_iter()
                                             .flat_map(|r| {
                                                 r.into_iter().filter_map(|resp| {
-                                                    let result: TokenClassificationResult = resp.into();
+                                                    let result: TokenClassificationResult = (detector_id.clone(),resp).into();
                                                     (result.score >= threshold).then_some(result)
                                                 })
                                             })

@@ -458,13 +458,15 @@ mod tests {
         text: &str,
         detection: &str,
         detection_type: &str,
+        detector_id: &str,
     ) -> TokenClassificationResult {
         TokenClassificationResult {
             start: span.0 as u32,
             end: span.1 as u32,
             word: text.to_string(),
             entity: detection.to_string(),
             entity_group: detection_type.to_string(),
+            detector_id: detector_id.to_string(),
             score: 0.99,
             token_count: None,
         }
@@ -498,11 +500,11 @@ mod tests {
             let partial_span = (chunk_token.start + 2, chunk_token.end - 2);
 
             let (detector_tx1, detector_rx1) = mpsc::channel(1);
-            let detection = get_detection_obj(whole_span, text, "has_HAP", "HAP");
+            let detection = get_detection_obj(whole_span, text, "has_HAP", "HAP", "en-hap");
             let _ = detector_tx1.send((chunk.clone(), vec![detection])).await;
 
             let (detector_tx2, detector_rx2) = mpsc::channel(1);
-            let detection = get_detection_obj(partial_span, text, "email_ID", "PII");
+            let detection = get_detection_obj(partial_span, text, "email_ID", "PII", "en-pii");
             let _ = detector_tx2.send((chunk.clone(), vec![detection])).await;
 
             // Push HAP after PII to make sure detection ordering is not coincidental

@@ -419,7 +419,7 @@ async fn detection_task(
                                             .into_iter()
                                             .flat_map(|r| {
                                                 r.into_iter().filter_map(|resp| {
-                                                    let result: TokenClassificationResult = resp.into();
+                                                    let result: TokenClassificationResult = (detector_id.clone(), resp).into();
                                                     (result.score >= threshold).then_some(result)
                                                 })
                                             })

@@ -158,6 +158,7 @@ impl AggregationActor {
                             text: r.word,
                             detection: r.entity,
                             detection_type: r.entity_group,
+                            detector_id: Some(r.detector_id),
                             score: r.score,
                             evidence: None,
                         })
@@ -206,13 +207,15 @@ mod tests {
         text: &str,
         detection: &str,
         detection_type: &str,
+        detector_id: &str,
     ) -> TokenClassificationResult {
         TokenClassificationResult {
             start: span.0 as u32,
             end: span.1 as u32,
             word: text.to_string(),
             entity: detection.to_string(),
             entity_group: detection_type.to_string(),
+            detector_id: detector_id.to_string(),
             score: 0.99,
             token_count: None,
         }
@@ -246,11 +249,11 @@ mod tests {
             let partial_span = (chunk_token.start + 2, chunk_token.end - 2);
 
             let (detector_tx1, detector_rx1) = mpsc::channel(1);
-            let detection = get_detection_obj(whole_span, text, "has_HAP", "HAP");
+            let detection = get_detection_obj(whole_span, text, "has_HAP", "HAP", "en-hap");
             let _ = detector_tx1.send((chunk.clone(), vec![detection])).await;
 
             let (detector_tx2, detector_rx2) = mpsc::channel(1);
-            let detection = get_detection_obj(partial_span, text, "email_ID", "PII");
+            let detection = get_detection_obj(partial_span, text, "email_ID", "PII", "en-pii");
             let _ = detector_tx2.send((chunk.clone(), vec![detection])).await;
 
             // Push HAP after PII to make sure detection ordering is not coincidental

@@ -695,9 +695,10 @@ pub async fn detect(
             response
                 .into_iter()
                 .filter_map(|resp| {
-                    let mut result: TokenClassificationResult = resp.into();
+                    let mut result: TokenClassificationResult = (detector_id.clone(), resp).into();
                     result.start += chunk.offset as u32;
                     result.end += chunk.offset as u32;
+                    // detector_id is already populated by the `(detector_id, resp)` conversion above
                     (result.score >= threshold).then_some(result)
                 })
                 .collect::<Vec<_>>()
@@ -756,6 +757,7 @@ pub async fn detect_content(
                 .filter_map(|mut resp| {
                     resp.start += chunk.offset;
                     resp.end += chunk.offset;
+                    resp.detector_id = Some(detector_id.clone()); // add detector_id
                     (resp.score >= threshold).then_some(resp)
                 })
                 .collect::<Vec<_>>()
@@ -803,6 +805,16 @@ pub async fn detect_for_generation(
             results
                 .into_iter()
                 .filter(|detection| detection.score > threshold)
+                .map(|detection| {
+                    // Tag each detection with the id of the detector that produced it
+                    DetectionResult {
+                        detection_type: detection.detection_type,
+                        detection: detection.detection,
+                        detector_id: Some(detector_id.clone()),
+                        score: detection.score,
+                        evidence: detection.evidence,
+                    }
+                })
                 .collect()
         })
         .map_err(|error| Error::DetectorRequestFailed {
@@ -844,6 +856,16 @@ pub async fn detect_for_chat(
             results
                 .into_iter()
                 .filter(|detection| detection.score > threshold)
+                .map(|detection| {
+                    // Tag each detection with the id of the detector that produced it
+                    DetectionResult {
+                        detection_type: detection.detection_type,
+                        detection: detection.detection,
+                        detector_id: Some(detector_id.clone()),
+                        score: detection.score,
+                        evidence: detection.evidence,
+                    }
+                })
                 .collect()
         })
         .map_err(|error| Error::DetectorRequestFailed {
@@ -899,6 +921,16 @@ pub async fn detect_for_context(
             results
                 .into_iter()
                 .filter(|detection| detection.score > threshold)
+                .map(|detection| {
+                    // Tag each detection with the id of the detector that produced it
+                    DetectionResult {
+                        detection_type: detection.detection_type,
+                        detection: detection.detection,
+                        detector_id: Some(detector_id.clone()),
+                        score: detection.score,
+                        evidence: detection.evidence,
+                    }
+                })
                 .collect()
         })
         .map_err(|error| Error::DetectorRequestFailed {
@@ -1131,6 +1163,7 @@ mod tests {
             word: second_sentence.clone(),
             entity: "has_HAP".to_string(),
             entity_group: "hap".to_string(),
+            detector_id: detector_id.to_string(),
             score: 0.9,
             token_count: None,
         }];
@@ -1151,6 +1184,7 @@ mod tests {
                 text: first_sentence.clone(),
                 detection: "has_HAP".to_string(),
                 detection_type: "hap".to_string(),
+                detector_id: Some(detector_id.to_string()),
                 score: 0.1,
                 evidence: Some(vec![]),
             }],
@@ -1160,6 +1194,7 @@ mod tests {
                 text: second_sentence.clone(),
                 detection: "has_HAP".to_string(),
                 detection_type: "hap".to_string(),
+                detector_id: Some(detector_id.to_string()),
                 score: 0.9,
                 evidence: Some(vec![]),
             }],
@@ -1300,6 +1335,7 @@ mod tests {
         let expected_response: Vec<DetectionResult> = vec![DetectionResult {
             detection_type: "relevance".to_string(),
             detection: "is_relevant".to_string(),
+            detector_id: Some(detector_id.to_string()),
             score: 0.9,
             evidence: Some(
                 [EvidenceObj {
@@ -1325,6 +1361,7 @@ mod tests {
         .then_return(Ok(vec![DetectionResult {
             detection_type: "relevance".to_string(),
             detection: "is_relevant".to_string(),
+            detector_id: Some(detector_id.to_string()),
             score: 0.9,
             evidence: Some(
                 [EvidenceObj {
@@ -1393,6 +1430,7 @@ mod tests {
         .then_return(Ok(vec![DetectionResult {
             detection_type: "relevance".to_string(),
             detection: "is_relevant".to_string(),
+            detector_id: Some(detector_id.to_string()),
             score: 0.1,
             evidence: None,
         }]));