Update text_extraction_id in all callback serializers

the-deep · Dec 27, 2023 · 63bf6a3 · 63bf6a3
1 parent ff1984e
commit 63bf6a3
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 17 deletions.
diff --git a/apps/deepl_integration/handlers.py b/apps/deepl_integration/handlers.py
@@ -360,7 +360,7 @@ def auto_trigger_request_to_extractor(cls, lead):
             "documents": [
                 {
                     "client_id": cls.get_client_id(lead),
-                    "text_extraction_id": str(lead_preview.text_extraction_id)
+                    "text_extraction_id": str(lead_preview.text_extraction_id),
                 }
             ],
             "callback_url": cls.get_callback_url()
@@ -625,18 +625,17 @@ def save_data(
         images_uri: List[str],
         word_count: int,
         page_count: int,
-        text_extraction_id: str
+        text_extraction_id: str,
     ):
         LeadPreview.objects.filter(lead=lead).delete()
         LeadPreviewImage.objects.filter(lead=lead).delete()
-        word_count, page_count = word_count, page_count
         # and create new one
         LeadPreview.objects.create(
             lead=lead,
             text_extract=RequestHelper(url=text_source_uri, ignore_error=True).get_text(sanitize=True) or '',
             word_count=word_count,
             page_count=page_count,
-            text_extraction_id=text_extraction_id
+            text_extraction_id=text_extraction_id,
         )
         # Save extracted images as LeadPreviewImage instances
         # TODO: The logic is the same for unified_connector leads as well. Maybe have a single func?
@@ -668,7 +667,7 @@ def save_lead_data_using_connector_lead(
             text_extract=connector_lead.simplified_text,
             word_count=connector_lead.word_count,
             page_count=connector_lead.page_count,
-            text_extraction_id=connector_lead.text_extraction_id
+            text_extraction_id=connector_lead.text_extraction_id,
         )
         # Save extracted images as LeadPreviewImage instances
         # TODO: The logic is the same for unified_connector leads as well. Maybe have a single func?

diff --git a/apps/deepl_integration/serializers.py b/apps/deepl_integration/serializers.py
@@ -73,7 +73,7 @@ class LeadExtractCallbackSerializer(DeeplServerBaseCallbackSerializer):
     text_path = serializers.CharField(required=False, allow_null=True)
     total_words_count = serializers.IntegerField(required=False, default=0, allow_null=True)
     total_pages = serializers.IntegerField(required=False, default=0, allow_null=True)
-    text_extraction_id = serializers.CharField(required=True)
+    text_extraction_id = serializers.UUIDField(required=False, allow_null=True)
     nlp_handler = LeadExtractionHandler
 
     def validate(self, data):
@@ -85,9 +85,9 @@ def validate(self, data):
             })
         if data['status'] == self.Status.SUCCESS:
             errors = {}
-            for key in ['text_path', 'total_words_count', 'total_pages']:
-                if key not in data:
-                    errors[key] = f'<{key}> is missing. Required when the extraction status is Success'
+            for key in ['text_path', 'total_words_count', 'total_pages', 'text_extraction_id']:
+                if key not in data or data[key] is None:
+                    errors[key] = f"<{key=} or {data.get('key')=}> is missing. Required when the extraction status is Success"
             if errors:
                 raise serializers.ValidationError(errors)
         return data
@@ -102,7 +102,7 @@ def create(self, data):
                 data.get('images_path', [])[:10],   # TODO: Support more images; too many images will cause an error.
                 data.get('total_words_count'),
                 data.get('total_pages'),
-                data.get('text_extraction_id')
+                data.get('text_extraction_id'),
             )
             # Add to deduplication index
             transaction.on_commit(lambda: index_lead_and_calculate_duplicates.delay(lead.id))
@@ -134,7 +134,7 @@ def validate(self, data):
             errors = {}
             for key in ['text_path', 'total_words_count', 'total_pages', 'text_extraction_id']:
                 if key not in data or data[key] is None:
-                    errors[key] = f'<{key}> is missing. Required when the extraction is Success'
+                    errors[key] = f"<{key=} or {data.get('key')=}> is missing. Required when the extraction status is Success"
             if errors:
                 raise serializers.ValidationError(errors)
         return data
@@ -149,7 +149,7 @@ def create(self, data):
                 data.get('images_path', [])[:10],  # TODO: Support more images; too many images will cause an error.
                 data['total_words_count'],
                 data['total_pages'],
-                data['text_extraction_id']
+                data['text_extraction_id'],
             )
         connector_lead.update_extraction_status(ConnectorLead.ExtractionStatus.FAILED)
         return connector_lead

diff --git a/apps/unified_connector/tests/test_mutation.py b/apps/unified_connector/tests/test_mutation.py
@@ -498,19 +498,24 @@ def _check_connector_lead_status(connector_lead, status):
             total_pages=10,
             status=DeeplServerBaseCallbackSerializer.Status.FAILED.value,
             text_extraction_id='c4c3c256-f307-4a85-a50e-5516a6f1ce8e',
-
         )
 
         response = self.client.post(url, data)
         self.assert_400(response)
         _check_connector_lead_status(connector_lead1, ConnectorLead.ExtractionStatus.PENDING)
+        connector_lead1.refresh_from_db()
+        assert connector_lead1.text_extraction_id is None
 
         data['client_id'] = UnifiedConnectorLeadHandler.get_client_id(connector_lead1)
-        data['status'] = DeeplServerBaseCallbackSerializer.Status.SUCCESS.value
+        data['status'] = DeeplServerBaseCallbackSerializer.Status.FAILED.value
         response = self.client.post(url, data)
         self.assert_200(response)
         connector_lead1.refresh_from_db()
-        _check_connector_lead_status(connector_lead1, ConnectorLead.ExtractionStatus.SUCCESS)
+        _check_connector_lead_status(connector_lead1, ConnectorLead.ExtractionStatus.FAILED)
+        assert connector_lead1.text_extraction_id is None
+        assert connector_lead1.simplified_text is None
+        assert connector_lead1.word_count is None
+        assert connector_lead1.page_count is None
 
         # ------ Extraction SUCCESS
         data = dict(
@@ -530,14 +535,17 @@ def _check_connector_lead_status(connector_lead, status):
         response = self.client.post(url, data)
         self.assert_200(response)
         _check_connector_lead_status(connector_lead2, ConnectorLead.ExtractionStatus.SUCCESS)
+        assert connector_lead1.text_extraction_id is data['text_extraction_id']
+        assert connector_lead1.simplified_text is not None
+        assert connector_lead1.word_count == 100
+        assert connector_lead1.page_count == 10
 
-        data['url'] = connector_lead2.url
         response = self.client.post(url, data)
         self.assert_200(response)
         connector_lead2.refresh_from_db()
         _check_connector_lead_status(connector_lead2, ConnectorLead.ExtractionStatus.SUCCESS)
         preview_image_qs = ConnectorLeadPreviewImage.objects.filter(connector_lead=connector_lead2)
         preview_image = preview_image_qs.first()
         self.assertEqual(connector_lead2.simplified_text, SAMPLE_SIMPLIFIED_TEXT)
-        self.assertEqual(preview_image_qs.count(), 4)
+        self.assertEqual(preview_image_qs.count(), 2)
         self.assertIsNotNone(preview_image and preview_image.image.name)