From 8855826216af94f7a4deb21c776e1322b689a649 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Sun, 8 Dec 2024 07:24:44 +0000 Subject: [PATCH] added fix for draft_data_json --- backend/projects/annotation_registry.py | 85 +++++-------------------- backend/projects/views.py | 63 ++++++++++-------- 2 files changed, 54 insertions(+), 94 deletions(-) diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py index 002896007..0e29a5b40 100644 --- a/backend/projects/annotation_registry.py +++ b/backend/projects/annotation_registry.py @@ -167,59 +167,6 @@ } -def convert_prediction_json_to_annotation_result( - prediction_json, speakers_json, audio_duration, index, tred_type, is_acoustic=False -): - """ - Convert prediction_json and transcribed_json to annotation_result - """ - - result = [] - if prediction_json == None: - return result - - for idx, val in enumerate(prediction_json): - label_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "labels", - "original_length": audio_duration, - } - text_dict = { - "origin": "manual", - "to_name": "audio_url", - "from_name": "transcribed_json", - "original_length": audio_duration, - } - if is_acoustic: - text_dict["from_name"] = tred_type - id = f"shoonya_{index}s{idx}s{generate_random_string(13-len(str(idx)))}" - label_dict["id"] = id - text_dict["id"] = id - label_dict["type"] = "labels" - text_dict["type"] = "textarea" - - value_labels = { - "start": val["start"], - "end": val["end"], - "labels": [ - next( - speaker - for speaker in speakers_json - if speaker["speaker_id"] == val["speaker_id"] - )["name"] - ], - } - value_text = {"start": val["start"], "end": val["end"], "text": [val["text"]]} - - label_dict["value"] = value_labels - text_dict["value"] = value_text - result.append(label_dict) - result.append(text_dict) - - return result - - def convert_conversation_json_to_annotation_result(conversation_json, idx): result = [] for i in range(len(conversation_json)): @@ -239,12 +186,15 @@ def convert_conversation_json_to_annotation_result(conversation_json, idx): def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None): + from projects.views import convert_prediction_json_to_annotation_result + registry_helper = ProjectRegistry.get_instance() input_dataset_info = registry_helper.get_input_dataset_and_fields(project_type) dataset_model = getattr(dataset_models, input_dataset_info["dataset_type"]) try: dataset_item = dataset_model.objects.get(pk=pk) except: + dataset_item = None pass result = [] idx = 0 @@ -263,21 +213,20 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None) if field == "conversation_json": ans = convert_conversation_json_to_annotation_result(value, idx) elif field == "transcribed_json" or field == "prediction_json": - assert type(value) in [list, dict], f"Something wrong is there in the type of {value}" - if isinstance(value,list): - value = { - "verbatim_transcribed_json": value - } - for tred_type, tred_value in value.items(): - sub_ans = convert_prediction_json_to_annotation_result( - tred_value, - dataset_item.speakers_json, - dataset_item.audio_duration, - idx, - tred_type = tred_type, - is_acoustic = (project_type == "AcousticNormalisedTranscriptionEditing") - ) - ans.extend(sub_ans) + assert type(value) in [ + list, + dict, + ], f"Something wrong is there in the type of {value}" + if isinstance(value, list): + value = {"verbatim_transcribed_json": value} + sub_ans = convert_prediction_json_to_annotation_result( + None, + project_type, + dataset_item, + value, + True, + ) + ans.extend(sub_ans) else: if field_type == "textarea": field_dict["value"] = {"text": [value]} diff --git a/backend/projects/views.py b/backend/projects/views.py index 1f6e97e4e..039e53570 100644 --- a/backend/projects/views.py +++ b/backend/projects/views.py @@ -854,33 +854,42 @@ def get_task_count_unassigned(pk, user): return len(proj_tasks_unassigned) -def convert_prediction_json_to_annotation_result(pk, proj_type): +def convert_prediction_json_to_annotation_result( + pk, proj_type, data_item, prediction_json, populate_draft_data=False +): result = [] if ( proj_type == "AudioTranscriptionEditing" or proj_type == "AcousticNormalisedTranscriptionEditing" ): - data_item = SpeechConversation.objects.get(pk=pk) - prediction_json = ( - json.loads(data_item.prediction_json) - if isinstance(data_item.prediction_json, str) - else data_item.prediction_json - ) - assert type(prediction_json) in [dict, list], "Seems something is wrong with the formatting" + if not data_item and not prediction_json: + data_item = SpeechConversation.objects.get(pk=pk) + prediction_json = ( + json.loads(data_item.prediction_json) + if isinstance(data_item.prediction_json, str) + else data_item.prediction_json + ) + assert type(prediction_json) in [ + dict, + list, + ], "Seems something is wrong with the formatting" # see if the prediction is a list, then it seems that only verbatim json is present - if isinstance(prediction_json,list): - prediction_json = { - "verbatim_transcribed_json": prediction_json - } - + if isinstance(prediction_json, list): + prediction_json = {"verbatim_transcribed_json": prediction_json} + speakers_json = data_item.speakers_json audio_duration = data_item.audio_duration # converting prediction_json to result (wherever it exists) for every task. if prediction_json == None: return result # for pred_type, pred_json in prediction_json.items(): - if 'acoustic_normalised_transcribed_json' in prediction_json.keys(): - for idx, (val, val_acoustic) in enumerate(zip(prediction_json['verbatim_transcribed_json'],prediction_json['acoustic_normalised_transcribed_json'])): + if "acoustic_normalised_transcribed_json" in prediction_json.keys(): + for idx, (val, val_acoustic) in enumerate( + zip( + prediction_json["verbatim_transcribed_json"], + prediction_json["acoustic_normalised_transcribed_json"], + ) + ): label_dict = { "origin": "manual", "to_name": "audio_url", @@ -900,18 +909,20 @@ def convert_prediction_json_to_annotation_result(pk, proj_type): "original_length": audio_duration, } if proj_type == "AcousticNormalisedTranscriptionEditing": - text_dict["from_name"] = 'verbatim_transcribed_json' - text_dict_acoustic["from_name"] = 'acoustic_normalised_transcribed_json' - + text_dict["from_name"] = "verbatim_transcribed_json" + text_dict_acoustic[ + "from_name" + ] = "acoustic_normalised_transcribed_json" + id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}" label_dict["id"] = id text_dict["id"] = id text_dict_acoustic["id"] = id - + label_dict["type"] = "labels" text_dict["type"] = "textarea" text_dict_acoustic["type"] = "textarea" - + value_labels = { "start": val["start"], "end": val["end"], @@ -933,7 +944,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type): "end": val_acoustic["end"], "text": [val_acoustic["text"]], } - + label_dict["value"] = value_labels text_dict["value"] = value_text text_dict_acoustic["value"] = value_text_acoustic @@ -942,7 +953,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type): result.append(text_dict) result.append(text_dict_acoustic) else: - for idx, val in enumerate(prediction_json['verbatim_transcribed_json']): + for idx, val in enumerate(prediction_json["verbatim_transcribed_json"]): label_dict = { "origin": "manual", "to_name": "audio_url", @@ -956,13 +967,13 @@ def convert_prediction_json_to_annotation_result(pk, proj_type): "original_length": audio_duration, } if proj_type == "AcousticNormalisedTranscriptionEditing": - text_dict["from_name"] = 'verbatim_transcribed_json' + text_dict["from_name"] = "verbatim_transcribed_json" id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}" label_dict["id"] = id text_dict["id"] = id label_dict["type"] = "labels" text_dict["type"] = "textarea" - + value_labels = { "start": val["start"], "end": val["end"], @@ -979,7 +990,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type): "end": val["end"], "text": [val["text"]], } - + label_dict["value"] = value_labels text_dict["value"] = value_text # mainly label_dict and text_dict are sent as result @@ -2439,7 +2450,7 @@ def assign_new_tasks(self, request, pk, *args, **kwargs): ]: try: result = convert_prediction_json_to_annotation_result( - task.input_data.id, project.project_type + task.input_data.id, project.project_type, None, None, False ) except Exception as e: print(