# adapt_json_labels.py
# Part of a repository forked from clovaai/donut.
import json
import re
from datasets import load_dataset
import pandas as pd
def add_ground_truth(examples):
    """Attach a ``ground_truth`` JSON string built from a question and its answers.

    The record is modified in place: a ``ground_truth`` entry is added whose
    value is a JSON document of the form
    ``{"gt_parses": [{"question" : "...", "answer" : "..."}, ...]}`` with one
    question/answer pair per element of ``examples['answers']``.

    Args:
        examples: Mapping with a ``'question'`` string and an ``'answers'``
            iterable.

    Returns:
        The same ``examples`` object, now carrying ``'ground_truth'``.

    Raises:
        KeyError: If ``'question'`` or ``'answers'`` is missing.
    """
    # Collapse runs of spaces in the question before embedding it.
    query = re.sub(' +', ' ', examples['question'])

    def _quote(text):
        # json.dumps yields a quoted JSON string with quotes, backslashes and
        # control characters escaped, so the result always parses back.
        # ensure_ascii=False keeps accented characters verbatim.
        return json.dumps(text, ensure_ascii=False)

    quoted_query = _quote(query)
    pairs = [
        '{"question" : ' + quoted_query + ', "answer" : ' + _quote(answer) + '}'
        for answer in examples['answers']
    ]

    examples['ground_truth'] = '{"gt_parses": [' + ', '.join(pairs) + ']}'
    return examples
def add_file_name(examples):
    """Copy the record's ``image`` value into a ``file_name`` entry.

    Args:
        examples: Mapping that has an ``'image'`` entry.

    Returns:
        The same ``examples`` object, with ``'file_name'`` set to the value
        stored under ``'image'``.
    """
    examples['file_name'] = examples['image']
    return examples
def main_json(FILE_INPUT, FILE_OUTPUT):
    """Convert a JSON annotation file into a JSON Lines metadata file.

    Each record loaded from ``FILE_INPUT`` is passed through
    ``add_file_name`` and ``add_ground_truth``, stripped of its source-only
    fields, and written to ``FILE_OUTPUT`` as one JSON object per line.

    Args:
        FILE_INPUT: Path of the JSON file to read; its top-level value is
            iterated record by record.
        FILE_OUTPUT: Path of the JSON Lines file to (over)write.

    Raises:
        KeyError: If a record lacks one of the fields that are removed.
    """
    # Fields that only exist in the source annotations and are not exported.
    source_only_keys = (
        "image",
        "question",
        "docId",
        "answers",
        "data_split",
        "is_table",
    )

    with open(FILE_INPUT, encoding="utf-8") as source:
        data = json.load(source)

    # Serialise everything before touching the output file, so a malformed
    # record cannot leave a truncated, half-written result behind.
    serialised = []
    for record in data:
        record = add_file_name(record)
        record = add_ground_truth(record)
        for key in source_only_keys:
            del record[key]
        serialised.append(json.dumps(record) + '\n')

    with open(FILE_OUTPUT, 'w', encoding="utf-8") as sink:
        sink.writelines(serialised)
def main_excell(FILE_INPUT, FILE_OUTPUT):
    """Convert a JSON annotation file into a semicolon-separated CSV file.

    Despite the function name, the output is written with
    ``DataFrame.to_csv`` (``sep=';'``, no index column), not as an Excel
    workbook. Each record is passed through ``add_file_name`` and
    ``add_ground_truth`` and stripped of its source-only fields first. Every
    processed record and the final DataFrame are printed to stdout.

    Args:
        FILE_INPUT: Path of the JSON file to read; its top-level value is
            iterated record by record.
        FILE_OUTPUT: Path of the CSV file to (over)write.

    Raises:
        KeyError: If a record lacks one of the fields that are removed.
    """
    # Fields that only exist in the source annotations and are not exported.
    source_only_keys = (
        "image",
        "question",
        "docId",
        "answers",
        "data_split",
        "is_table",
    )

    # The handle is only needed while parsing; the context manager closes it
    # even if json.load raises.
    with open(FILE_INPUT, encoding="utf-8") as input_file:
        data = json.load(input_file)

    line_result = []
    for line in data:
        line = add_file_name(line)
        line = add_ground_truth(line)
        for key in source_only_keys:
            del line[key]
        print(line)
        line_result.append(line)

    # One row per record, one column per remaining field.
    df = pd.DataFrame(line_result)
    df.to_csv(FILE_OUTPUT, sep=';', index=False)
    print(df)
if __name__ == "__main__":
    FILE_INPUT = "./dataset/bupa_docvqa_dataset_v2/metadata.json"

    # Derive both metadata files from the same annotation file: the JSON
    # Lines version first, then the semicolon-separated CSV version.
    for FILE_OUTPUT, writer in (
        ("./dataset/bupa_docvqa_dataset_v2/metadata_gt.jsonl", main_json),
        ("./dataset/bupa_docvqa_dataset_v2/metadata_gt.csv", main_excell),
    ):
        writer(FILE_INPUT, FILE_OUTPUT)

    # IMPORTANT: replace "" with " by hand.
    # NOTE(review): presumably this refers to the doubled quotes in the
    # generated CSV - confirm which file needs the manual edit.

    # Load the generated dataset folder the same way training will, as a
    # check that it can be read.
    data = load_dataset("dataset/bupa_docvqa_dataset_v2/", split="train")