forked from clovaai/donut
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_custom_dataset_hf.py
61 lines (49 loc) · 2.07 KB
/
create_custom_dataset_hf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
from datasets.dataset_dict import *
from datasets import *
import datasets
from PIL import Image
import json
from huggingface_hub import notebook_login
def preprocess_train(example, base_dir="E:\\INETUM\\INETUM_Datasets\\DocVQA\\rrc_docvqa\\rrc_docvqa\\train\\"):
    """Attach the decoded training image to one dataset row.

    Args:
        example: dataset row; must contain a "file_name" key whose value is a
            path relative to *base_dir*.
        base_dir: root folder of the train-split images. Defaults to the
            original hard-coded path so ``Dataset.map(preprocess_train)``
            keeps working unchanged.

    Returns:
        dict with a single "image" key holding an RGB ``PIL.Image``.
    """
    # Image.open is lazy and keeps the file descriptor open; the context
    # manager closes it once convert("RGB") has materialised the pixel data,
    # preventing a descriptor leak across thousands of map() calls.
    with Image.open(base_dir + example["file_name"]) as img:
        return {"image": img.convert("RGB")}
def preprocess_val(example, base_dir="E:\\INETUM\\INETUM_Datasets\\DocVQA\\rrc_docvqa\\rrc_docvqa\\val\\"):
    """Attach the decoded validation image to one dataset row.

    Args:
        example: dataset row; must contain a "file_name" key whose value is a
            path relative to *base_dir*.
        base_dir: root folder of the val-split images. Defaults to the
            original hard-coded path so ``Dataset.map(preprocess_val)``
            keeps working unchanged.

    Returns:
        dict with a single "image" key holding an RGB ``PIL.Image``.
    """
    # Context manager closes the underlying file handle that the lazy
    # Image.open would otherwise leak (convert("RGB") loads the data first).
    with Image.open(base_dir + example["file_name"]) as img:
        return {"image": img.convert("RGB")}
def _load_split(labels_csv, preprocess_fn):
    """Load one split: read its ';'-separated metadata CSV into a pandas
    DataFrame and map *preprocess_fn* over the rows to attach the images."""
    df = pd.read_csv(labels_csv, sep=";", engine="python")
    return Dataset.from_pandas(df).map(preprocess_fn)


def _preview(split):
    """Sanity-check a split's first example and print diagnostics.

    Parses the first ground truth as JSON (raises if the annotation is
    malformed) and touches the image column so decoding errors surface here
    rather than later during training.
    """
    example = split[0]
    # Return value deliberately discarded: this call only validates the JSON.
    json.loads(example["ground_truth"])
    _ = example["image"]  # force image access so a bad path fails fast
    print("\nRESULT: ", example["ground_truth"])
    # NOTE(review): the next two headers had their notebook display() calls
    # commented out upstream, so they print labels with no payload.
    print("\nDATASET FEATURES:")
    print("\nDATASET ELEMENT:")


if __name__ == "__main__":
    # Hard-coded local paths to the RRC DocVQA metadata CSVs.
    LABELS_FILE_TRAIN = "E:\\INETUM\\INETUM_Datasets\\DocVQA\\rrc_docvqa\\rrc_docvqa\\train\\metadata_gt.csv"
    LABELS_FILE_VAL = "E:\\INETUM\\INETUM_Datasets\\DocVQA\\rrc_docvqa\\rrc_docvqa\\val\\metadata_gt.csv"

    dataset_train = _load_split(LABELS_FILE_TRAIN, preprocess_train)
    dataset_val = _load_split(LABELS_FILE_VAL, preprocess_val)

    _preview(dataset_train)
    _preview(dataset_val)

    # Validation split is published under the "test" key, matching the
    # original script's layout.
    dataset = DatasetDict({'train': dataset_train, 'test': dataset_val})

    # Interactive HF authentication, then upload the private dataset
    # (reloadable later via load_dataset).
    notebook_login()
    dataset.push_to_hub("arvisioncode/donut-docvqa-base-12k", private=True)