From 24119ecdcb81d4831180724cfc384782606dd3c4 Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Fri, 24 Mar 2023 03:54:55 +0800 Subject: [PATCH 01/50] =?UTF-8?q?=E9=98=B6=E6=AE=B5=E6=80=A7=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dataset_zoo/xfund/metafile.yml | 25 +++++++ dataset_zoo/xfund/ner.py | 74 +++++++++++++++++++ .../obtainers/naive_data_obtainer.py | 1 + tools/dataset_converters/prepare_dataset.py | 4 +- 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 dataset_zoo/xfund/metafile.yml create mode 100644 dataset_zoo/xfund/ner.py diff --git a/dataset_zoo/xfund/metafile.yml b/dataset_zoo/xfund/metafile.yml new file mode 100644 index 000000000..993117468 --- /dev/null +++ b/dataset_zoo/xfund/metafile.yml @@ -0,0 +1,25 @@ +Name: 'XFUND' +Paper: + Title: 'FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents' + URL: https://arxiv.org/pdf/1905.13538.pdf + Venue: ICDAR + Year: '2019' + BibTeX: '@inproceedings{jaume2019, + title = {FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author = {Guillaume Jaume, Hazim Kemal Ekenel, Jean-Philippe Thiran}, + booktitle = {Accepted to ICDAR-OST}, + year = {2019}}' +Data: + Website: https://guillaumejaume.github.io/FUNSD/ + Language: + - English + Scene: + - Document + Granularity: + - Word + Tasks: + - ner + License: + Type: FUNSD License + Link: https://guillaumejaume.github.io/FUNSD/work/ + Format: .json diff --git a/dataset_zoo/xfund/ner.py b/dataset_zoo/xfund/ner.py new file mode 100644 index 000000000..c9cf10526 --- /dev/null +++ b/dataset_zoo/xfund/ner.py @@ -0,0 +1,74 @@ +data_root = 'data/xfund' +cache_path = 'data/cache' +langs = ['zh'] + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='a4ce16d1c1a8554a8b1e00907cff3b4b', + content=['image'], + mapping=[ + [ + f'{lang}_train/*.jpg', + f'ner_imgs/{lang}/train' + ] + ]) for lang in langs] + + [dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='af1afd5e935cccd3a105de6c12eb4c31', + content=['annotation'], + mapping=[ + [ + f'{lang}_train.json', + f'annotations/{lang}/ner_train.json' + ] + ]) for lang in langs] + ), + # gatherer=dict( + # type='PairGatherer', + # img_suffixes=['.png'], + # rule=[r'(\w+)\.png', r'\1.json']), + # parser=dict(type='FUNSDTextDetAnnParser'), + # packer=dict(type='TextDetPacker'), + # dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='f84c2651e350f5b394585207a43d06e4', + content=['image'], + mapping=[ + [ + f'{lang}_val/*.jpg', + f'ner_imgs/{lang}/test' + ] + ]) for lang in langs] + + [dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='c243c35d1685a16435c8b281a445005c', + content=['annotation'], + mapping=[ + [ + f'{lang}_val.json', + f'annotations/{lang}/ner_test.json' + ] + ]) for lang in langs] + ), + # gatherer=dict( + # type='PairGatherer', + # img_suffixes=['.png'], + # rule=[r'(\w+)\.png', r'\1.json']), + # parser=dict(type='FUNSDTextDetAnnParser'), + # 
packer=dict(type='TextDetPacker'), + # dumper=dict(type='JsonDumper'), +) +delete = ['annotations', 'funsd'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py index 664ca6817..cc81737df 100644 --- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py +++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py @@ -180,6 +180,7 @@ def move(self, mapping: List[Tuple[str, str]]) -> None: shutil.move(f, dst) elif osp.exists(src) and not osp.exists(dst): + mkdir_or_exist(osp.dirname(dst)) shutil.move(src, dst) def clean(self) -> None: diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py index 84b8a0353..ea5c20974 100644 --- a/tools/dataset_converters/prepare_dataset.py +++ b/tools/dataset_converters/prepare_dataset.py @@ -21,9 +21,9 @@ def parse_args(): parser.add_argument( '--task', default='textdet', - choices=['textdet', 'textrecog', 'textspotting', 'kie'], + choices=['textdet', 'textrecog', 'textspotting', 'kie', 'ner'], help='Task type. Options are "textdet", "textrecog", "textspotting"' - ' and "kie".') + ' "kie" and "ner".') parser.add_argument( '--splits', default=['train', 'test', 'val'], From 3ae3f842e286a70d7544895bbdc1b1db3fcc9596 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sat, 25 Mar 2023 15:53:19 +0800 Subject: [PATCH 02/50] =?UTF-8?q?=E9=87=8D=E6=9E=84xfund=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86config=E6=96=87=E4=BB=B6=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .codespellrc | 2 +- dataset_zoo/xfund/metafile.yml | 25 ------- dataset_zoo/xfund/ner.py | 74 ------------------- dataset_zoo/xfund/zh/metafile.yml | 41 ++++++++++ dataset_zoo/xfund/zh/sample_anno.md | 70 ++++++++++++++++++ dataset_zoo/xfund/zh/ser.py | 67 +++++++++++++++++ mmocr/datasets/preparers/parsers/__init__.py | 3 +- .../preparers/parsers/xfund_parser.py | 68 +++++++++++++++++ tools/dataset_converters/prepare_dataset.py | 4 +- 9 files changed, 251 insertions(+), 103 deletions(-) delete mode 100644 dataset_zoo/xfund/metafile.yml delete mode 100644 dataset_zoo/xfund/ner.py create mode 100644 dataset_zoo/xfund/zh/metafile.yml create mode 100644 dataset_zoo/xfund/zh/sample_anno.md create mode 100644 dataset_zoo/xfund/zh/ser.py create mode 100644 mmocr/datasets/preparers/parsers/xfund_parser.py diff --git a/.codespellrc b/.codespellrc index d9a0a76c5..72be50e00 100644 --- a/.codespellrc +++ b/.codespellrc @@ -2,4 +2,4 @@ skip = *.ipynb count = quiet-level = 3 -ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned +ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned,ser diff --git a/dataset_zoo/xfund/metafile.yml b/dataset_zoo/xfund/metafile.yml deleted file mode 100644 index 993117468..000000000 --- a/dataset_zoo/xfund/metafile.yml +++ /dev/null @@ -1,25 +0,0 @@ -Name: 'XFUND' -Paper: - Title: 'FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents' - URL: https://arxiv.org/pdf/1905.13538.pdf - Venue: ICDAR - Year: '2019' - BibTeX: '@inproceedings{jaume2019, - title = {FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, - author = {Guillaume Jaume, Hazim Kemal Ekenel, Jean-Philippe Thiran}, - booktitle = {Accepted to ICDAR-OST}, - year = {2019}}' -Data: - Website: https://guillaumejaume.github.io/FUNSD/ - Language: - - English - Scene: - - Document - Granularity: - - Word - Tasks: - 
- ner - License: - Type: FUNSD License - Link: https://guillaumejaume.github.io/FUNSD/work/ - Format: .json diff --git a/dataset_zoo/xfund/ner.py b/dataset_zoo/xfund/ner.py deleted file mode 100644 index c9cf10526..000000000 --- a/dataset_zoo/xfund/ner.py +++ /dev/null @@ -1,74 +0,0 @@ -data_root = 'data/xfund' -cache_path = 'data/cache' -langs = ['zh'] - -train_preparer = dict( - obtainer=dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.train.zip', - save_name=f'{lang}_train.zip', - md5='a4ce16d1c1a8554a8b1e00907cff3b4b', - content=['image'], - mapping=[ - [ - f'{lang}_train/*.jpg', - f'ner_imgs/{lang}/train' - ] - ]) for lang in langs] + - [dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.train.json', - save_name=f'{lang}_train.json', - md5='af1afd5e935cccd3a105de6c12eb4c31', - content=['annotation'], - mapping=[ - [ - f'{lang}_train.json', - f'annotations/{lang}/ner_train.json' - ] - ]) for lang in langs] - ), - # gatherer=dict( - # type='PairGatherer', - # img_suffixes=['.png'], - # rule=[r'(\w+)\.png', r'\1.json']), - # parser=dict(type='FUNSDTextDetAnnParser'), - # packer=dict(type='TextDetPacker'), - # dumper=dict(type='JsonDumper'), -) - -test_preparer = dict( - obtainer=dict( - type='NaiveDataObtainer', - cache_path=cache_path, - files=[dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.val.zip', - save_name=f'{lang}_val.zip', - md5='f84c2651e350f5b394585207a43d06e4', - content=['image'], - mapping=[ - [ - f'{lang}_val/*.jpg', - f'ner_imgs/{lang}/test' - ] - ]) for lang in langs] + - [dict(url=f'https://github.com/doc-analysis/XFUND/releases/tag/v1.0/{lang}.val.json', - save_name=f'{lang}_val.json', - md5='c243c35d1685a16435c8b281a445005c', - content=['annotation'], - mapping=[ - [ - f'{lang}_val.json', - f'annotations/{lang}/ner_test.json' - ] - ]) for lang in langs] - ), - # gatherer=dict( - # type='PairGatherer', - # img_suffixes=['.png'], - # rule=[r'(\w+)\.png', r'\1.json']), - # parser=dict(type='FUNSDTextDetAnnParser'), - # packer=dict(type='TextDetPacker'), - # dumper=dict(type='JsonDumper'), -) -delete = ['annotations', 'funsd'] -config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/xfund/zh/metafile.yml b/dataset_zoo/xfund/zh/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/zh/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different 
modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/zh/sample_anno.md b/dataset_zoo/xfund/zh/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/zh/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/zh/ser.py b/dataset_zoo/xfund/zh/ser.py new file mode 100644 index 000000000..75ee91d2b --- /dev/null +++ b/dataset_zoo/xfund/zh/ser.py @@ -0,0 +1,67 @@ +lang = 'zh' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='a4ce16d1c1a8554a8b1e00907cff3b4b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='af1afd5e935cccd3a105de6c12eb4c31', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) + +# test_preparer = dict( +# obtainer=dict( +# type='NaiveDataObtainer', +# cache_path=cache_path, +# files=[ +# dict(url=f'https://github.com/doc-analysis/XFUND/releases/download/v1.0/{lang}.val.zip', +# save_name=f'{lang}_val.zip', +# md5='f84c2651e350f5b394585207a43d06e4', +# content=['image'], +# mapping=[ +# [ +# f'{lang}_val/*.jpg', +# f'ser_imgs/{lang}/test' +# ]]), +# dict(url=f'https://github.com/doc-analysis/XFUND/releases/download/v1.0/{lang}.val.json', +# save_name=f'{lang}_val.json', +# md5='c243c35d1685a16435c8b281a445005c', +# content=['annotation'], +# mapping=[ +# [ +# f'{lang}_val.json', +# f'annotations/{lang}/ser_test.json' +# ]])] +# ), +# gatherer=dict( +# type='PairGatherer', +# img_suffixes=['.png'], +# rule=[r'(\w+)\.png', r'\1.json']), +# parser=dict(type='FUNSDTextDetAnnParser'), +# packer=dict(type='TextDetPacker'), +# dumper=dict(type='JsonDumper'), +# ) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 58d6d9bd5..bef361238 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -11,11 +11,12 @@ from .synthtext_parser import SynthTextAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser +from .xfund_parser import XFUNDREAnnParser, XFUNDSERAnnParser __all__ = [ 'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser', - 'SynthTextAnnParser' + 'SynthTextAnnParser', 'XFUNDSERAnnParser', 'XFUNDREAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py new file mode 100644 index 000000000..329797d02 --- /dev/null +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +from typing import List + +from mmocr.registry import DATA_PARSERS +from .base import BaseParser + + +@DATA_PARSERS.register_module() +class XFUNDSERAnnParser(BaseParser): + """XFUND Semantic Entity Recognition Annotation Parser. See + dataset_zoo/xfund/xx/sample_anno.md for annotation example. + + Args: + nproc (int): The number of processes to parse the annotation. Defaults + to 1. + """ + + def parse_files(self, img_dir: str, ann_path: str) -> List: + """Parse annotations.""" + assert isinstance(ann_path, str) + instances = list() + for img_fname, width, height, instance in self.loader(ann_path): + instances.append( + dict( + img_path=osp.join(img_dir, img_fname), + width=width, + height=height, + instances=instance)) + return instances + + def loader(self, file_path: str): + with open(file_path, 'r') as f: + data = json.load(f) + for i in range(len(data['documents'])): + img_fname = data['documents'][i]['img']['fname'] + width = data['documents'][i]['img']['width'] + height = data['documents'][i]['img']['height'] + cur_doc_texts, cur_doc_bboxes = [], [] + cur_doc_labels, cur_doc_words = [], [] + for j in range(len(data['documents'][i]['document'])): + cur_item = data['documents'][i]['document'][j] + cur_doc_texts.append(cur_item['text']) + cur_doc_bboxes.append(cur_item['box']) + cur_doc_labels.append(cur_item['label']) + cur_doc_words.append(cur_item['words']) + instance = dict( + texts=cur_doc_texts, + bboxes=cur_doc_bboxes, + labels=cur_doc_labels, + words=cur_doc_words) + yield img_fname, width, height, instance + + +@DATA_PARSERS.register_module() +class XFUNDREAnnParser(BaseParser): + """XFUND Relation Extraction Annotation Parser. See + dataset_zoo/xfund/xx/sample_anno.md for annotation example. + + Args: + nproc (int): The number of processes to parse the annotation. Defaults + to 1. + """ + + # TODO: 完成RE parser + def __init__(self, split: str, nproc: int = 1) -> None: + super().__init__(split, nproc) diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py index ea5c20974..1d2e74c06 100644 --- a/tools/dataset_converters/prepare_dataset.py +++ b/tools/dataset_converters/prepare_dataset.py @@ -21,9 +21,9 @@ def parse_args(): parser.add_argument( '--task', default='textdet', - choices=['textdet', 'textrecog', 'textspotting', 'kie', 'ner'], + choices=['textdet', 'textrecog', 'textspotting', 'kie', 'ser', 're'], help='Task type. 
Options are "textdet", "textrecog", "textspotting"' - ' "kie" and "ner".') + ' "kie", "ser" and "re".') parser.add_argument( '--splits', default=['train', 'test', 'val'], From 35d0dd9fe846739b5bcab59abd1f6a3857e731da Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Sat, 25 Mar 2023 20:11:08 +0800 Subject: [PATCH 03/50] =?UTF-8?q?=E6=96=B0=E5=A2=9Exfund=20zh=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/ser/_base_/datasets/xfund_zh.py | 15 ++ dataset_zoo/xfund/zh/ser.py | 63 ++++---- .../preparers/config_generators/__init__.py | 4 +- .../config_generators/ser_config_generator.py | 98 ++++++++++++ mmocr/datasets/preparers/packers/__init__.py | 3 +- .../datasets/preparers/packers/ser_packer.py | 147 ++++++++++++++++++ .../preparers/parsers/xfund_parser.py | 59 +++---- 7 files changed, 323 insertions(+), 66 deletions(-) create mode 100644 configs/ser/_base_/datasets/xfund_zh.py create mode 100644 mmocr/datasets/preparers/config_generators/ser_config_generator.py create mode 100644 mmocr/datasets/preparers/packers/ser_packer.py diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..9130ce99b --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -0,0 +1,15 @@ +xfund_zh_ser_data_root = 'data/xfund/zh' + +xfund_zh_ser_train = dict( + type='SERDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_zh_ser_test = dict( + type='SERDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/zh/ser.py b/dataset_zoo/xfund/zh/ser.py index 75ee91d2b..ad167346c 100644 --- a/dataset_zoo/xfund/zh/ser.py +++ b/dataset_zoo/xfund/zh/ser.py @@ -6,7 +6,6 @@ obtainer=dict( type='NaiveDataObtainer', cache_path=cache_path, - data_root=data_root, files=[ dict( url='https://github.com/doc-analysis/XFUND/' @@ -26,42 +25,36 @@ gatherer=dict( type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), parser=dict(type='XFUNDSERAnnParser'), - packer=dict(type='TextDetPacker'), + packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) -# test_preparer = dict( -# obtainer=dict( -# type='NaiveDataObtainer', -# cache_path=cache_path, -# files=[ -# dict(url=f'https://github.com/doc-analysis/XFUND/releases/download/v1.0/{lang}.val.zip', -# save_name=f'{lang}_val.zip', -# md5='f84c2651e350f5b394585207a43d06e4', -# content=['image'], -# mapping=[ -# [ -# f'{lang}_val/*.jpg', -# f'ser_imgs/{lang}/test' -# ]]), -# dict(url=f'https://github.com/doc-analysis/XFUND/releases/download/v1.0/{lang}.val.json', -# save_name=f'{lang}_val.json', -# md5='c243c35d1685a16435c8b281a445005c', -# content=['annotation'], -# mapping=[ -# [ -# f'{lang}_val.json', -# f'annotations/{lang}/ser_test.json' -# ]])] -# ), -# gatherer=dict( -# type='PairGatherer', -# img_suffixes=['.png'], -# rule=[r'(\w+)\.png', r'\1.json']), -# parser=dict(type='FUNSDTextDetAnnParser'), -# packer=dict(type='TextDetPacker'), -# dumper=dict(type='JsonDumper'), -# ) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='f84c2651e350f5b394585207a43d06e4', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 
'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='c243c35d1685a16435c8b281a445005c', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='TextDetConfigGenerator') +config_generator = dict(type='SERConfigGenerator') diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index 8e884c6d9..c6f3253a4 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -3,8 +3,10 @@ from .textdet_config_generator import TextDetConfigGenerator from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator +from .ser_config_generator import SERConfigGenerator __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', - 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator' + 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator', + 'SERConfigGenerator' ] diff --git a/mmocr/datasets/preparers/config_generators/ser_config_generator.py b/mmocr/datasets/preparers/config_generators/ser_config_generator.py new file mode 100644 index 000000000..c93167869 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/ser_config_generator.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from mmocr.registry import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class SERConfigGenerator(BaseDatasetConfigGenerator): + """Text detection config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='ser_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='ser_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. 
+ """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( + data_root=data_root, + task='ser', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. + """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'SERDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] == 'train': + cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 + elif ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py index 78eb55dc4..90a98e8c3 100644 --- a/mmocr/datasets/preparers/packers/__init__.py +++ b/mmocr/datasets/preparers/packers/__init__.py @@ -4,8 +4,9 @@ from .textrecog_packer import TextRecogCropPacker, TextRecogPacker from .textspotting_packer import TextSpottingPacker from .wildreceipt_packer import WildReceiptPacker +from .ser_packer import SERPacker __all__ = [ 'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker', - 'TextSpottingPacker', 'WildReceiptPacker' + 'TextSpottingPacker', 'WildReceiptPacker', 'SERPacker' ] diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py new file mode 100644 index 000000000..23971b53a --- /dev/null +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict, List, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .base import BasePacker + + +@DATA_PACKERS.register_module() +class SERPacker(BasePacker): + """Semantic Entity Recognition packer. + It is used to pack the parsed annotation info to. + + .. 
code-block:: python + + { + "metainfo": + { + "dataset_type": "SERDataset", + "task_name": "ser", + "ser_labels": ['answer', 'header', 'other', 'question'], + "id2label": { + "0": "answer", + "1": "header", + "2": "other", + "3": "question" + }, + "label2id": { + "answer": 0, + "header": 1, + "other": 2, + "question": 3 + } + }, + "data_list": + [ + { + "img_path": "ser_imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "bboxes": [[906,195,1478,259], [357,325,467,357], ...], + "labels": ["header", "question", ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'texts' + - 'bboxes' + - 'labels' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + bboxes_per_doc = [] + labels_per_doc = [] + words_per_doc = [] + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + words = instance.get('words', None) + assert text or box or label + texts_per_doc.append(text) + bboxes_per_doc.append(box) + labels_per_doc.append(label) + words_per_doc.append(words) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + bboxes=bboxes_per_doc, + labels=labels_per_doc, + words=words_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + + return packed_instances + + def add_meta(self, sample: List) -> Dict: + """Add meta information to the sample. + + Args: + sample (List): A list of samples of the dataset. + + Returns: + Dict: A dict contains the meta information and samples. 
+ """ + + labels = [] + for s in sample: + labels += s['instances']['labels'] + label_list = list(set(labels)) + label_list.sort() + + meta = { + 'metainfo': { + 'dataset_type': 'SERDataset', + 'task_name': 'ser', + 'ser_labels': label_list, + 'id2label': {k: v for k, v in enumerate(label_list)}, + 'label2id': {v: k for k, v in enumerate(label_list)} + }, + 'data_list': sample + } + return meta diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py index 329797d02..e8a920e97 100644 --- a/mmocr/datasets/preparers/parsers/xfund_parser.py +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -20,41 +20,29 @@ class XFUNDSERAnnParser(BaseParser): def parse_files(self, img_dir: str, ann_path: str) -> List: """Parse annotations.""" assert isinstance(ann_path, str) - instances = list() - for img_fname, width, height, instance in self.loader(ann_path): - instances.append( - dict( - img_path=osp.join(img_dir, img_fname), - width=width, - height=height, - instances=instance)) - return instances - + samples = list() + for img_fname, instance in self.loader(ann_path): + samples.append((osp.join(img_dir, img_fname), instance)) + return samples + def loader(self, file_path: str): - with open(file_path, 'r') as f: + with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) for i in range(len(data['documents'])): img_fname = data['documents'][i]['img']['fname'] - width = data['documents'][i]['img']['width'] - height = data['documents'][i]['img']['height'] - cur_doc_texts, cur_doc_bboxes = [], [] - cur_doc_labels, cur_doc_words = [], [] + instances = list() for j in range(len(data['documents'][i]['document'])): cur_item = data['documents'][i]['document'][j] - cur_doc_texts.append(cur_item['text']) - cur_doc_bboxes.append(cur_item['box']) - cur_doc_labels.append(cur_item['label']) - cur_doc_words.append(cur_item['words']) - instance = dict( - texts=cur_doc_texts, - bboxes=cur_doc_bboxes, - labels=cur_doc_labels, - words=cur_doc_words) - yield img_fname, width, height, instance + instance = dict(text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words']) + instances.append(instance) + yield img_fname, instances @DATA_PARSERS.register_module() -class XFUNDREAnnParser(BaseParser): +class XFUNDREAnnParser(XFUNDSERAnnParser): """XFUND Relation Extraction Annotation Parser. See dataset_zoo/xfund/xx/sample_anno.md for annotation example. @@ -63,6 +51,19 @@ class XFUNDREAnnParser(BaseParser): to 1. 
""" - # TODO: 完成RE parser - def __init__(self, split: str, nproc: int = 1) -> None: - super().__init__(split, nproc) + def loader(self, file_path: str): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for i in range(len(data['documents'])): + img_fname = data['documents'][i]['img']['fname'] + instances = list() + for j in range(len(data['documents'][i]['document'])): + cur_item = data['documents'][i]['document'][j] + instance = dict(text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words'], + linking=cur_item['linking'], + id=cur_item['id']) + instances.append(instance) + yield img_fname, instances From 125cce212359daa2d1cb496dd938580f35b06a2f Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Sat, 25 Mar 2023 20:12:45 +0800 Subject: [PATCH 04/50] =?UTF-8?q?[Fix]=20=E8=A7=A3=E5=86=B3jsondumper?= =?UTF-8?q?=E7=94=9F=E6=88=90=E7=9A=84=E6=96=87=E4=BB=B6=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E6=98=BE=E7=A4=BA=E4=B8=AD=E6=96=87=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/preparers/dumpers/json_dumper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmocr/datasets/preparers/dumpers/json_dumper.py b/mmocr/datasets/preparers/dumpers/json_dumper.py index e1c8ab026..73d91e856 100644 --- a/mmocr/datasets/preparers/dumpers/json_dumper.py +++ b/mmocr/datasets/preparers/dumpers/json_dumper.py @@ -21,4 +21,4 @@ def dump(self, data: Dict) -> None: filename = f'{self.task}_{self.split}.json' dst_file = osp.join(self.data_root, filename) - mmengine.dump(data, dst_file) + mmengine.dump(data, dst_file, ensure_ascii=False) From f4f1dacbc20c27ed4fefd9bb16d0981f2322a1bd Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Sat, 25 Mar 2023 20:16:44 +0800 Subject: [PATCH 05/50] =?UTF-8?q?[Fix]=20=E8=A7=A3=E5=86=B3=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E6=8B=BC=E6=8E=A5=E5=BC=82=E5=B8=B8Bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/preparers/obtainers/naive_data_obtainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py index 51b0d266c..38af27327 100644 --- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py +++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py @@ -187,7 +187,7 @@ def move(self, mapping: List[Tuple[str, str]]) -> None: mkdir_or_exist(dst) for f in glob.glob(src): if not osp.exists( - osp.join(dst, osp.relpath(f, self.data_root))): + osp.join(dst, osp.basename(f))): shutil.move(f, dst) elif osp.exists(src) and not osp.exists(dst): From 5b203ad6a7179099df982684ee2f1f5228c51bce Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Sat, 25 Mar 2023 22:15:42 +0800 Subject: [PATCH 06/50] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=8F=A6=E5=A4=966?= =?UTF-8?q?=E4=B8=AA=E6=95=B0=E6=8D=AE=E9=9B=86config=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/ser/_base_/datasets/xfund_de.py | 15 ++++++ configs/ser/_base_/datasets/xfund_es.py | 15 ++++++ configs/ser/_base_/datasets/xfund_fr.py | 15 ++++++ configs/ser/_base_/datasets/xfund_it.py | 15 ++++++ configs/ser/_base_/datasets/xfund_ja.py | 15 ++++++ configs/ser/_base_/datasets/xfund_pt.py | 15 ++++++ dataset_zoo/xfund/de/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/de/sample_anno.md | 70 
+++++++++++++++++++++++++ dataset_zoo/xfund/de/ser.py | 60 +++++++++++++++++++++ dataset_zoo/xfund/es/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/es/sample_anno.md | 70 +++++++++++++++++++++++++ dataset_zoo/xfund/es/ser.py | 60 +++++++++++++++++++++ dataset_zoo/xfund/fr/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/fr/sample_anno.md | 70 +++++++++++++++++++++++++ dataset_zoo/xfund/fr/ser.py | 60 +++++++++++++++++++++ dataset_zoo/xfund/it/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/it/sample_anno.md | 70 +++++++++++++++++++++++++ dataset_zoo/xfund/it/ser.py | 60 +++++++++++++++++++++ dataset_zoo/xfund/ja/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/ja/sample_anno.md | 70 +++++++++++++++++++++++++ dataset_zoo/xfund/ja/ser.py | 60 +++++++++++++++++++++ dataset_zoo/xfund/pt/metafile.yml | 41 +++++++++++++++ dataset_zoo/xfund/pt/sample_anno.md | 70 +++++++++++++++++++++++++ dataset_zoo/xfund/pt/ser.py | 60 +++++++++++++++++++++ 24 files changed, 1116 insertions(+) create mode 100644 configs/ser/_base_/datasets/xfund_de.py create mode 100644 configs/ser/_base_/datasets/xfund_es.py create mode 100644 configs/ser/_base_/datasets/xfund_fr.py create mode 100644 configs/ser/_base_/datasets/xfund_it.py create mode 100644 configs/ser/_base_/datasets/xfund_ja.py create mode 100644 configs/ser/_base_/datasets/xfund_pt.py create mode 100644 dataset_zoo/xfund/de/metafile.yml create mode 100644 dataset_zoo/xfund/de/sample_anno.md create mode 100644 dataset_zoo/xfund/de/ser.py create mode 100644 dataset_zoo/xfund/es/metafile.yml create mode 100644 dataset_zoo/xfund/es/sample_anno.md create mode 100644 dataset_zoo/xfund/es/ser.py create mode 100644 dataset_zoo/xfund/fr/metafile.yml create mode 100644 dataset_zoo/xfund/fr/sample_anno.md create mode 100644 dataset_zoo/xfund/fr/ser.py create mode 100644 dataset_zoo/xfund/it/metafile.yml create mode 100644 dataset_zoo/xfund/it/sample_anno.md create mode 100644 dataset_zoo/xfund/it/ser.py create mode 100644 dataset_zoo/xfund/ja/metafile.yml create mode 100644 dataset_zoo/xfund/ja/sample_anno.md create mode 100644 dataset_zoo/xfund/ja/ser.py create mode 100644 dataset_zoo/xfund/pt/metafile.yml create mode 100644 dataset_zoo/xfund/pt/sample_anno.md create mode 100644 dataset_zoo/xfund/pt/ser.py diff --git a/configs/ser/_base_/datasets/xfund_de.py b/configs/ser/_base_/datasets/xfund_de.py new file mode 100644 index 000000000..61f14ecff --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_de.py @@ -0,0 +1,15 @@ +xfund_de_ser_data_root = 'data/xfund/de' + +xfund_de_ser_train = dict( + type='SERDataset', + data_root=xfund_de_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_de_ser_test = dict( + type='SERDataset', + data_root=xfund_de_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_es.py b/configs/ser/_base_/datasets/xfund_es.py new file mode 100644 index 000000000..25ac5b5fb --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_es.py @@ -0,0 +1,15 @@ +xfund_es_ser_data_root = 'data/xfund/es' + +xfund_es_ser_train = dict( + type='SERDataset', + data_root=xfund_es_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_es_ser_test = dict( + type='SERDataset', + data_root=xfund_es_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_fr.py 
b/configs/ser/_base_/datasets/xfund_fr.py new file mode 100644 index 000000000..039b1124d --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_fr.py @@ -0,0 +1,15 @@ +xfund_fr_ser_data_root = 'data/xfund/fr' + +xfund_fr_ser_train = dict( + type='SERDataset', + data_root=xfund_fr_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_fr_ser_test = dict( + type='SERDataset', + data_root=xfund_fr_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_it.py b/configs/ser/_base_/datasets/xfund_it.py new file mode 100644 index 000000000..eca998816 --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_it.py @@ -0,0 +1,15 @@ +xfund_it_ser_data_root = 'data/xfund/it' + +xfund_it_ser_train = dict( + type='SERDataset', + data_root=xfund_it_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_it_ser_test = dict( + type='SERDataset', + data_root=xfund_it_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_ja.py b/configs/ser/_base_/datasets/xfund_ja.py new file mode 100644 index 000000000..43fa5a514 --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_ja.py @@ -0,0 +1,15 @@ +xfund_ja_ser_data_root = 'data/xfund/ja' + +xfund_ja_ser_train = dict( + type='SERDataset', + data_root=xfund_ja_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_ja_ser_test = dict( + type='SERDataset', + data_root=xfund_ja_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_pt.py b/configs/ser/_base_/datasets/xfund_pt.py new file mode 100644 index 000000000..99804d8a6 --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_pt.py @@ -0,0 +1,15 @@ +xfund_pt_ser_data_root = 'data/xfund/pt' + +xfund_pt_ser_train = dict( + type='SERDataset', + data_root=xfund_pt_ser_data_root, + ann_file='ser_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_pt_ser_test = dict( + type='SERDataset', + data_root=xfund_pt_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/de/metafile.yml b/dataset_zoo/xfund/de/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/de/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for 
joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/de/sample_anno.md b/dataset_zoo/xfund/de/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/de/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/de/ser.py b/dataset_zoo/xfund/de/ser.py new file mode 100644 index 000000000..344e85158 --- /dev/null +++ b/dataset_zoo/xfund/de/ser.py @@ -0,0 +1,60 @@ +lang = 'de' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='8c9f949952d227290e22f736cdbe4d29', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3e4b95c7da893bf5a91018445c83ccdd', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='d13d12278d585214183c3cfb949b0e59', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='8eaf742f2d19b17f5c0e72da5c7761ef', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') diff --git a/dataset_zoo/xfund/es/metafile.yml b/dataset_zoo/xfund/es/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/es/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. 
In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/es/sample_anno.md b/dataset_zoo/xfund/es/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/es/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/es/ser.py b/dataset_zoo/xfund/es/ser.py new file mode 100644 index 000000000..9ee6caec5 --- /dev/null +++ b/dataset_zoo/xfund/es/ser.py @@ -0,0 +1,60 @@ +lang = 'es' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='0ff89032bc6cb2e7ccba062c71944d03', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='b40b43f276c7deaaaa5923d035da2820', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='efad9fb11ee3036bef003b6364a79ac0', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='96ffc2057049ba2826a005825b3e7f0d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') diff --git a/dataset_zoo/xfund/fr/metafile.yml b/dataset_zoo/xfund/fr/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/fr/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. 
In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/fr/sample_anno.md b/dataset_zoo/xfund/fr/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/fr/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/fr/ser.py b/dataset_zoo/xfund/fr/ser.py new file mode 100644 index 000000000..a6e6af790 --- /dev/null +++ b/dataset_zoo/xfund/fr/ser.py @@ -0,0 +1,60 @@ +lang = 'fr' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='d821ca50f37cc39ff1715632f4068ea1', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='349e7f824225bc7cc53f0c0eb8c87d3e', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='9ccbf15816ca05e50229885b75e57e49', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='15d8a52a4eb20ea029a4aa3eaa25ef8d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') diff --git a/dataset_zoo/xfund/it/metafile.yml b/dataset_zoo/xfund/it/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/it/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. 
In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/it/sample_anno.md b/dataset_zoo/xfund/it/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/it/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/it/ser.py b/dataset_zoo/xfund/it/ser.py new file mode 100644 index 000000000..17b279b9c --- /dev/null +++ b/dataset_zoo/xfund/it/ser.py @@ -0,0 +1,60 @@ +lang = 'it' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='c531e39f0cbc1dc74caa320ffafe5de9', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='fa6afe204a6af57152627e76fe2de005', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='35446a115561d0773b7f2a0c2f32fe5c', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='260d4ea447636cbca1ce1ca5fc5846d9', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') diff --git a/dataset_zoo/xfund/ja/metafile.yml b/dataset_zoo/xfund/ja/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/ja/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. 
In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/ja/sample_anno.md b/dataset_zoo/xfund/ja/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/ja/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/ja/ser.py b/dataset_zoo/xfund/ja/ser.py new file mode 100644 index 000000000..f43a41c8a --- /dev/null +++ b/dataset_zoo/xfund/ja/ser.py @@ -0,0 +1,60 @@ +lang = 'ja' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='50c22c6774706494080a73f8eabcf45d', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='46cd53deab3b8fbd69278da56d1778c4', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='93a22fea044894264bfa3c9f9c84dd37', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='f576b6dc6c08fd98cf877fb04bc4c8c3', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') diff --git a/dataset_zoo/xfund/pt/metafile.yml b/dataset_zoo/xfund/pt/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/pt/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. 
In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/pt/sample_anno.md b/dataset_zoo/xfund/pt/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/pt/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... 
+ ] +} +``` diff --git a/dataset_zoo/xfund/pt/ser.py b/dataset_zoo/xfund/pt/ser.py new file mode 100644 index 000000000..39c57fc92 --- /dev/null +++ b/dataset_zoo/xfund/pt/ser.py @@ -0,0 +1,60 @@ +lang = 'pt' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='783ba0aba419235bc81cf547e7c5011b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3fe0fb93e631fcbc391216d2d7b0510d', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='5f0189d29c5a0e6764757457f54ba14f', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='82a93addffdd7ac7fd978972adf1a348', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), + parser=dict(type='XFUNDSERAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='SERConfigGenerator') From 201692166dc09be533f33b61289309130b5031ea Mon Sep 17 00:00:00 2001 From: KevinNuNu Date: Sat, 25 Mar 2023 23:33:56 +0800 Subject: [PATCH 07/50] =?UTF-8?q?=E5=A2=9E=E5=8A=A0xfund=20RE=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/re/_base_/datasets/xfund_de.py | 15 ++ configs/re/_base_/datasets/xfund_es.py | 15 ++ configs/re/_base_/datasets/xfund_fr.py | 15 ++ configs/re/_base_/datasets/xfund_it.py | 15 ++ configs/re/_base_/datasets/xfund_ja.py | 15 ++ configs/re/_base_/datasets/xfund_pt.py | 15 ++ configs/re/_base_/datasets/xfund_zh.py | 15 ++ dataset_zoo/xfund/de/re.py | 8 + dataset_zoo/xfund/es/re.py | 8 + dataset_zoo/xfund/fr/re.py | 8 + dataset_zoo/xfund/it/re.py | 8 + dataset_zoo/xfund/ja/re.py | 8 + dataset_zoo/xfund/pt/re.py | 8 + dataset_zoo/xfund/zh/re.py | 8 + .../preparers/config_generators/__init__.py | 3 +- .../config_generators/re_config_generator.py | 98 +++++++++++ mmocr/datasets/preparers/packers/__init__.py | 3 +- mmocr/datasets/preparers/packers/re_packer.py | 159 ++++++++++++++++++ .../datasets/preparers/packers/ser_packer.py | 6 +- 19 files changed, 425 insertions(+), 5 deletions(-) create mode 100644 configs/re/_base_/datasets/xfund_de.py create mode 100644 configs/re/_base_/datasets/xfund_es.py create mode 100644 configs/re/_base_/datasets/xfund_fr.py create mode 100644 configs/re/_base_/datasets/xfund_it.py create mode 100644 configs/re/_base_/datasets/xfund_ja.py 
create mode 100644 configs/re/_base_/datasets/xfund_pt.py create mode 100644 configs/re/_base_/datasets/xfund_zh.py create mode 100644 dataset_zoo/xfund/de/re.py create mode 100644 dataset_zoo/xfund/es/re.py create mode 100644 dataset_zoo/xfund/fr/re.py create mode 100644 dataset_zoo/xfund/it/re.py create mode 100644 dataset_zoo/xfund/ja/re.py create mode 100644 dataset_zoo/xfund/pt/re.py create mode 100644 dataset_zoo/xfund/zh/re.py create mode 100644 mmocr/datasets/preparers/config_generators/re_config_generator.py create mode 100644 mmocr/datasets/preparers/packers/re_packer.py diff --git a/configs/re/_base_/datasets/xfund_de.py b/configs/re/_base_/datasets/xfund_de.py new file mode 100644 index 000000000..c86de998e --- /dev/null +++ b/configs/re/_base_/datasets/xfund_de.py @@ -0,0 +1,15 @@ +xfund_de_re_data_root = 'data/xfund/de' + +xfund_de_re_train = dict( + type='REDataset', + data_root=xfund_de_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_de_re_test = dict( + type='REDataset', + data_root=xfund_de_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_es.py b/configs/re/_base_/datasets/xfund_es.py new file mode 100644 index 000000000..24d9400d2 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_es.py @@ -0,0 +1,15 @@ +xfund_es_re_data_root = 'data/xfund/es' + +xfund_es_re_train = dict( + type='REDataset', + data_root=xfund_es_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_es_re_test = dict( + type='REDataset', + data_root=xfund_es_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_fr.py b/configs/re/_base_/datasets/xfund_fr.py new file mode 100644 index 000000000..771cc8bb9 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_fr.py @@ -0,0 +1,15 @@ +xfund_fr_re_data_root = 'data/xfund/fr' + +xfund_fr_re_train = dict( + type='REDataset', + data_root=xfund_fr_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_fr_re_test = dict( + type='REDataset', + data_root=xfund_fr_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_it.py b/configs/re/_base_/datasets/xfund_it.py new file mode 100644 index 000000000..e8dcbdf39 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_it.py @@ -0,0 +1,15 @@ +xfund_it_re_data_root = 'data/xfund/it' + +xfund_it_re_train = dict( + type='REDataset', + data_root=xfund_it_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_it_re_test = dict( + type='REDataset', + data_root=xfund_it_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_ja.py b/configs/re/_base_/datasets/xfund_ja.py new file mode 100644 index 000000000..e6057c3a9 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_ja.py @@ -0,0 +1,15 @@ +xfund_ja_re_data_root = 'data/xfund/ja' + +xfund_ja_re_train = dict( + type='REDataset', + data_root=xfund_ja_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_ja_re_test = dict( + type='REDataset', + data_root=xfund_ja_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git 
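These per-language files only declare the dataset dicts; a downstream RE model config would typically pull them in through `_base_` inheritance. A hypothetical sketch (not part of this patch) following the usual MMOCR config convention, with placeholder pipelines and dataloader settings:

```python
# Hypothetical downstream config reusing the base dataset variables above.
# Only the variable names (xfund_zh_re_train / xfund_zh_re_test) come from
# this patch; pipelines and dataloader settings are placeholders.
_base_ = ['../_base_/datasets/xfund_zh.py']

train_pipeline = []  # placeholder: loading / packing transforms go here
test_pipeline = []   # placeholder

xfund_zh_re_train = _base_.xfund_zh_re_train
xfund_zh_re_train.pipeline = train_pipeline
xfund_zh_re_test = _base_.xfund_zh_re_test
xfund_zh_re_test.pipeline = test_pipeline

train_dataloader = dict(
    batch_size=2,
    num_workers=4,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=xfund_zh_re_train)

test_dataloader = dict(
    batch_size=1,
    num_workers=4,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=xfund_zh_re_test)
```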
a/configs/re/_base_/datasets/xfund_pt.py b/configs/re/_base_/datasets/xfund_pt.py new file mode 100644 index 000000000..8bcbc59a9 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_pt.py @@ -0,0 +1,15 @@ +xfund_pt_re_data_root = 'data/xfund/pt' + +xfund_pt_re_train = dict( + type='REDataset', + data_root=xfund_pt_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_pt_re_test = dict( + type='REDataset', + data_root=xfund_pt_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..68117cf44 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -0,0 +1,15 @@ +xfund_zh_re_data_root = 'data/xfund/zh' + +xfund_zh_re_train = dict( + type='REDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_train.json', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +xfund_zh_re_test = dict( + type='REDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/de/re.py b/dataset_zoo/xfund/de/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/de/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/es/re.py b/dataset_zoo/xfund/es/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/es/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/fr/re.py b/dataset_zoo/xfund/fr/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/fr/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/it/re.py b/dataset_zoo/xfund/it/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/it/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/ja/re.py b/dataset_zoo/xfund/ja/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/ja/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/pt/re.py b/dataset_zoo/xfund/pt/re.py new file mode 
100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/pt/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/zh/re.py b/dataset_zoo/xfund/zh/re.py new file mode 100644 index 000000000..fdd3b4765 --- /dev/null +++ b/dataset_zoo/xfund/zh/re.py @@ -0,0 +1,8 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.parser.type = 'XFUNDREAnnParser' +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.parser.type = 'XFUNDREAnnParser' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='REConfigGenerator') diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index c6f3253a4..4976b92fd 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -4,9 +4,10 @@ from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator from .ser_config_generator import SERConfigGenerator +from .re_config_generator import REConfigGenerator __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator', - 'SERConfigGenerator' + 'SERConfigGenerator', 'REConfigGenerator' ] diff --git a/mmocr/datasets/preparers/config_generators/re_config_generator.py b/mmocr/datasets/preparers/config_generators/re_config_generator.py new file mode 100644 index 000000000..35f2b6589 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/re_config_generator.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from mmocr.registry import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class REConfigGenerator(BaseDatasetConfigGenerator): + """Text detection config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='re_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='re_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. 
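+
+    Example:
+        The generated dataset variables follow
+        ``{dataset_name}_{dataset_postfix}_{task}_{split}`` (empty parts are
+        skipped), e.g. for ``dataset_name='xfund/zh'`` the train annotation
+        above becomes ``xfund_zh_re_train``:
+
+        >>> '_'.join(p for p in ['xfund_zh', '', 're', 'train'] if p)
+        'xfund_zh_re_train'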
+ """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='re_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='re_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( + data_root=data_root, + task='re', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. + """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'REDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] == 'train': + cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 + elif ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py index 90a98e8c3..79c137995 100644 --- a/mmocr/datasets/preparers/packers/__init__.py +++ b/mmocr/datasets/preparers/packers/__init__.py @@ -5,8 +5,9 @@ from .textspotting_packer import TextSpottingPacker from .wildreceipt_packer import WildReceiptPacker from .ser_packer import SERPacker +from .re_packer import REPacker __all__ = [ 'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker', - 'TextSpottingPacker', 'WildReceiptPacker', 'SERPacker' + 'TextSpottingPacker', 'WildReceiptPacker', 'SERPacker', 'REPacker' ] diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py new file mode 100644 index 000000000..57880adc2 --- /dev/null +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict, List, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .base import BasePacker + + +@DATA_PACKERS.register_module() +class REPacker(BasePacker): + """Relation Extraction packer. + It is used to pack the parsed annotation info to. + + .. 
code-block:: python + + { + "metainfo": + { + "dataset_type": "REDataset", + "task_name": "re", + "re_labels": ['answer', 'header', 'other', 'question'], + "id2label": { + "0": "answer", + "1": "header", + "2": "other", + "3": "question" + }, + "label2id": { + "answer": 0, + "header": 1, + "other": 2, + "question": 3 + } + }, + "data_list": + [ + { + "img_path": "ser_imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "bboxes": [[906,195,1478,259], [357,325,467,357], ...], + "labels": ["header", "question", ...], + "linkings": [[0, 1], [2, 3], ...], + "ids": [0, 1, ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'text' + - 'box' + - 'label' + - 'linking' + - 'id' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + bboxes_per_doc = [] + labels_per_doc = [] + words_per_doc = [] + linking_per_doc = [] + id_per_doc = [] + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + linking = instance.get('linking', None) + id = instance.get('id', None) + words = instance.get('words', None) + assert text or box or label + texts_per_doc.append(text) + bboxes_per_doc.append(box) + labels_per_doc.append(label) + words_per_doc.append(words) + linking_per_doc.append(linking) + id_per_doc.append(id) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + bboxes=bboxes_per_doc, + labels=labels_per_doc, + linkings=linking_per_doc, + ids=id_per_doc, + words=words_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + + return packed_instances + + def add_meta(self, sample: List) -> Dict: + """Add meta information to the sample. + + Args: + sample (List): A list of samples of the dataset. + + Returns: + Dict: A dict contains the meta information and samples. + """ + + labels = [] + for s in sample: + labels += s['instances']['labels'] + label_list = list(set(labels)) + label_list.sort() + + meta = { + 'metainfo': { + 'dataset_type': 'REDataset', + 'task_name': 're', + 're_labels': label_list, + 'id2label': {k: v for k, v in enumerate(label_list)}, + 'label2id': {v: k for k, v in enumerate(label_list)} + }, + 'data_list': sample + } + return meta diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 23971b53a..3ec4c2a96 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -78,9 +78,9 @@ def pack_instance(self, sample: Tuple) -> Dict: - instances (Sequence[Dict]): A list of converted annos. 
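The `add_meta` step shown above derives the label vocabulary from whatever labels occur in the packed samples. A minimal stand-alone reproduction of that mapping logic, using an illustrative XFUND-style label sequence as input:

```python
# Sketch of the id2label / label2id construction done in REPacker.add_meta
# (and SERPacker.add_meta); the input labels are only an illustrative sample.
labels = ['header', 'question', 'answer', 'question', 'answer', 'other']
label_list = sorted(set(labels))
id2label = {k: v for k, v in enumerate(label_list)}
label2id = {v: k for k, v in enumerate(label_list)}
print(label_list)  # ['answer', 'header', 'other', 'question']
print(id2label)    # {0: 'answer', 1: 'header', 2: 'other', 3: 'question'}
print(label2id)    # {'answer': 0, 'header': 1, 'other': 2, 'question': 3}
```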
Each element should be a dict with the following keys: - - 'texts' - - 'bboxes' - - 'labels' + - 'text' + - 'box' + - 'label' - 'words' (optional) Returns: From 717ac03e57c6a38b082608abda5f6694c05569be Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sat, 25 Mar 2023 23:48:33 +0800 Subject: [PATCH 08/50] pre-commit fix --- .../preparers/config_generators/__init__.py | 4 ++-- .../obtainers/naive_data_obtainer.py | 3 +-- mmocr/datasets/preparers/packers/__init__.py | 4 ++-- mmocr/datasets/preparers/packers/re_packer.py | 13 ++++++---- .../datasets/preparers/packers/ser_packer.py | 13 ++++++---- .../preparers/parsers/xfund_parser.py | 24 ++++++++++--------- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index 4976b92fd..1a6221256 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BaseDatasetConfigGenerator +from .re_config_generator import REConfigGenerator +from .ser_config_generator import SERConfigGenerator from .textdet_config_generator import TextDetConfigGenerator from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator -from .ser_config_generator import SERConfigGenerator -from .re_config_generator import REConfigGenerator __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py index 38af27327..c743a4859 100644 --- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py +++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py @@ -186,8 +186,7 @@ def move(self, mapping: List[Tuple[str, str]]) -> None: if '*' in src: mkdir_or_exist(dst) for f in glob.glob(src): - if not osp.exists( - osp.join(dst, osp.basename(f))): + if not osp.exists(osp.join(dst, osp.basename(f))): shutil.move(f, dst) elif osp.exists(src) and not osp.exists(dst): diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py index 79c137995..a271a3ce5 100644 --- a/mmocr/datasets/preparers/packers/__init__.py +++ b/mmocr/datasets/preparers/packers/__init__.py @@ -1,11 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BasePacker +from .re_packer import REPacker +from .ser_packer import SERPacker from .textdet_packer import TextDetPacker from .textrecog_packer import TextRecogCropPacker, TextRecogPacker from .textspotting_packer import TextSpottingPacker from .wildreceipt_packer import WildReceiptPacker -from .ser_packer import SERPacker -from .re_packer import REPacker __all__ = [ 'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker', diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index 57880adc2..e627ab10c 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -10,8 +10,8 @@ @DATA_PACKERS.register_module() class REPacker(BasePacker): - """Relation Extraction packer. - It is used to pack the parsed annotation info to. + """Relation Extraction packer. It is used to pack the parsed annotation + info to. .. 
code-block:: python @@ -43,7 +43,8 @@ class REPacker(BasePacker): "instances": { "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], - "bboxes": [[906,195,1478,259], [357,325,467,357], ...], + "bboxes": [[906,195,1478,259], + [357,325,467,357], ...], "labels": ["header", "question", ...], "linkings": [[0, 1], [2, 3], ...], "ids": [0, 1, ...], @@ -151,8 +152,10 @@ def add_meta(self, sample: List) -> Dict: 'dataset_type': 'REDataset', 'task_name': 're', 're_labels': label_list, - 'id2label': {k: v for k, v in enumerate(label_list)}, - 'label2id': {v: k for k, v in enumerate(label_list)} + 'id2label': {k: v + for k, v in enumerate(label_list)}, + 'label2id': {v: k + for k, v in enumerate(label_list)} }, 'data_list': sample } diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 3ec4c2a96..2fe0805e2 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -10,8 +10,8 @@ @DATA_PACKERS.register_module() class SERPacker(BasePacker): - """Semantic Entity Recognition packer. - It is used to pack the parsed annotation info to. + """Semantic Entity Recognition packer. It is used to pack the parsed + annotation info to. .. code-block:: python @@ -43,7 +43,8 @@ class SERPacker(BasePacker): "instances": { "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], - "bboxes": [[906,195,1478,259], [357,325,467,357], ...], + "bboxes": [[906,195,1478,259], + [357,325,467,357], ...], "labels": ["header", "question", ...], "words": [[{ "box": [ @@ -139,8 +140,10 @@ def add_meta(self, sample: List) -> Dict: 'dataset_type': 'SERDataset', 'task_name': 'ser', 'ser_labels': label_list, - 'id2label': {k: v for k, v in enumerate(label_list)}, - 'label2id': {v: k for k, v in enumerate(label_list)} + 'id2label': {k: v + for k, v in enumerate(label_list)}, + 'label2id': {v: k + for k, v in enumerate(label_list)} }, 'data_list': sample } diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py index e8a920e97..6465418f6 100644 --- a/mmocr/datasets/preparers/parsers/xfund_parser.py +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -24,7 +24,7 @@ def parse_files(self, img_dir: str, ann_path: str) -> List: for img_fname, instance in self.loader(ann_path): samples.append((osp.join(img_dir, img_fname), instance)) return samples - + def loader(self, file_path: str): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) @@ -33,10 +33,11 @@ def loader(self, file_path: str): instances = list() for j in range(len(data['documents'][i]['document'])): cur_item = data['documents'][i]['document'][j] - instance = dict(text=cur_item['text'], - box=cur_item['box'], - label=cur_item['label'], - words=cur_item['words']) + instance = dict( + text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words']) instances.append(instance) yield img_fname, instances @@ -59,11 +60,12 @@ def loader(self, file_path: str): instances = list() for j in range(len(data['documents'][i]['document'])): cur_item = data['documents'][i]['document'][j] - instance = dict(text=cur_item['text'], - box=cur_item['box'], - label=cur_item['label'], - words=cur_item['words'], - linking=cur_item['linking'], - id=cur_item['id']) + instance = dict( + text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words'], + linking=cur_item['linking'], + id=cur_item['id']) instances.append(instance) yield img_fname, instances From 
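Outside of the preparer machinery, the loader above amounts to a plain traversal of the XFUND json. A self-contained sketch of the same idea, with the annotation path assumed to follow the `zh` preparer mapping (`annotations/train.json` under `data/xfund/zh`):

```python
# Rough stand-alone equivalent of the XFUND loader shown above, assuming an
# annotation file in the original XFUND layout
# (documents -> document -> text / box / label / words, plus img.fname).
import json


def load_xfund(ann_path):
    with open(ann_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for doc in data['documents']:
        instances = [
            dict(
                text=item['text'],
                box=item['box'],
                label=item['label'],
                words=item['words']) for item in doc['document']
        ]
        yield doc['img']['fname'], instances


if __name__ == '__main__':
    # path assumed from the zh preparer config; adjust to the actual layout
    for img_fname, instances in load_xfund('data/xfund/zh/annotations/train.json'):
        print(img_fname, len(instances))
```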
078cc83fd98f02963b28adef88b986bab26f4672 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 27 Mar 2023 13:01:22 +0800 Subject: [PATCH 09/50] =?UTF-8?q?[Fix]=20=E7=AE=80=E5=8C=96XFUND=20parser?= =?UTF-8?q?=EF=BC=8C=E4=BC=98=E5=8C=96=E6=9C=80=E7=BB=88=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E7=9A=84=E7=9B=AE=E5=BD=95=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/re/_base_/datasets/xfund_de.py | 15 --------- configs/re/_base_/datasets/xfund_es.py | 15 --------- configs/re/_base_/datasets/xfund_fr.py | 15 --------- configs/re/_base_/datasets/xfund_it.py | 15 --------- configs/re/_base_/datasets/xfund_ja.py | 15 --------- configs/re/_base_/datasets/xfund_pt.py | 15 --------- configs/re/_base_/datasets/xfund_zh.py | 1 - .../_base_/datasets/xfund_zh_huggingface.py | 14 ++++++++ configs/ser/_base_/datasets/xfund_de.py | 15 --------- configs/ser/_base_/datasets/xfund_es.py | 15 --------- configs/ser/_base_/datasets/xfund_fr.py | 15 --------- configs/ser/_base_/datasets/xfund_it.py | 15 --------- configs/ser/_base_/datasets/xfund_ja.py | 15 --------- configs/ser/_base_/datasets/xfund_pt.py | 15 --------- configs/ser/_base_/datasets/xfund_zh.py | 1 - .../_base_/datasets/xfund_zh_huggingface.py | 14 ++++++++ dataset_zoo/xfund/de/re.py | 2 -- dataset_zoo/xfund/de/ser.py | 12 +++---- dataset_zoo/xfund/es/re.py | 2 -- dataset_zoo/xfund/es/ser.py | 12 +++---- dataset_zoo/xfund/fr/re.py | 2 -- dataset_zoo/xfund/fr/ser.py | 12 +++---- dataset_zoo/xfund/it/re.py | 2 -- dataset_zoo/xfund/it/ser.py | 12 +++---- dataset_zoo/xfund/ja/re.py | 2 -- dataset_zoo/xfund/ja/ser.py | 12 +++---- dataset_zoo/xfund/pt/re.py | 2 -- dataset_zoo/xfund/pt/ser.py | 12 +++---- dataset_zoo/xfund/zh/re.py | 2 -- dataset_zoo/xfund/zh/ser.py | 12 +++---- mmocr/datasets/preparers/packers/re_packer.py | 8 ++--- .../datasets/preparers/packers/ser_packer.py | 4 +-- mmocr/datasets/preparers/parsers/__init__.py | 4 +-- .../preparers/parsers/xfund_parser.py | 33 ++----------------- 34 files changed, 81 insertions(+), 276 deletions(-) delete mode 100644 configs/re/_base_/datasets/xfund_de.py delete mode 100644 configs/re/_base_/datasets/xfund_es.py delete mode 100644 configs/re/_base_/datasets/xfund_fr.py delete mode 100644 configs/re/_base_/datasets/xfund_it.py delete mode 100644 configs/re/_base_/datasets/xfund_ja.py delete mode 100644 configs/re/_base_/datasets/xfund_pt.py create mode 100644 configs/re/_base_/datasets/xfund_zh_huggingface.py delete mode 100644 configs/ser/_base_/datasets/xfund_de.py delete mode 100644 configs/ser/_base_/datasets/xfund_es.py delete mode 100644 configs/ser/_base_/datasets/xfund_fr.py delete mode 100644 configs/ser/_base_/datasets/xfund_it.py delete mode 100644 configs/ser/_base_/datasets/xfund_ja.py delete mode 100644 configs/ser/_base_/datasets/xfund_pt.py create mode 100644 configs/ser/_base_/datasets/xfund_zh_huggingface.py diff --git a/configs/re/_base_/datasets/xfund_de.py b/configs/re/_base_/datasets/xfund_de.py deleted file mode 100644 index c86de998e..000000000 --- a/configs/re/_base_/datasets/xfund_de.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_de_re_data_root = 'data/xfund/de' - -xfund_de_re_train = dict( - type='REDataset', - data_root=xfund_de_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_de_re_test = dict( - type='REDataset', - data_root=xfund_de_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git 
a/configs/re/_base_/datasets/xfund_es.py b/configs/re/_base_/datasets/xfund_es.py deleted file mode 100644 index 24d9400d2..000000000 --- a/configs/re/_base_/datasets/xfund_es.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_es_re_data_root = 'data/xfund/es' - -xfund_es_re_train = dict( - type='REDataset', - data_root=xfund_es_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_es_re_test = dict( - type='REDataset', - data_root=xfund_es_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_fr.py b/configs/re/_base_/datasets/xfund_fr.py deleted file mode 100644 index 771cc8bb9..000000000 --- a/configs/re/_base_/datasets/xfund_fr.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_fr_re_data_root = 'data/xfund/fr' - -xfund_fr_re_train = dict( - type='REDataset', - data_root=xfund_fr_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_fr_re_test = dict( - type='REDataset', - data_root=xfund_fr_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_it.py b/configs/re/_base_/datasets/xfund_it.py deleted file mode 100644 index e8dcbdf39..000000000 --- a/configs/re/_base_/datasets/xfund_it.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_it_re_data_root = 'data/xfund/it' - -xfund_it_re_train = dict( - type='REDataset', - data_root=xfund_it_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_it_re_test = dict( - type='REDataset', - data_root=xfund_it_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_ja.py b/configs/re/_base_/datasets/xfund_ja.py deleted file mode 100644 index e6057c3a9..000000000 --- a/configs/re/_base_/datasets/xfund_ja.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_ja_re_data_root = 'data/xfund/ja' - -xfund_ja_re_train = dict( - type='REDataset', - data_root=xfund_ja_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_ja_re_test = dict( - type='REDataset', - data_root=xfund_ja_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_pt.py b/configs/re/_base_/datasets/xfund_pt.py deleted file mode 100644 index 8bcbc59a9..000000000 --- a/configs/re/_base_/datasets/xfund_pt.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_pt_re_data_root = 'data/xfund/pt' - -xfund_pt_re_train = dict( - type='REDataset', - data_root=xfund_pt_re_data_root, - ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_pt_re_test = dict( - type='REDataset', - data_root=xfund_pt_re_data_root, - ann_file='re_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py index 68117cf44..5ea9c9d33 100644 --- a/configs/re/_base_/datasets/xfund_zh.py +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -4,7 +4,6 @@ type='REDataset', data_root=xfund_zh_re_data_root, ann_file='re_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), pipeline=None) xfund_zh_re_test = dict( diff --git a/configs/re/_base_/datasets/xfund_zh_huggingface.py b/configs/re/_base_/datasets/xfund_zh_huggingface.py new file mode 100644 index 000000000..887c28ad6 --- /dev/null +++ 
b/configs/re/_base_/datasets/xfund_zh_huggingface.py @@ -0,0 +1,14 @@ +xfund_zh_huggingface_re_data_root = 'data/xfund/zh' + +xfund_zh_huggingface_re_train = dict( + type='REHuggingfaceDataset', + data_root=xfund_zh_huggingface_re_data_root, + ann_file='re_train.huggingface', + pipeline=None) + +xfund_zh_huggingface_re_test = dict( + type='REHuggingfaceDataset', + data_root=xfund_zh_huggingface_re_data_root, + ann_file='re_test.huggingface', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_de.py b/configs/ser/_base_/datasets/xfund_de.py deleted file mode 100644 index 61f14ecff..000000000 --- a/configs/ser/_base_/datasets/xfund_de.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_de_ser_data_root = 'data/xfund/de' - -xfund_de_ser_train = dict( - type='SERDataset', - data_root=xfund_de_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_de_ser_test = dict( - type='SERDataset', - data_root=xfund_de_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_es.py b/configs/ser/_base_/datasets/xfund_es.py deleted file mode 100644 index 25ac5b5fb..000000000 --- a/configs/ser/_base_/datasets/xfund_es.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_es_ser_data_root = 'data/xfund/es' - -xfund_es_ser_train = dict( - type='SERDataset', - data_root=xfund_es_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_es_ser_test = dict( - type='SERDataset', - data_root=xfund_es_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_fr.py b/configs/ser/_base_/datasets/xfund_fr.py deleted file mode 100644 index 039b1124d..000000000 --- a/configs/ser/_base_/datasets/xfund_fr.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_fr_ser_data_root = 'data/xfund/fr' - -xfund_fr_ser_train = dict( - type='SERDataset', - data_root=xfund_fr_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_fr_ser_test = dict( - type='SERDataset', - data_root=xfund_fr_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_it.py b/configs/ser/_base_/datasets/xfund_it.py deleted file mode 100644 index eca998816..000000000 --- a/configs/ser/_base_/datasets/xfund_it.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_it_ser_data_root = 'data/xfund/it' - -xfund_it_ser_train = dict( - type='SERDataset', - data_root=xfund_it_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_it_ser_test = dict( - type='SERDataset', - data_root=xfund_it_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_ja.py b/configs/ser/_base_/datasets/xfund_ja.py deleted file mode 100644 index 43fa5a514..000000000 --- a/configs/ser/_base_/datasets/xfund_ja.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_ja_ser_data_root = 'data/xfund/ja' - -xfund_ja_ser_train = dict( - type='SERDataset', - data_root=xfund_ja_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_ja_ser_test = dict( - type='SERDataset', - data_root=xfund_ja_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_pt.py 
b/configs/ser/_base_/datasets/xfund_pt.py deleted file mode 100644 index 99804d8a6..000000000 --- a/configs/ser/_base_/datasets/xfund_pt.py +++ /dev/null @@ -1,15 +0,0 @@ -xfund_pt_ser_data_root = 'data/xfund/pt' - -xfund_pt_ser_train = dict( - type='SERDataset', - data_root=xfund_pt_ser_data_root, - ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), - pipeline=None) - -xfund_pt_ser_test = dict( - type='SERDataset', - data_root=xfund_pt_ser_data_root, - ann_file='ser_test.json', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py index 9130ce99b..4ee522efd 100644 --- a/configs/ser/_base_/datasets/xfund_zh.py +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -4,7 +4,6 @@ type='SERDataset', data_root=xfund_zh_ser_data_root, ann_file='ser_train.json', - filter_cfg=dict(filter_empty_gt=True, min_size=32), pipeline=None) xfund_zh_ser_test = dict( diff --git a/configs/ser/_base_/datasets/xfund_zh_huggingface.py b/configs/ser/_base_/datasets/xfund_zh_huggingface.py new file mode 100644 index 000000000..b9f067bda --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_zh_huggingface.py @@ -0,0 +1,14 @@ +xfund_zh_huggingface_ser_data_root = 'data/xfund/zh' + +xfund_zh_huggingface_ser_train = dict( + type='SERHuggingfaceDataset', + data_root=xfund_zh_huggingface_ser_data_root, + ann_file='ser_train.huggingface', + pipeline=None) + +xfund_zh_huggingface_ser_test = dict( + type='SERHuggingfaceDataset', + data_root=xfund_zh_huggingface_ser_data_root, + ann_file='ser_test.huggingface', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/de/re.py b/dataset_zoo/xfund/de/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/de/re.py +++ b/dataset_zoo/xfund/de/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/de/ser.py b/dataset_zoo/xfund/de/ser.py index 344e85158..60c6963c0 100644 --- a/dataset_zoo/xfund/de/ser.py +++ b/dataset_zoo/xfund/de/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='8c9f949952d227290e22f736cdbe4d29', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='d13d12278d585214183c3cfb949b0e59', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + 
parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/es/re.py b/dataset_zoo/xfund/es/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/es/re.py +++ b/dataset_zoo/xfund/es/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/es/ser.py b/dataset_zoo/xfund/es/ser.py index 9ee6caec5..2cc4dbcc6 100644 --- a/dataset_zoo/xfund/es/ser.py +++ b/dataset_zoo/xfund/es/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='0ff89032bc6cb2e7ccba062c71944d03', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='efad9fb11ee3036bef003b6364a79ac0', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/fr/re.py b/dataset_zoo/xfund/fr/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/fr/re.py +++ b/dataset_zoo/xfund/fr/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/fr/ser.py b/dataset_zoo/xfund/fr/ser.py index a6e6af790..ff9f5ea1f 100644 --- a/dataset_zoo/xfund/fr/ser.py +++ b/dataset_zoo/xfund/fr/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='d821ca50f37cc39ff1715632f4068ea1', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='9ccbf15816ca05e50229885b75e57e49', content=['image'], - 
mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/it/re.py b/dataset_zoo/xfund/it/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/it/re.py +++ b/dataset_zoo/xfund/it/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/it/ser.py b/dataset_zoo/xfund/it/ser.py index 17b279b9c..92c298ff1 100644 --- a/dataset_zoo/xfund/it/ser.py +++ b/dataset_zoo/xfund/it/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='c531e39f0cbc1dc74caa320ffafe5de9', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='35446a115561d0773b7f2a0c2f32fe5c', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/ja/re.py b/dataset_zoo/xfund/ja/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/ja/re.py +++ b/dataset_zoo/xfund/ja/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/ja/ser.py b/dataset_zoo/xfund/ja/ser.py index f43a41c8a..e536151ea 100644 --- a/dataset_zoo/xfund/ja/ser.py +++ b/dataset_zoo/xfund/ja/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='50c22c6774706494080a73f8eabcf45d', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ 
mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='93a22fea044894264bfa3c9f9c84dd37', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/pt/re.py b/dataset_zoo/xfund/pt/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/pt/re.py +++ b/dataset_zoo/xfund/pt/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/pt/ser.py b/dataset_zoo/xfund/pt/ser.py index 39c57fc92..079b39448 100644 --- a/dataset_zoo/xfund/pt/ser.py +++ b/dataset_zoo/xfund/pt/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='783ba0aba419235bc81cf547e7c5011b', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='5f0189d29c5a0e6764757457f54ba14f', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/dataset_zoo/xfund/zh/re.py b/dataset_zoo/xfund/zh/re.py index fdd3b4765..b3e0666de 100644 --- a/dataset_zoo/xfund/zh/re.py +++ b/dataset_zoo/xfund/zh/re.py @@ -1,8 +1,6 @@ _base_ = ['ser.py'] -_base_.train_preparer.parser.type = 'XFUNDREAnnParser' _base_.train_preparer.packer.type = 'REPacker' -_base_.test_preparer.parser.type = 'XFUNDREAnnParser' _base_.test_preparer.packer.type = 'REPacker' config_generator = dict(type='REConfigGenerator') diff --git a/dataset_zoo/xfund/zh/ser.py 
b/dataset_zoo/xfund/zh/ser.py index ad167346c..ec8efb1a3 100644 --- a/dataset_zoo/xfund/zh/ser.py +++ b/dataset_zoo/xfund/zh/ser.py @@ -13,7 +13,7 @@ save_name=f'{lang}_train.zip', md5='a4ce16d1c1a8554a8b1e00907cff3b4b', content=['image'], - mapping=[[f'{lang}_train/*.jpg', 'ser_imgs/train']]), + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.train.json', @@ -23,8 +23,8 @@ mapping=[[f'{lang}_train.json', 'annotations/train.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='train.json', img_dir='ser_imgs/train'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) @@ -40,7 +40,7 @@ save_name=f'{lang}_val.zip', md5='f84c2651e350f5b394585207a43d06e4', content=['image'], - mapping=[[f'{lang}_val/*.jpg', 'ser_imgs/test']]), + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), dict( url='https://github.com/doc-analysis/XFUND/' f'releases/download/v1.0/{lang}.val.json', @@ -50,8 +50,8 @@ mapping=[[f'{lang}_val.json', 'annotations/test.json']]) ]), gatherer=dict( - type='MonoGatherer', ann_name='test.json', img_dir='ser_imgs/test'), - parser=dict(type='XFUNDSERAnnParser'), + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), packer=dict(type='SERPacker'), dumper=dict(type='JsonDumper'), ) diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index e627ab10c..ded44325b 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -20,7 +20,7 @@ class REPacker(BasePacker): { "dataset_type": "REDataset", "task_name": "re", - "re_labels": ['answer', 'header', 'other', 'question'], + "labels": ['answer', 'header', 'other', 'question'], "id2label": { "0": "answer", "1": "header", @@ -108,7 +108,7 @@ def pack_instance(self, sample: Tuple) -> Dict: box = instance.get('box', None) label = instance.get('label', None) linking = instance.get('linking', None) - id = instance.get('id', None) + ins_id = instance.get('id', None) words = instance.get('words', None) assert text or box or label texts_per_doc.append(text) @@ -116,7 +116,7 @@ def pack_instance(self, sample: Tuple) -> Dict: labels_per_doc.append(label) words_per_doc.append(words) linking_per_doc.append(linking) - id_per_doc.append(id) + id_per_doc.append(ins_id) packed_instances = dict( instances=dict( texts=texts_per_doc, @@ -151,7 +151,7 @@ def add_meta(self, sample: List) -> Dict: 'metainfo': { 'dataset_type': 'REDataset', 'task_name': 're', - 're_labels': label_list, + 'labels': label_list, 'id2label': {k: v for k, v in enumerate(label_list)}, 'label2id': {v: k diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 2fe0805e2..201d3fc1f 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -20,7 +20,7 @@ class SERPacker(BasePacker): { "dataset_type": "SERDataset", "task_name": "ser", - "ser_labels": ['answer', 'header', 'other', 'question'], + "labels": ['answer', 'header', 'other', 'question'], "id2label": { "0": "answer", "1": "header", @@ -139,7 +139,7 @@ def add_meta(self, sample: List) -> Dict: 'metainfo': { 'dataset_type': 'SERDataset', 'task_name': 'ser', - 'ser_labels': label_list, + 'labels': label_list, 'id2label': 
{k: v for k, v in enumerate(label_list)}, 'label2id': {v: k diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index bef361238..2241dc21a 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -11,12 +11,12 @@ from .synthtext_parser import SynthTextAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser -from .xfund_parser import XFUNDREAnnParser, XFUNDSERAnnParser +from .xfund_parser import XFUNDAnnParser __all__ = [ 'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser', - 'SynthTextAnnParser', 'XFUNDSERAnnParser', 'XFUNDREAnnParser' + 'SynthTextAnnParser', 'XFUNDAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py index 6465418f6..e776b0fee 100644 --- a/mmocr/datasets/preparers/parsers/xfund_parser.py +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -8,9 +8,9 @@ @DATA_PARSERS.register_module() -class XFUNDSERAnnParser(BaseParser): - """XFUND Semantic Entity Recognition Annotation Parser. See - dataset_zoo/xfund/xx/sample_anno.md for annotation example. +class XFUNDAnnParser(BaseParser): + """XFUND Semantic Entity Recognition and Relation Extraction Annotation + Parser. See dataset_zoo/xfund/xx/sample_anno.md for annotation example. Args: nproc (int): The number of processes to parse the annotation. Defaults @@ -25,33 +25,6 @@ def parse_files(self, img_dir: str, ann_path: str) -> List: samples.append((osp.join(img_dir, img_fname), instance)) return samples - def loader(self, file_path: str): - with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) - for i in range(len(data['documents'])): - img_fname = data['documents'][i]['img']['fname'] - instances = list() - for j in range(len(data['documents'][i]['document'])): - cur_item = data['documents'][i]['document'][j] - instance = dict( - text=cur_item['text'], - box=cur_item['box'], - label=cur_item['label'], - words=cur_item['words']) - instances.append(instance) - yield img_fname, instances - - -@DATA_PARSERS.register_module() -class XFUNDREAnnParser(XFUNDSERAnnParser): - """XFUND Relation Extraction Annotation Parser. See - dataset_zoo/xfund/xx/sample_anno.md for annotation example. - - Args: - nproc (int): The number of processes to parse the annotation. Defaults - to 1. 
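For reference, the loaders being merged above simply walk the released XFUND annotation JSON, whose top-level layout is documents -> img.fname plus a per-image document list of labeled boxes; the merged parser additionally surfaces the id/linking fields consumed by the RE packer. A minimal standalone sketch of that traversal, independent of MMOCR's parser registry and using a hypothetical file path purely for illustration:

import json


def iter_xfund_annotations(ann_path):
    # Assumed upstream layout:
    # {"documents": [{"img": {"fname": ...},
    #                 "document": [{"text", "box", "label", "words", ...}]}]}
    with open(ann_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for doc in data['documents']:
        instances = [
            dict(
                text=item['text'],
                box=item['box'],
                label=item['label'],
                words=item.get('words'))
            for item in doc['document']
        ]
        yield doc['img']['fname'], instances


# Hypothetical usage:
# for fname, instances in iter_xfund_annotations('zh_train.json'):
#     print(fname, len(instances))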
- """ - def loader(self, file_path: str): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) From d3e16ad28d02146ff89dff689a2f8c939070ebd7 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 28 Mar 2023 22:20:05 +0800 Subject: [PATCH 10/50] =?UTF-8?q?[Fix]=20=E5=9B=9E=E9=80=80=E5=88=A0?= =?UTF-8?q?=E9=99=A4huggingface=20dataset=E5=BD=A2=E5=BC=8F=EF=BC=8C?= =?UTF-8?q?=E6=B2=A1=E6=84=8F=E4=B9=89=E3=80=82=E4=BF=AE=E6=94=B9ser/re=20?= =?UTF-8?q?packer=E7=9A=84metainfo=E4=BF=A1=E6=81=AF=EF=BC=8C=E9=98=B6?= =?UTF-8?q?=E6=AE=B5=E6=80=A7=E6=B7=BB=E5=8A=A0SERDataset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_base_/datasets/xfund_zh_huggingface.py | 14 -- .../_base_/datasets/xfund_zh_huggingface.py | 14 -- mmocr/datasets/__init__.py | 3 +- .../config_generators/re_config_generator.py | 4 +- .../config_generators/ser_config_generator.py | 4 +- mmocr/datasets/preparers/packers/re_packer.py | 42 +++-- .../datasets/preparers/packers/ser_packer.py | 42 +++-- mmocr/datasets/ser_dataset.py | 170 ++++++++++++++++++ projects/LayoutLMv3/README.md | 0 .../LayoutLMv3/configs/layoutlmv3_xfund_zh.py | 12 ++ .../LayoutLMv3/scripts/prepare_dataset.sh | 17 ++ projects/LayoutLMv3/test.py | 14 ++ 12 files changed, 275 insertions(+), 61 deletions(-) delete mode 100644 configs/re/_base_/datasets/xfund_zh_huggingface.py delete mode 100644 configs/ser/_base_/datasets/xfund_zh_huggingface.py create mode 100644 mmocr/datasets/ser_dataset.py create mode 100644 projects/LayoutLMv3/README.md create mode 100644 projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py create mode 100644 projects/LayoutLMv3/scripts/prepare_dataset.sh create mode 100644 projects/LayoutLMv3/test.py diff --git a/configs/re/_base_/datasets/xfund_zh_huggingface.py b/configs/re/_base_/datasets/xfund_zh_huggingface.py deleted file mode 100644 index 887c28ad6..000000000 --- a/configs/re/_base_/datasets/xfund_zh_huggingface.py +++ /dev/null @@ -1,14 +0,0 @@ -xfund_zh_huggingface_re_data_root = 'data/xfund/zh' - -xfund_zh_huggingface_re_train = dict( - type='REHuggingfaceDataset', - data_root=xfund_zh_huggingface_re_data_root, - ann_file='re_train.huggingface', - pipeline=None) - -xfund_zh_huggingface_re_test = dict( - type='REHuggingfaceDataset', - data_root=xfund_zh_huggingface_re_data_root, - ann_file='re_test.huggingface', - test_mode=True, - pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_zh_huggingface.py b/configs/ser/_base_/datasets/xfund_zh_huggingface.py deleted file mode 100644 index b9f067bda..000000000 --- a/configs/ser/_base_/datasets/xfund_zh_huggingface.py +++ /dev/null @@ -1,14 +0,0 @@ -xfund_zh_huggingface_ser_data_root = 'data/xfund/zh' - -xfund_zh_huggingface_ser_train = dict( - type='SERHuggingfaceDataset', - data_root=xfund_zh_huggingface_ser_data_root, - ann_file='ser_train.huggingface', - pipeline=None) - -xfund_zh_huggingface_ser_test = dict( - type='SERHuggingfaceDataset', - data_root=xfund_zh_huggingface_ser_data_root, - ann_file='ser_test.huggingface', - test_mode=True, - pipeline=None) diff --git a/mmocr/datasets/__init__.py b/mmocr/datasets/__init__.py index 54a9ea7f0..f6d802325 100644 --- a/mmocr/datasets/__init__.py +++ b/mmocr/datasets/__init__.py @@ -5,10 +5,11 @@ from .recog_lmdb_dataset import RecogLMDBDataset from .recog_text_dataset import RecogTextDataset from .samplers import * # NOQA +from .ser_dataset import SERDataset from .transforms import * # NOQA from .wildreceipt_dataset import WildReceiptDataset __all__ = [ 'IcdarDataset', 
'OCRDataset', 'RecogLMDBDataset', 'RecogTextDataset', - 'WildReceiptDataset', 'ConcatDataset' + 'WildReceiptDataset', 'ConcatDataset', 'SERDataset' ] diff --git a/mmocr/datasets/preparers/config_generators/re_config_generator.py b/mmocr/datasets/preparers/config_generators/re_config_generator.py index 35f2b6589..3d5d4c5e2 100644 --- a/mmocr/datasets/preparers/config_generators/re_config_generator.py +++ b/mmocr/datasets/preparers/config_generators/re_config_generator.py @@ -90,9 +90,7 @@ def _gen_dataset_config(self) -> str: cfg += ' type=\'REDataset\',\n' cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] == 'train': - cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 - elif ann_dict['split'] in ['test', 'val']: + if ann_dict['split'] in ['test', 'val']: cfg += ' test_mode=True,\n' cfg += ' pipeline=None)\n' return cfg diff --git a/mmocr/datasets/preparers/config_generators/ser_config_generator.py b/mmocr/datasets/preparers/config_generators/ser_config_generator.py index c93167869..c3cb7f53f 100644 --- a/mmocr/datasets/preparers/config_generators/ser_config_generator.py +++ b/mmocr/datasets/preparers/config_generators/ser_config_generator.py @@ -90,9 +90,7 @@ def _gen_dataset_config(self) -> str: cfg += ' type=\'SERDataset\',\n' cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] == 'train': - cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 - elif ann_dict['split'] in ['test', 'val']: + if ann_dict['split'] in ['test', 'val']: cfg += ' test_mode=True,\n' cfg += ' pipeline=None)\n' return cfg diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index ded44325b..5f3c12fe7 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -22,16 +22,22 @@ class REPacker(BasePacker): "task_name": "re", "labels": ['answer', 'header', 'other', 'question'], "id2label": { - "0": "answer", - "1": "header", - "2": "other", - "3": "question" + "0": "O", + "1": "B-ANSWER", + "2": "I-ANSWER", + "3": "B-HEADER", + "4": "I-HEADER", + "5": "B-QUESTION", + "6": "I-QUESTION" }, "label2id": { - "answer": 0, - "header": 1, - "other": 2, - "question": 3 + "O": 0, + "B-ANSWER": 1, + "I-ANSWER": 2, + "B-HEADER": 3, + "I-HEADER": 4, + "B-QUESTION": 5, + "I-QUESTION": 6 } }, "data_list": @@ -141,21 +147,31 @@ def add_meta(self, sample: List) -> Dict: Dict: A dict contains the meta information and samples. 
""" + def get_BIO_label_list(labels): + bio_label_list = [] + for label in labels: + if label == 'other': + bio_label_list.insert(0, 'O') + else: + bio_label_list.append(f'B-{label.upper()}') + bio_label_list.append(f'I-{label.upper()}') + return bio_label_list + labels = [] for s in sample: labels += s['instances']['labels'] - label_list = list(set(labels)) - label_list.sort() + org_label_list = list(set(labels)) + bio_label_list = get_BIO_label_list(org_label_list) meta = { 'metainfo': { 'dataset_type': 'REDataset', 'task_name': 're', - 'labels': label_list, + 'labels': org_label_list, 'id2label': {k: v - for k, v in enumerate(label_list)}, + for k, v in enumerate(bio_label_list)}, 'label2id': {v: k - for k, v in enumerate(label_list)} + for k, v in enumerate(bio_label_list)} }, 'data_list': sample } diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 201d3fc1f..1b1d8528f 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -22,16 +22,22 @@ class SERPacker(BasePacker): "task_name": "ser", "labels": ['answer', 'header', 'other', 'question'], "id2label": { - "0": "answer", - "1": "header", - "2": "other", - "3": "question" + "0": "O", + "1": "B-ANSWER", + "2": "I-ANSWER", + "3": "B-HEADER", + "4": "I-HEADER", + "5": "B-QUESTION", + "6": "I-QUESTION" }, "label2id": { - "answer": 0, - "header": 1, - "other": 2, - "question": 3 + "O": 0, + "B-ANSWER": 1, + "I-ANSWER": 2, + "B-HEADER": 3, + "I-HEADER": 4, + "B-QUESTION": 5, + "I-QUESTION": 6 } }, "data_list": @@ -129,21 +135,31 @@ def add_meta(self, sample: List) -> Dict: Dict: A dict contains the meta information and samples. """ + def get_BIO_label_list(labels): + bio_label_list = [] + for label in labels: + if label == 'other': + bio_label_list.insert(0, 'O') + else: + bio_label_list.append(f'B-{label.upper()}') + bio_label_list.append(f'I-{label.upper()}') + return bio_label_list + labels = [] for s in sample: labels += s['instances']['labels'] - label_list = list(set(labels)) - label_list.sort() + org_label_list = list(set(labels)) + bio_label_list = get_BIO_label_list(org_label_list) meta = { 'metainfo': { 'dataset_type': 'SERDataset', 'task_name': 'ser', - 'labels': label_list, + 'labels': org_label_list, 'id2label': {k: v - for k, v in enumerate(label_list)}, + for k, v in enumerate(bio_label_list)}, 'label2id': {v: k - for k, v in enumerate(label_list)} + for k, v in enumerate(bio_label_list)} }, 'data_list': sample } diff --git a/mmocr/datasets/ser_dataset.py b/mmocr/datasets/ser_dataset.py new file mode 100644 index 000000000..4144a958f --- /dev/null +++ b/mmocr/datasets/ser_dataset.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +from typing import Callable, List, Optional, Sequence, Union + +from mmengine.dataset import BaseDataset +from transformers import AutoTokenizer + +from mmocr.registry import DATASETS + + +@DATASETS.register_module() +class SERDataset(BaseDataset): + + def __init__(self, + ann_file: str = '', + tokenizer: str = '', + metainfo: Optional[dict] = None, + data_root: Optional[str] = '', + data_prefix: dict = dict(img_path=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000) -> None: + + if isinstance(tokenizer, str): + tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True) + self.tokenizer = tokenizer + + super().__init__( + ann_file=ann_file, + metainfo=metainfo, + data_root=data_root, + data_prefix=data_prefix, + filter_cfg=filter_cfg, + indices=indices, + serialize_data=serialize_data, + pipeline=pipeline, + test_mode=test_mode, + lazy_init=lazy_init, + max_refetch=max_refetch) + + def load_data_list(self) -> List[dict]: + data_list = super().load_data_list() + + # split text to several slices because of over-length + input_ids, bboxes, labels = [], [], [] + segment_ids, position_ids = [], [] + image_path = [] + for i in range(len(data_list)): + start = 0 + cur_iter = 0 + while start < len(data_list[i]['input_ids']): + end = min(start + 510, len(data_list[i]['input_ids'])) + + input_ids.append([self.tokenizer.cls_token_id] + + data_list[i]['input_ids'][start:end] + + [self.tokenizer.sep_token_id]) + bboxes.append([[0, 0, 0, 0]] + + data_list[i]['bboxes'][start:end] + + [[1000, 1000, 1000, 1000]]) + labels.append([-100] + data_list[i]['labels'][start:end] + + [-100]) + + cur_segment_ids = self.get_segment_ids(bboxes[-1]) + cur_position_ids = self.get_position_ids(cur_segment_ids) + segment_ids.append(cur_segment_ids) + position_ids.append(cur_position_ids) + image_path.append( + os.path.join(self.data_root, data_list[i]['img_path'])) + + start = end + cur_iter += 1 + + assert len(input_ids) == len(bboxes) == len(labels) == len( + segment_ids) == len(position_ids) + assert len(segment_ids) == len(image_path) + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + instances = raw_data_info['instances'] + img_path = raw_data_info['img_path'] + width = raw_data_info['width'] + height = raw_data_info['height'] + + texts = instances.get('texts', None) + bboxes = instances.get('bboxes', None) + labels = instances.get('labels', None) + assert texts or bboxes or labels + # norm box + bboxes_norm = [self.box_norm(box, width, height) for box in bboxes] + # get label2id + label2id = self.metainfo['label2id'] + + cur_doc_input_ids, cur_doc_bboxes, cur_doc_labels = [], [], [] + for j in range(len(texts)): + cur_input_ids = self.tokenizer( + texts[j], + truncation=False, + add_special_tokens=False, + return_attention_mask=False)['input_ids'] + if len(cur_input_ids) == 0: + continue + + cur_label = labels[j].upper() + if cur_label == 'OTHER': + cur_labels = ['O'] * len(cur_input_ids) + for k in range(len(cur_labels)): + cur_labels[k] = label2id[cur_labels[k]] + else: + cur_labels = [cur_label] * len(cur_input_ids) + cur_labels[0] = label2id['B-' + cur_labels[0]] + for k in range(1, len(cur_labels)): + cur_labels[k] = label2id['I-' + cur_labels[k]] + assert len(cur_input_ids) == len( + [bboxes_norm[j]] * len(cur_input_ids)) == 
len(cur_labels) + cur_doc_input_ids += cur_input_ids + cur_doc_bboxes += [bboxes_norm[j]] * len(cur_input_ids) + cur_doc_labels += cur_labels + assert len(cur_doc_input_ids) == len(cur_doc_bboxes) == len( + cur_doc_labels) + assert len(cur_doc_input_ids) > 0 + + data_info = {} + data_info['img_path'] = img_path + data_info['input_ids'] = cur_doc_input_ids + data_info['bboxes'] = cur_doc_bboxes + data_info['labels'] = cur_doc_labels + return data_info + + def box_norm(self, box, width, height): + + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + + def get_segment_ids(self, bboxs): + segment_ids = [] + for i in range(len(bboxs)): + if i == 0: + segment_ids.append(0) + else: + if bboxs[i - 1] == bboxs[i]: + segment_ids.append(segment_ids[-1]) + else: + segment_ids.append(segment_ids[-1] + 1) + return segment_ids + + def get_position_ids(self, segment_ids): + position_ids = [] + for i in range(len(segment_ids)): + if i == 0: + position_ids.append(2) + else: + if segment_ids[i] == segment_ids[i - 1]: + position_ids.append(position_ids[-1] + 1) + else: + position_ids.append(2) + return position_ids diff --git a/projects/LayoutLMv3/README.md b/projects/LayoutLMv3/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py b/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py new file mode 100644 index 000000000..7a74627c7 --- /dev/null +++ b/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py @@ -0,0 +1,12 @@ +_base_ = [ + '/Users/wangnu/Documents/GitHub/mmocr/' + 'configs/ser/_base_/datasets/xfund_zh.py' +] + +train_dataset = _base_.xfund_zh_ser_train +train_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) diff --git a/projects/LayoutLMv3/scripts/prepare_dataset.sh b/projects/LayoutLMv3/scripts/prepare_dataset.sh new file mode 100644 index 000000000..e385e2abe --- /dev/null +++ b/projects/LayoutLMv3/scripts/prepare_dataset.sh @@ -0,0 +1,17 @@ +PROJ_ROOT=$(pwd) +DATASET_ZOO_PATH=${PROJ_ROOT}/dataset_zoo +NPROC=8 +TASKS=('ser' 're') +# DATASET_NAME=('xfund/de' 'xfund/es' 'xfund/fr' 'xfund/jt' 'xfund/ja' 'xfund/pt' 'xfund/zh') +DATASET_NAME=('xfund/zh') + +for TASK in ${TASKS[@]} +do + python tools/dataset_converters/prepare_dataset.py \ + ${DATASET_NAME[@]} \ + --nproc ${NPROC} \ + --task ${TASK} \ + --splits train test \ + --overwrite-cfg \ + --dataset-zoo-path ${DATASET_ZOO_PATH} +done diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py new file mode 100644 index 000000000..14170c39f --- /dev/null +++ b/projects/LayoutLMv3/test.py @@ -0,0 +1,14 @@ +from mmengine.config import Config + +from mmocr.registry import DATASETS + +if __name__ == '__main__': + cfg_path = '/Users/wangnu/Documents/GitHub/mmocr/projects/' \ + 'LayoutLMv3/configs/layoutlmv3_xfund_zh.py' + cfg = Config.fromfile(cfg_path) + + dataset_cfg = cfg.train_dataset + dataset_cfg['tokenizer'] = \ + '/Users/wangnu/Documents/GitHub/mmocr/data/layoutlmv3-base-chinese' + ds = DATASETS.build(dataset_cfg) + print(ds[0]) From 1d0c5e31d2edb28455715c2a0066d77443608277 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 29 Mar 2023 01:16:29 +0800 Subject: 
[PATCH 11/50] =?UTF-8?q?=E9=98=B6=E6=AE=B5=E6=80=A7=E5=AE=8C?= =?UTF-8?q?=E6=88=90SERDataset=E6=95=B0=E6=8D=AE=E9=9B=86=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/ser_dataset.py | 55 ++++++++++++++++++++--------------- projects/LayoutLMv3/test.py | 11 ++++++- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/mmocr/datasets/ser_dataset.py b/mmocr/datasets/ser_dataset.py index 4144a958f..0bf1bbbfb 100644 --- a/mmocr/datasets/ser_dataset.py +++ b/mmocr/datasets/ser_dataset.py @@ -46,39 +46,46 @@ def load_data_list(self) -> List[dict]: data_list = super().load_data_list() # split text to several slices because of over-length - input_ids, bboxes, labels = [], [], [] - segment_ids, position_ids = [], [] - image_path = [] + split_text_data_list = [] for i in range(len(data_list)): start = 0 cur_iter = 0 while start < len(data_list[i]['input_ids']): end = min(start + 510, len(data_list[i]['input_ids'])) - - input_ids.append([self.tokenizer.cls_token_id] + - data_list[i]['input_ids'][start:end] + - [self.tokenizer.sep_token_id]) - bboxes.append([[0, 0, 0, 0]] + - data_list[i]['bboxes'][start:end] + - [[1000, 1000, 1000, 1000]]) - labels.append([-100] + data_list[i]['labels'][start:end] + - [-100]) - - cur_segment_ids = self.get_segment_ids(bboxes[-1]) - cur_position_ids = self.get_position_ids(cur_segment_ids) - segment_ids.append(cur_segment_ids) - position_ids.append(cur_position_ids) - image_path.append( - os.path.join(self.data_root, data_list[i]['img_path'])) + # get input_ids + input_ids = [self.tokenizer.cls_token_id] + \ + data_list[i]['input_ids'][start:end] + \ + [self.tokenizer.sep_token_id] + # get bboxes + bboxes = [[0, 0, 0, 0]] + \ + data_list[i]['bboxes'][start:end] + \ + [[1000, 1000, 1000, 1000]] + # get labels + labels = [-100] + data_list[i]['labels'][start:end] + [-100] + # get segment_ids + segment_ids = self.get_segment_ids(bboxes) + # get position_ids + position_ids = self.get_position_ids(segment_ids) + # get img_path + img_path = os.path.join(self.data_root, + data_list[i]['img_path']) + # get attention_mask + attention_mask = [1] * len(input_ids) + + data_info = {} + data_info['input_ids'] = input_ids + data_info['bboxes'] = bboxes + data_info['labels'] = labels + data_info['segment_ids'] = segment_ids + data_info['position_ids'] = position_ids + data_info['img_path'] = img_path + data_info['attention_mask '] = attention_mask + split_text_data_list.append(data_info) start = end cur_iter += 1 - assert len(input_ids) == len(bboxes) == len(labels) == len( - segment_ids) == len(position_ids) - assert len(segment_ids) == len(image_path) - - return data_list + return split_text_data_list def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: instances = raw_data_info['instances'] diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py index 14170c39f..96df72b1a 100644 --- a/projects/LayoutLMv3/test.py +++ b/projects/LayoutLMv3/test.py @@ -1,4 +1,5 @@ from mmengine.config import Config +from mmengine.registry import init_default_scope from mmocr.registry import DATASETS @@ -6,9 +7,17 @@ cfg_path = '/Users/wangnu/Documents/GitHub/mmocr/projects/' \ 'LayoutLMv3/configs/layoutlmv3_xfund_zh.py' cfg = Config.fromfile(cfg_path) + init_default_scope(cfg.get('default_scope', 'mmocr')) dataset_cfg = cfg.train_dataset dataset_cfg['tokenizer'] = \ '/Users/wangnu/Documents/GitHub/mmocr/data/layoutlmv3-base-chinese' + + train_pipeline = [ + 
dict(type='LoadImageFromFile', color_type='color'), + dict(type='Resize', scale=(224, 224)) + ] + dataset_cfg['pipeline'] = train_pipeline ds = DATASETS.build(dataset_cfg) - print(ds[0]) + data = ds[0] + print('hi') From deb96cc124fc87da559a579575b1037194b083c6 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 30 Mar 2023 13:14:15 +0800 Subject: [PATCH 12/50] =?UTF-8?q?=E4=BC=98=E5=8C=96ser/re=20packer?= =?UTF-8?q?=EF=BC=8C=E6=A0=B9=E6=8D=AEwords=E5=85=B3=E9=94=AE=E5=AD=97?= =?UTF-8?q?=E6=98=AF=E5=90=A6=E5=AD=98=E5=9C=A8=E8=A7=89=E5=BE=97=E6=98=AF?= =?UTF-8?q?=E5=90=A6=E5=8A=A0=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/preparers/packers/re_packer.py | 80 ++++++------------- .../datasets/preparers/packers/ser_packer.py | 38 +++++---- 2 files changed, 46 insertions(+), 72 deletions(-) diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index 5f3c12fe7..54edce73d 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -1,15 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from typing import Dict, List, Tuple +import warnings +from typing import Dict, Tuple import mmcv from mmocr.registry import DATA_PACKERS -from .base import BasePacker +from .ser_packer import SERPacker @DATA_PACKERS.register_module() -class REPacker(BasePacker): +class REPacker(SERPacker): """Relation Extraction packer. It is used to pack the parsed annotation info to. @@ -18,8 +19,6 @@ class REPacker(BasePacker): { "metainfo": { - "dataset_type": "REDataset", - "task_name": "re", "labels": ['answer', 'header', 'other', 'question'], "id2label": { "0": "O", @@ -49,8 +48,8 @@ class REPacker(BasePacker): "instances": { "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], - "bboxes": [[906,195,1478,259], - [357,325,467,357], ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], "labels": ["header", "question", ...], "linkings": [[0, 1], [2, 3], ...], "ids": [0, 1, ...], @@ -104,75 +103,44 @@ def pack_instance(self, sample: Tuple) -> Dict: h, w = img.shape[:2] texts_per_doc = [] - bboxes_per_doc = [] + boxes_per_doc = [] labels_per_doc = [] - words_per_doc = [] linking_per_doc = [] id_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + for instance in instances: text = instance.get('text', None) box = instance.get('box', None) label = instance.get('label', None) linking = instance.get('linking', None) ins_id = instance.get('id', None) - words = instance.get('words', None) - assert text or box or label + assert text or box or label or linking or ins_id texts_per_doc.append(text) - bboxes_per_doc.append(box) + boxes_per_doc.append(box) labels_per_doc.append(label) - words_per_doc.append(words) linking_per_doc.append(linking) id_per_doc.append(ins_id) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) packed_instances = dict( instances=dict( texts=texts_per_doc, - bboxes=bboxes_per_doc, + boxes=boxes_per_doc, labels=labels_per_doc, linkings=linking_per_doc, - ids=id_per_doc, - words=words_per_doc), + ids=id_per_doc), img_path=osp.relpath(img_path, self.data_root), height=h, width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) return packed_instances - - def 
add_meta(self, sample: List) -> Dict: - """Add meta information to the sample. - - Args: - sample (List): A list of samples of the dataset. - - Returns: - Dict: A dict contains the meta information and samples. - """ - - def get_BIO_label_list(labels): - bio_label_list = [] - for label in labels: - if label == 'other': - bio_label_list.insert(0, 'O') - else: - bio_label_list.append(f'B-{label.upper()}') - bio_label_list.append(f'I-{label.upper()}') - return bio_label_list - - labels = [] - for s in sample: - labels += s['instances']['labels'] - org_label_list = list(set(labels)) - bio_label_list = get_BIO_label_list(org_label_list) - - meta = { - 'metainfo': { - 'dataset_type': 'REDataset', - 'task_name': 're', - 'labels': org_label_list, - 'id2label': {k: v - for k, v in enumerate(bio_label_list)}, - 'label2id': {v: k - for k, v in enumerate(bio_label_list)} - }, - 'data_list': sample - } - return meta diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 1b1d8528f..3db633bfa 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp +import warnings from typing import Dict, List, Tuple import mmcv @@ -18,8 +19,6 @@ class SERPacker(BasePacker): { "metainfo": { - "dataset_type": "SERDataset", - "task_name": "ser", "labels": ['answer', 'header', 'other', 'question'], "id2label": { "0": "O", @@ -49,8 +48,8 @@ class SERPacker(BasePacker): "instances": { "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], - "bboxes": [[906,195,1478,259], - [357,325,467,357], ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], "labels": ["header", "question", ...], "words": [[{ "box": [ @@ -100,28 +99,37 @@ def pack_instance(self, sample: Tuple) -> Dict: h, w = img.shape[:2] texts_per_doc = [] - bboxes_per_doc = [] + boxes_per_doc = [] labels_per_doc = [] - words_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + for instance in instances: text = instance.get('text', None) box = instance.get('box', None) label = instance.get('label', None) - words = instance.get('words', None) assert text or box or label texts_per_doc.append(text) - bboxes_per_doc.append(box) + boxes_per_doc.append(box) labels_per_doc.append(label) - words_per_doc.append(words) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) packed_instances = dict( instances=dict( texts=texts_per_doc, - bboxes=bboxes_per_doc, - labels=labels_per_doc, - words=words_per_doc), + boxes=boxes_per_doc, + labels=labels_per_doc), img_path=osp.relpath(img_path, self.data_root), height=h, width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) return packed_instances @@ -135,7 +143,7 @@ def add_meta(self, sample: List) -> Dict: Dict: A dict contains the meta information and samples. 
""" - def get_BIO_label_list(labels): + def get_bio_label_list(labels): bio_label_list = [] for label in labels: if label == 'other': @@ -149,12 +157,10 @@ def get_BIO_label_list(labels): for s in sample: labels += s['instances']['labels'] org_label_list = list(set(labels)) - bio_label_list = get_BIO_label_list(org_label_list) + bio_label_list = get_bio_label_list(org_label_list) meta = { 'metainfo': { - 'dataset_type': 'SERDataset', - 'task_name': 'ser', 'labels': org_label_list, 'id2label': {k: v for k, v in enumerate(bio_label_list)}, From 443e979ed303fa9f965f780c9f518da74734c909 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 30 Mar 2023 13:30:36 +0800 Subject: [PATCH 13/50] =?UTF-8?q?=E4=BC=98=E5=8C=96xfund=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E7=9A=84config=5Fgenerator=E5=91=BD=E5=90=8D=EF=BC=8C?= =?UTF-8?q?=E4=BD=BFconfig=5Fgenerator=E7=9B=AE=E5=BD=95=E7=BB=93=E6=9E=84?= =?UTF-8?q?=E6=9B=B4=E6=B8=85=E6=99=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/re/_base_/datasets/xfund_zh.py | 4 +- configs/ser/_base_/datasets/xfund_zh.py | 4 +- dataset_zoo/xfund/de/re.py | 2 +- dataset_zoo/xfund/de/ser.py | 2 +- dataset_zoo/xfund/es/re.py | 2 +- dataset_zoo/xfund/es/ser.py | 2 +- dataset_zoo/xfund/fr/re.py | 2 +- dataset_zoo/xfund/fr/ser.py | 2 +- dataset_zoo/xfund/it/re.py | 2 +- dataset_zoo/xfund/it/ser.py | 2 +- dataset_zoo/xfund/ja/re.py | 2 +- dataset_zoo/xfund/ja/ser.py | 2 +- dataset_zoo/xfund/pt/re.py | 2 +- dataset_zoo/xfund/pt/ser.py | 2 +- dataset_zoo/xfund/zh/re.py | 2 +- dataset_zoo/xfund/zh/ser.py | 2 +- .../preparers/config_generators/__init__.py | 6 +- .../config_generators/re_config_generator.py | 96 --------- .../config_generators/ser_config_generator.py | 96 --------- .../xfund_config_generator.py | 187 ++++++++++++++++++ 20 files changed, 208 insertions(+), 213 deletions(-) delete mode 100644 mmocr/datasets/preparers/config_generators/re_config_generator.py delete mode 100644 mmocr/datasets/preparers/config_generators/ser_config_generator.py create mode 100644 mmocr/datasets/preparers/config_generators/xfund_config_generator.py diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py index 5ea9c9d33..4a44301dd 100644 --- a/configs/re/_base_/datasets/xfund_zh.py +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -1,13 +1,13 @@ xfund_zh_re_data_root = 'data/xfund/zh' xfund_zh_re_train = dict( - type='REDataset', + type='XFUNDREDataset', data_root=xfund_zh_re_data_root, ann_file='re_train.json', pipeline=None) xfund_zh_re_test = dict( - type='REDataset', + type='XFUNDREDataset', data_root=xfund_zh_re_data_root, ann_file='re_test.json', test_mode=True, diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py index 4ee522efd..40bbce4de 100644 --- a/configs/ser/_base_/datasets/xfund_zh.py +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -1,13 +1,13 @@ xfund_zh_ser_data_root = 'data/xfund/zh' xfund_zh_ser_train = dict( - type='SERDataset', + type='XFUNDSERDataset', data_root=xfund_zh_ser_data_root, ann_file='ser_train.json', pipeline=None) xfund_zh_ser_test = dict( - type='SERDataset', + type='XFUNDSERDataset', data_root=xfund_zh_ser_data_root, ann_file='ser_test.json', test_mode=True, diff --git a/dataset_zoo/xfund/de/re.py b/dataset_zoo/xfund/de/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/de/re.py +++ b/dataset_zoo/xfund/de/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' 
_base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/de/ser.py b/dataset_zoo/xfund/de/ser.py index 60c6963c0..5e9769eb0 100644 --- a/dataset_zoo/xfund/de/ser.py +++ b/dataset_zoo/xfund/de/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/es/re.py b/dataset_zoo/xfund/es/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/es/re.py +++ b/dataset_zoo/xfund/es/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/es/ser.py b/dataset_zoo/xfund/es/ser.py index 2cc4dbcc6..da8900980 100644 --- a/dataset_zoo/xfund/es/ser.py +++ b/dataset_zoo/xfund/es/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/fr/re.py b/dataset_zoo/xfund/fr/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/fr/re.py +++ b/dataset_zoo/xfund/fr/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/fr/ser.py b/dataset_zoo/xfund/fr/ser.py index ff9f5ea1f..aad6b7cf3 100644 --- a/dataset_zoo/xfund/fr/ser.py +++ b/dataset_zoo/xfund/fr/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/it/re.py b/dataset_zoo/xfund/it/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/it/re.py +++ b/dataset_zoo/xfund/it/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/it/ser.py b/dataset_zoo/xfund/it/ser.py index 92c298ff1..fc9fc8b70 100644 --- a/dataset_zoo/xfund/it/ser.py +++ b/dataset_zoo/xfund/it/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/ja/re.py b/dataset_zoo/xfund/ja/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/ja/re.py +++ b/dataset_zoo/xfund/ja/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/ja/ser.py b/dataset_zoo/xfund/ja/ser.py index e536151ea..856b4f96d 100644 --- a/dataset_zoo/xfund/ja/ser.py +++ b/dataset_zoo/xfund/ja/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = 
dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/pt/re.py b/dataset_zoo/xfund/pt/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/pt/re.py +++ b/dataset_zoo/xfund/pt/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/pt/ser.py b/dataset_zoo/xfund/pt/ser.py index 079b39448..ff147ba4c 100644 --- a/dataset_zoo/xfund/pt/ser.py +++ b/dataset_zoo/xfund/pt/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/zh/re.py b/dataset_zoo/xfund/zh/re.py index b3e0666de..e0419d026 100644 --- a/dataset_zoo/xfund/zh/re.py +++ b/dataset_zoo/xfund/zh/re.py @@ -3,4 +3,4 @@ _base_.train_preparer.packer.type = 'REPacker' _base_.test_preparer.packer.type = 'REPacker' -config_generator = dict(type='REConfigGenerator') +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/zh/ser.py b/dataset_zoo/xfund/zh/ser.py index ec8efb1a3..20a3d1150 100644 --- a/dataset_zoo/xfund/zh/ser.py +++ b/dataset_zoo/xfund/zh/ser.py @@ -57,4 +57,4 @@ ) delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] -config_generator = dict(type='SERConfigGenerator') +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index 1a6221256..69e3b5157 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -1,13 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BaseDatasetConfigGenerator -from .re_config_generator import REConfigGenerator -from .ser_config_generator import SERConfigGenerator from .textdet_config_generator import TextDetConfigGenerator from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator +from .xfund_config_generator import (XFUNDREConfigGenerator, + XFUNDSERConfigGenerator) __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator', - 'SERConfigGenerator', 'REConfigGenerator' + 'XFUNDSERConfigGenerator', 'XFUNDREConfigGenerator' ] diff --git a/mmocr/datasets/preparers/config_generators/re_config_generator.py b/mmocr/datasets/preparers/config_generators/re_config_generator.py deleted file mode 100644 index 3d5d4c5e2..000000000 --- a/mmocr/datasets/preparers/config_generators/re_config_generator.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional - -from mmocr.registry import CFG_GENERATORS -from .base import BaseDatasetConfigGenerator - - -@CFG_GENERATORS.register_module() -class REConfigGenerator(BaseDatasetConfigGenerator): - """Text detection config generator. - - Args: - data_root (str): The root path of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. 
- train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='re_train.json', dataset_postfix='')]``. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='re_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. - """ - - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='re_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='re_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: - if '/' in dataset_name: - dataset_name = '_'.join(dataset_name.split('/')) - super().__init__( - data_root=data_root, - task='re', - overwrite_cfg=overwrite_cfg, - dataset_name=dataset_name, - train_anns=train_anns, - val_anns=val_anns, - test_anns=test_anns, - config_path=config_path, - ) - - def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Args: - ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps - a config variable name (such as icdar2015_textrecog_train) to - its corresponding annotation information dict. Each dict - contains following keys: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults - to None. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - Returns: - str: The generated dataset config. - """ - cfg = '' - for key_name, ann_dict in self.anns.items(): - cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'REDataset\',\n' - cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 - cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] in ['test', 'val']: - cfg += ' test_mode=True,\n' - cfg += ' pipeline=None)\n' - return cfg diff --git a/mmocr/datasets/preparers/config_generators/ser_config_generator.py b/mmocr/datasets/preparers/config_generators/ser_config_generator.py deleted file mode 100644 index c3cb7f53f..000000000 --- a/mmocr/datasets/preparers/config_generators/ser_config_generator.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional - -from mmocr.registry import CFG_GENERATORS -from .base import BaseDatasetConfigGenerator - - -@CFG_GENERATORS.register_module() -class SERConfigGenerator(BaseDatasetConfigGenerator): - """Text detection config generator. - - Args: - data_root (str): The root path of the dataset. 
- dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='ser_train.json', dataset_postfix='')]``. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='ser_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. - """ - - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='ser_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='ser_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: - if '/' in dataset_name: - dataset_name = '_'.join(dataset_name.split('/')) - super().__init__( - data_root=data_root, - task='ser', - overwrite_cfg=overwrite_cfg, - dataset_name=dataset_name, - train_anns=train_anns, - val_anns=val_anns, - test_anns=test_anns, - config_path=config_path, - ) - - def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Args: - ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps - a config variable name (such as icdar2015_textrecog_train) to - its corresponding annotation information dict. Each dict - contains following keys: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults - to None. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - Returns: - str: The generated dataset config. - """ - cfg = '' - for key_name, ann_dict in self.anns.items(): - cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'SERDataset\',\n' - cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 - cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] in ['test', 'val']: - cfg += ' test_mode=True,\n' - cfg += ' pipeline=None)\n' - return cfg diff --git a/mmocr/datasets/preparers/config_generators/xfund_config_generator.py b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py new file mode 100644 index 000000000..ca80375bc --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Optional + +from mmocr.registry import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class XFUNDSERConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Semantic Entity Recognition task config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='ser_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='ser_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. + """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( + data_root=data_root, + task='ser', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. 
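For illustration, with ``dataset_name='xfund/zh'`` (normalized to ``xfund_zh`` in ``__init__``) and the default annotation files, the emitted snippet is expected to look roughly like the sketch below. The ``*_data_root`` variable is assumed to be written by the base generator, so its exact form may differ.

    xfund_zh_ser_data_root = 'data/xfund/zh'  # assumed, emitted by the base class

    xfund_zh_ser_train = dict(
        type='XFUNDSERDataset',
        data_root=xfund_zh_ser_data_root,
        ann_file='ser_train.json',
        pipeline=None)

    xfund_zh_ser_test = dict(
        type='XFUNDSERDataset',
        data_root=xfund_zh_ser_data_root,
        ann_file='ser_test.json',
        test_mode=True,
        pipeline=None)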
+ """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDSERDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg + + +@CFG_GENERATORS.register_module() +class XFUNDREConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Relation Extraction task config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='re_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='re_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. + """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='re_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='re_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( + data_root=data_root, + task='re', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. 
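Both generators are registered in ``CFG_GENERATORS``, so a dataset preparer config can select them by type. A minimal sketch, with file paths assumed for illustration:

    # dataset_zoo/xfund/zh/ser.py (sketch)
    config_generator = dict(type='XFUNDSERConfigGenerator')

    # dataset_zoo/xfund/zh/re.py (sketch)
    config_generator = dict(type='XFUNDREConfigGenerator')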
+ """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDREDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg From a88a129e209575284f5c0f795df20dc723534a21 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 31 Mar 2023 00:13:49 +0800 Subject: [PATCH 14/50] =?UTF-8?q?=E4=BF=AE=E6=94=B9SERDataset=E4=B8=BAXFUN?= =?UTF-8?q?DSERDataset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/__init__.py | 4 +- .../{ser_dataset.py => xfund_dataset.py} | 126 +++++++++++++----- 2 files changed, 98 insertions(+), 32 deletions(-) rename mmocr/datasets/{ser_dataset.py => xfund_dataset.py} (52%) diff --git a/mmocr/datasets/__init__.py b/mmocr/datasets/__init__.py index f6d802325..ab00e0328 100644 --- a/mmocr/datasets/__init__.py +++ b/mmocr/datasets/__init__.py @@ -5,11 +5,11 @@ from .recog_lmdb_dataset import RecogLMDBDataset from .recog_text_dataset import RecogTextDataset from .samplers import * # NOQA -from .ser_dataset import SERDataset from .transforms import * # NOQA from .wildreceipt_dataset import WildReceiptDataset +from .xfund_dataset import XFUNDSERDataset __all__ = [ 'IcdarDataset', 'OCRDataset', 'RecogLMDBDataset', 'RecogTextDataset', - 'WildReceiptDataset', 'ConcatDataset', 'SERDataset' + 'WildReceiptDataset', 'ConcatDataset', 'XFUNDSERDataset' ] diff --git a/mmocr/datasets/ser_dataset.py b/mmocr/datasets/xfund_dataset.py similarity index 52% rename from mmocr/datasets/ser_dataset.py rename to mmocr/datasets/xfund_dataset.py index 0bf1bbbfb..af050b275 100644 --- a/mmocr/datasets/ser_dataset.py +++ b/mmocr/datasets/xfund_dataset.py @@ -9,7 +9,42 @@ @DATASETS.register_module() -class SERDataset(BaseDataset): +class XFUNDSERDataset(BaseDataset): + """XFUND Dataset for Semantic Entity Recognition task. part of code is + modified from https://github.com/microsoft/unilm/blob/master/layoutlmv3/lay + outlmft/data/xfund.py. + + Args: + ann_file (str): Annotation file path. Defaults to ''. + tokenizer (str): The pre-trained tokenizer you want to use. + Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + data_root (str): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to ''. + data_prefix (dict): Prefix for training data. Defaults to + ``dict(img_path='')``. + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. 
``RecogLMDBDataset`` can skip load + annotations to save time by set ``lazy_init=False``. + Defaults to False. + max_refetch (int, optional): If ``RecogLMDBdataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. + """ def __init__(self, ann_file: str = '', @@ -25,9 +60,9 @@ def __init__(self, lazy_init: bool = False, max_refetch: int = 1000) -> None: - if isinstance(tokenizer, str): - tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True) - self.tokenizer = tokenizer + assert tokenizer != '', 'tokenizer must be specified.' + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer, use_fast=True) super().__init__( ann_file=ann_file, @@ -56,14 +91,14 @@ def load_data_list(self) -> List[dict]: input_ids = [self.tokenizer.cls_token_id] + \ data_list[i]['input_ids'][start:end] + \ [self.tokenizer.sep_token_id] - # get bboxes - bboxes = [[0, 0, 0, 0]] + \ - data_list[i]['bboxes'][start:end] + \ + # get boxes + boxes = [[0, 0, 0, 0]] + \ + data_list[i]['boxes'][start:end] + \ [[1000, 1000, 1000, 1000]] # get labels labels = [-100] + data_list[i]['labels'][start:end] + [-100] # get segment_ids - segment_ids = self.get_segment_ids(bboxes) + segment_ids = self.get_segment_ids(boxes) # get position_ids position_ids = self.get_position_ids(segment_ids) # get img_path @@ -74,12 +109,12 @@ def load_data_list(self) -> List[dict]: data_info = {} data_info['input_ids'] = input_ids - data_info['bboxes'] = bboxes + data_info['boxes'] = boxes data_info['labels'] = labels data_info['segment_ids'] = segment_ids data_info['position_ids'] = position_ids data_info['img_path'] = img_path - data_info['attention_mask '] = attention_mask + data_info['attention_mask'] = attention_mask split_text_data_list.append(data_info) start = end @@ -87,22 +122,52 @@ def load_data_list(self) -> List[dict]: return split_text_data_list - def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + def parse_data_info(self, raw_data_info: dict) -> dict: + """Parse raw data information, tokenize texts and normalize boxes. + + raw_data_info + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["汇丰晋信", "受理时间:", ...], + "boxes": [[104, 114, 530, 175], + [126, 267, 266, 305], ...], + "labels": ["other", "question", ...], + "words": [[...], [...], ...] + } + } + will be modified to data_info + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "input_ids": [6, 47360, 49222, 124321, 5070, 6, ...], + "boxes": [[41, 32, 213, 49], + [41, 32, 213, 49], + [41, 32, 213, 49], + [41, 32, 213, 49], + [41, 32, 213, 49], + [50, 76, 107, 86], ...], + "labels": [0, 0, 0, 0, 0, 1, ...] + } + The length of `texts`、`boxes` and `labels` will increase. + The `words` annotations are not used here. 
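A miniature of the expansion described above, assuming one 'question' word whose text is split into three sub-tokens and the ``label2id`` map taken from the dataset metainfo (the token ids are illustrative):

    # One annotated word: label 'question', normalized box [50, 76, 107, 86],
    # tokenized into three sub-tokens.
    cur_input_ids = [47360, 49222, 124321]
    # BIO expansion: the first sub-token becomes 'B-QUESTION', the rest
    # 'I-QUESTION'; both are then mapped through label2id,
    # e.g. {'B-QUESTION': 5, 'I-QUESTION': 6}.
    cur_labels = [5, 6, 6]
    # The word-level box is repeated once per sub-token.
    cur_boxes = [[50, 76, 107, 86]] * len(cur_input_ids)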
+ """ instances = raw_data_info['instances'] - img_path = raw_data_info['img_path'] + texts = instances['texts'] + boxes = instances['boxes'] + labels = instances['labels'] + + # norm boxes width = raw_data_info['width'] height = raw_data_info['height'] + norm_boxes = [self.box_norm(box, width, height) for box in boxes] - texts = instances.get('texts', None) - bboxes = instances.get('bboxes', None) - labels = instances.get('labels', None) - assert texts or bboxes or labels - # norm box - bboxes_norm = [self.box_norm(box, width, height) for box in bboxes] # get label2id label2id = self.metainfo['label2id'] - - cur_doc_input_ids, cur_doc_bboxes, cur_doc_labels = [], [], [] + # tokenize texts + cur_doc_input_ids, cur_doc_boxes, cur_doc_labels = [], [], [] for j in range(len(texts)): cur_input_ids = self.tokenizer( texts[j], @@ -111,7 +176,7 @@ def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: return_attention_mask=False)['input_ids'] if len(cur_input_ids) == 0: continue - + # generate bio label cur_label = labels[j].upper() if cur_label == 'OTHER': cur_labels = ['O'] * len(cur_input_ids) @@ -122,20 +187,21 @@ def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: cur_labels[0] = label2id['B-' + cur_labels[0]] for k in range(1, len(cur_labels)): cur_labels[k] = label2id['I-' + cur_labels[k]] - assert len(cur_input_ids) == len( - [bboxes_norm[j]] * len(cur_input_ids)) == len(cur_labels) + assert len(cur_input_ids) == len(cur_labels) + cur_doc_input_ids += cur_input_ids - cur_doc_bboxes += [bboxes_norm[j]] * len(cur_input_ids) + cur_doc_boxes += [norm_boxes[j]] * len(cur_input_ids) cur_doc_labels += cur_labels - assert len(cur_doc_input_ids) == len(cur_doc_bboxes) == len( + assert len(cur_doc_input_ids) == len(cur_doc_boxes) == len( cur_doc_labels) assert len(cur_doc_input_ids) > 0 data_info = {} - data_info['img_path'] = img_path + data_info['img_path'] = raw_data_info['img_path'] data_info['input_ids'] = cur_doc_input_ids - data_info['bboxes'] = cur_doc_bboxes + data_info['boxes'] = cur_doc_boxes data_info['labels'] = cur_doc_labels + return data_info def box_norm(self, box, width, height): @@ -152,13 +218,13 @@ def clip(min_num, num, max_num): assert y1 >= y0 return [x0, y0, x1, y1] - def get_segment_ids(self, bboxs): + def get_segment_ids(self, boxes): segment_ids = [] - for i in range(len(bboxs)): + for i in range(len(boxes)): if i == 0: segment_ids.append(0) else: - if bboxs[i - 1] == bboxs[i]: + if boxes[i - 1] == boxes[i]: segment_ids.append(segment_ids[-1]) else: segment_ids.append(segment_ids[-1] + 1) From 25f084ab8f588efe636a02e615ffd62e85602b2b Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 31 Mar 2023 00:15:00 +0800 Subject: [PATCH 15/50] ser/re packer docstring fix --- mmocr/datasets/preparers/packers/re_packer.py | 2 +- mmocr/datasets/preparers/packers/ser_packer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index 54edce73d..6647764e2 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -42,7 +42,7 @@ class REPacker(SERPacker): "data_list": [ { - "img_path": "ser_imgs\\test\\zh_val_0.jpg", + "img_path": "imgs\\test\\zh_val_0.jpg", "height": 3508, "width": 2480, "instances": diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 3db633bfa..64aa997ba 100644 --- 
a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -42,7 +42,7 @@ class SERPacker(BasePacker): "data_list": [ { - "img_path": "ser_imgs\\test\\zh_val_0.jpg", + "img_path": "imgs\\test\\zh_val_0.jpg", "height": 3508, "width": 2480, "instances": From f8f2614ad127a928adf883cf36ebad6f1d4f2b90 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 31 Mar 2023 00:17:07 +0800 Subject: [PATCH 16/50] add SERDataSample structure and PackSERInputs transforms --- mmocr/datasets/transforms/__init__.py | 6 +- mmocr/datasets/transforms/formatting.py | 102 +++++++++++++++++++++++- mmocr/structures/__init__.py | 3 +- mmocr/structures/ser_data_sample.py | 61 ++++++++++++++ mmocr/utils/__init__.py | 7 +- mmocr/utils/typing_utils.py | 4 +- 6 files changed, 175 insertions(+), 8 deletions(-) create mode 100644 mmocr/structures/ser_data_sample.py diff --git a/mmocr/datasets/transforms/__init__.py b/mmocr/datasets/transforms/__init__.py index 61a15ec96..194305ad6 100644 --- a/mmocr/datasets/transforms/__init__.py +++ b/mmocr/datasets/transforms/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .adapters import MMDet2MMOCR, MMOCR2MMDet -from .formatting import PackKIEInputs, PackTextDetInputs, PackTextRecogInputs +from .formatting import (PackKIEInputs, PackSERInputs, PackTextDetInputs, + PackTextRecogInputs) from .loading import (InferencerLoader, LoadImageFromFile, LoadImageFromNDArray, LoadKIEAnnotations, LoadOCRAnnotations) @@ -23,5 +24,6 @@ 'PackKIEInputs', 'LoadKIEAnnotations', 'FixInvalidPolygon', 'MMDet2MMOCR', 'MMOCR2MMDet', 'LoadImageFromFile', 'LoadImageFromNDArray', 'CropHeight', 'InferencerLoader', 'RemoveIgnored', 'ConditionApply', 'CropHeight', - 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels' + 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels', + 'PackSERInputs' ] diff --git a/mmocr/datasets/transforms/formatting.py b/mmocr/datasets/transforms/formatting.py index b9b71437a..f18b257d5 100644 --- a/mmocr/datasets/transforms/formatting.py +++ b/mmocr/datasets/transforms/formatting.py @@ -6,7 +6,7 @@ from mmengine.structures import InstanceData, LabelData from mmocr.registry import TRANSFORMS -from mmocr.structures import (KIEDataSample, TextDetDataSample, +from mmocr.structures import (KIEDataSample, SERDataSample, TextDetDataSample, TextRecogDataSample) @@ -328,3 +328,103 @@ def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(meta_keys={self.meta_keys})' return repr_str + + +@TRANSFORMS.register_module() +class PackSERInputs(BaseTransform): + """Pack the inputs data for Semantic Entity Recognition. + + The type of outputs is `dict`: + + - inputs: image converted to tensor, whose shape is (C, H, W). + - data_samples: Two components of ``SERDataSample`` will be updated: + + - gt_instances (InstanceData): Depending on annotations, a subset of the + following keys will be updated: + + - bboxes (torch.Tensor((N, 4), dtype=torch.float32)): The groundtruth + of bounding boxes in the form of [x1, y1, x2, y2]. Renamed from + 'gt_bboxes'. + - labels (torch.LongTensor(N)): The labels of instances. + Renamed from 'gt_bboxes_labels'. + - texts (list[str]): The groundtruth texts. Renamed from 'gt_texts'. + + - metainfo (dict): 'metainfo' is always populated. The contents of the + 'metainfo' depends on ``meta_keys``. By default it includes: + + - "img_path": Path to the image file. + - "img_shape": Shape of the image input to the network as a tuple + (h, w). 
Note that the image may be zero-padded afterward on the + bottom/right if the batch tensor is larger than this shape. + - "scale_factor": A tuple indicating the ratio of width and height + of the preprocessed image to the original one. + - "ori_shape": Shape of the preprocessed image as a tuple + (h, w). + + Args: + meta_keys (Sequence[str], optional): Meta keys to be converted to + the metainfo of ``SERDataSample``. Defaults to ``('img_path', + 'ori_shape', 'img_shape', 'scale_factor')``. + """ + ser_sample_keys = [ + 'input_ids', 'boxes', 'labels', 'position_ids', 'segment_ids', + 'attention_mask' + ] + + def __init__(self, meta_keys=()): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): Data for model forwarding. + - 'data_samples' (obj:`DetDataSample`): The annotation info of the + sample. + """ + + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # A simple trick to speedup formatting by 3-5 times when + # OMP_NUM_THREADS != 1 + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if img.flags.c_contiguous: + img = to_tensor(img) + img = img.permute(2, 0, 1).contiguous() + else: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + packed_results['inputs'] = img + else: + packed_results['inputs'] = torch.FloatTensor().reshape(0, 0, 0) + + data_sample = SERDataSample() + instance_data = InstanceData() + + for key in self.ser_sample_keys: + if key not in results: + continue + instance_data[key] = to_tensor(results[key]) + data_sample.gt_instances = instance_data + + img_meta = {} + for key in self.meta_keys: + img_meta[key] = results[key] + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/mmocr/structures/__init__.py b/mmocr/structures/__init__.py index 2b71ac262..2d8b78857 100644 --- a/mmocr/structures/__init__.py +++ b/mmocr/structures/__init__.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .kie_data_sample import KIEDataSample +from .ser_data_sample import SERDataSample from .textdet_data_sample import TextDetDataSample from .textrecog_data_sample import TextRecogDataSample from .textspotting_data_sample import TextSpottingDataSample __all__ = [ 'TextDetDataSample', 'TextRecogDataSample', 'KIEDataSample', - 'TextSpottingDataSample' + 'TextSpottingDataSample', 'SERDataSample' ] diff --git a/mmocr/structures/ser_data_sample.py b/mmocr/structures/ser_data_sample.py new file mode 100644 index 000000000..10c91a17a --- /dev/null +++ b/mmocr/structures/ser_data_sample.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .kie_data_sample import KIEDataSample + + +class SERDataSample(KIEDataSample): + """A data structure interface of MMOCR. They are used as interfaces between + different components. + + The attributes in ``SERDataSample`` are divided into two parts: + + - ``gt_instances``(InstanceData): Ground truth of instance annotations. + - ``pred_instances``(InstanceData): Instances of model predictions. 
+ + Examples: + >>> import torch + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> from mmocr.data import SERDataSample + >>> # gt_instances + >>> data_sample = SERDataSample() + >>> img_meta = dict(img_shape=(800, 1196, 3), + ... pad_shape=(800, 1216, 3)) + >>> gt_instances = InstanceData(metainfo=img_meta) + >>> gt_instances.bboxes = torch.rand((5, 4)) + >>> gt_instances.labels = torch.rand((5,)) + >>> data_sample.gt_instances = gt_instances + >>> assert 'img_shape' in data_sample.gt_instances.metainfo_keys() + >>> len(data_sample.gt_instances) + 5 + >>> print(data_sample) + + ) at 0x7f21fb1b9880> + >>> # pred_instances + >>> pred_instances = InstanceData(metainfo=img_meta) + >>> pred_instances.bboxes = torch.rand((5, 4)) + >>> pred_instances.scores = torch.rand((5,)) + >>> data_sample = SERDataSample(pred_instances=pred_instances) + >>> assert 'pred_instances' in data_sample + >>> data_sample = SERDataSample() + >>> gt_instances_data = dict( + ... bboxes=torch.rand(2, 4), + ... labels=torch.rand(2)) + >>> gt_instances = InstanceData(**gt_instances_data) + >>> data_sample.gt_instances = gt_instances + >>> assert 'gt_instances' in data_sample + """ diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index 3e4fb6fb2..afdfee26a 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -28,8 +28,9 @@ LabelList, MultiConfig, OptConfigType, OptDetSampleList, OptInitConfigType, OptInstanceList, OptKIESampleList, OptLabelList, - OptMultiConfig, OptRecSampleList, OptTensor, - RangeType, RecForwardResults, RecSampleList) + OptMultiConfig, OptRecSampleList, OptSERSampleList, + OptTensor, RangeType, RecForwardResults, + RecSampleList, SERSampleList) __all__ = [ 'collect_env', 'is_3dlist', 'is_type_list', 'is_none_or_type', 'equal_len', @@ -50,5 +51,5 @@ 'is_archive', 'check_integrity', 'list_files', 'get_md5', 'InstanceList', 'LabelList', 'OptInstanceList', 'OptLabelList', 'RangeType', 'remove_pipeline_elements', 'bezier2poly', 'poly2bezier', - 'track_parallel_progress_multi_args' + 'track_parallel_progress_multi_args', 'SERSampleList', 'OptSERSampleList' ] diff --git a/mmocr/utils/typing_utils.py b/mmocr/utils/typing_utils.py index 592fb36e7..45cbc649b 100644 --- a/mmocr/utils/typing_utils.py +++ b/mmocr/utils/typing_utils.py @@ -9,7 +9,7 @@ from mmengine.structures import InstanceData, LabelData from mmocr import digit_version -from mmocr.structures import (KIEDataSample, TextDetDataSample, +from mmocr.structures import (KIEDataSample, SERDataSample, TextDetDataSample, TextRecogDataSample, TextSpottingDataSample) # Config @@ -29,9 +29,11 @@ RecSampleList = List[TextRecogDataSample] DetSampleList = List[TextDetDataSample] KIESampleList = List[KIEDataSample] +SERSampleList = List[SERDataSample] OptRecSampleList = Optional[RecSampleList] OptDetSampleList = Optional[DetSampleList] OptKIESampleList = Optional[KIESampleList] +OptSERSampleList = Optional[SERSampleList] OptE2ESampleList = Optional[E2ESampleList] OptTensor = Optional[torch.Tensor] From c8a7b68639763883891e22bd61fcc0079db78d70 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 31 Mar 2023 00:55:41 +0800 Subject: [PATCH 17/50] =?UTF-8?q?=E5=88=9D=E6=AD=A5=E6=9E=84=E5=BB=BASER?= =?UTF-8?q?=E9=83=A8=E5=88=86model=E6=96=87=E4=BB=B6=E7=BB=93=E6=9E=84?= =?UTF-8?q?=EF=BC=8CLayoutLMv3DataPreprocessor=E5=8F=82=E6=95=B0=E5=B7=B2?= =?UTF-8?q?=E4=B8=8EHuggingFace=E7=9A=84LayoutLMv3ImageProcessor=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E6=95=B0=E5=80=BC=E5=AF=B9=E9=BD=90?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/models/__init__.py | 1 + mmocr/models/ser/__init__.py | 5 ++ mmocr/models/ser/backbones/layoutlmv3.py | 10 +++ mmocr/models/ser/classifier/__init__.py | 4 ++ .../models/ser/classifier/token_classifier.py | 71 +++++++++++++++++++ .../models/ser/data_preprocessors/__init__.py | 4 ++ .../data_preprocessors/data_preprocessor.py | 58 +++++++++++++++ mmocr/models/ser/heads/layoutlmv3_head.py | 1 + projects/LayoutLMv3/test.py | 47 ++++++++++-- 9 files changed, 195 insertions(+), 6 deletions(-) create mode 100644 mmocr/models/ser/__init__.py create mode 100644 mmocr/models/ser/backbones/layoutlmv3.py create mode 100644 mmocr/models/ser/classifier/__init__.py create mode 100644 mmocr/models/ser/classifier/token_classifier.py create mode 100644 mmocr/models/ser/data_preprocessors/__init__.py create mode 100644 mmocr/models/ser/data_preprocessors/data_preprocessor.py create mode 100644 mmocr/models/ser/heads/layoutlmv3_head.py diff --git a/mmocr/models/__init__.py b/mmocr/models/__init__.py index abea668b3..9f57d5007 100644 --- a/mmocr/models/__init__.py +++ b/mmocr/models/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .common import * # NOQA from .kie import * # NOQA +from .ser import * # NOQA from .textdet import * # NOQA from .textrecog import * # NOQA diff --git a/mmocr/models/ser/__init__.py b/mmocr/models/ser/__init__.py new file mode 100644 index 000000000..40ddef429 --- /dev/null +++ b/mmocr/models/ser/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .backbones import * # NOQA +from .classifier import * # NOQA +from .data_preprocessors import * # NOQA +from .heads import * # NOQA diff --git a/mmocr/models/ser/backbones/layoutlmv3.py b/mmocr/models/ser/backbones/layoutlmv3.py new file mode 100644 index 000000000..016c81fe0 --- /dev/null +++ b/mmocr/models/ser/backbones/layoutlmv3.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmocr.registry import MODELS + +MODELS.register_module() + + +class LayoutLMv3: + + def __init__(self) -> None: + pass diff --git a/mmocr/models/ser/classifier/__init__.py b/mmocr/models/ser/classifier/__init__.py new file mode 100644 index 000000000..1f164a874 --- /dev/null +++ b/mmocr/models/ser/classifier/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .token_classifier import LayoutLMv3TokenClassifier + +__all__ = ['LayoutLMv3TokenClassifier'] diff --git a/mmocr/models/ser/classifier/token_classifier.py b/mmocr/models/ser/classifier/token_classifier.py new file mode 100644 index 000000000..03100df59 --- /dev/null +++ b/mmocr/models/ser/classifier/token_classifier.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch +from mmengine.model import BaseModel + +from mmocr.registry import MODELS +from mmocr.utils.typing_utils import OptSERSampleList, SERSampleList + +ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, + Tuple[torch.Tensor], torch.Tensor] + + +@MODELS.register_module() +class LayoutLMv3TokenClassifier(BaseModel): + + def __init__(self, + backbone: Dict, + cls_head: Dict, + data_preprocessor: Optional[Dict] = None, + init_cfg: Optional[Dict] = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + assert cls_head is not None, 'cls_head cannot be None!' 
+ # self.backbone = MODELS.build(backbone) + # self.cls_head = MODELS.build(cls_head) + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSERSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`SERDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`SERDataSample`], optional): A batch of + data samples that contain annotations and predictions. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`SERDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + data = self.data_preprocessor(inputs, True) + print(data) + # if mode == 'loss': + # return self.loss(inputs, data_samples) + # elif mode == 'predict': + # return self.predict(inputs, data_samples) + # elif mode == 'tensor': + # return self._forward(inputs, data_samples) + # else: + # raise RuntimeError(f'Invalid mode "{mode}". ' + # 'Only supports loss, predict and tensor mode') diff --git a/mmocr/models/ser/data_preprocessors/__init__.py b/mmocr/models/ser/data_preprocessors/__init__.py new file mode 100644 index 000000000..1594b51fd --- /dev/null +++ b/mmocr/models/ser/data_preprocessors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import LayoutLMv3DataPreprocessor + +__all__ = ['LayoutLMv3DataPreprocessor'] diff --git a/mmocr/models/ser/data_preprocessors/data_preprocessor.py b/mmocr/models/ser/data_preprocessors/data_preprocessor.py new file mode 100644 index 000000000..92c8cbd6d --- /dev/null +++ b/mmocr/models/ser/data_preprocessors/data_preprocessor.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmocr.models.textdet.data_preprocessors import TextDetDataPreprocessor +from mmocr.registry import MODELS + + +@MODELS.register_module() +class LayoutLMv3DataPreprocessor(TextDetDataPreprocessor): + """Image pre-processor for LayoutLMv3. + + If you want to get the same processing result as + LayoutLMv3ImageProcessor in HuggingFace, you need to set + mean/std to [127.5, 127.5, 127.5], bgr_to_rgb = True, + and set pipeline Resize backend to `pillow`. + + Like: + + train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict(type='Resize', + scale=(224, 224), + backend='pillow'), # backend=pillow 数值与huggingface对齐 + ... + ] + model_cfg = dict( + ... + data_preprocessor=dict( + type='LayoutLMv3DataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + bgr_to_rgb=True), + ... + ) + + It provides the data pre-processing as follows + + - Collate and move data to the target device. + - Pad inputs to the maximum size of current batch with defined + ``pad_value``. 
The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to batch_inputs. + - Convert inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalize image with defined std and mean. + - Do batch augmentations during training. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be + divisible by ``pad_size_divisor``. Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + bgr_to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): whether to convert image from RGB to RGB. + Defaults to False. + batch_augments (list[dict], optional): Batch-level augmentations + """ diff --git a/mmocr/models/ser/heads/layoutlmv3_head.py b/mmocr/models/ser/heads/layoutlmv3_head.py new file mode 100644 index 000000000..ef101fec6 --- /dev/null +++ b/mmocr/models/ser/heads/layoutlmv3_head.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py index 96df72b1a..cbb757da1 100644 --- a/projects/LayoutLMv3/test.py +++ b/projects/LayoutLMv3/test.py @@ -1,7 +1,11 @@ +from functools import partial + from mmengine.config import Config +from mmengine.dataset.utils import COLLATE_FUNCTIONS from mmengine.registry import init_default_scope +from torch.utils.data import DataLoader -from mmocr.registry import DATASETS +from mmocr.registry import DATASETS, MODELS if __name__ == '__main__': cfg_path = '/Users/wangnu/Documents/GitHub/mmocr/projects/' \ @@ -12,12 +16,43 @@ dataset_cfg = cfg.train_dataset dataset_cfg['tokenizer'] = \ '/Users/wangnu/Documents/GitHub/mmocr/data/layoutlmv3-base-chinese' - train_pipeline = [ dict(type='LoadImageFromFile', color_type='color'), - dict(type='Resize', scale=(224, 224)) + dict(type='Resize', scale=(224, 224), + backend='pillow'), # backend=pillow 数值与huggingface对齐 + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) ] dataset_cfg['pipeline'] = train_pipeline - ds = DATASETS.build(dataset_cfg) - data = ds[0] - print('hi') + train_dataloader_cfg = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dataset_cfg) + + model_cfg = dict( + type='LayoutLMv3TokenClassifier', + backbone=dict(), + cls_head=dict(), + data_preprocessor=dict( + type='LayoutLMv3DataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + bgr_to_rgb=True)) + + train_dataset = DATASETS.build(dataset_cfg) + collate_fn_cfg = dict(type='pseudo_collate') + collate_fn_type = collate_fn_cfg.pop('type') + collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type) + collate_fn = partial(collate_fn, **collate_fn_cfg) + train_dataloader = DataLoader(dataset=train_dataset, collate_fn=collate_fn) + + model = MODELS.build(model_cfg) + + for idx, data_batch in enumerate(train_dataloader): + result = model.forward(data_batch) + break + + print('Done') From e22e466c01002ac60ed122bbc1c00794c7bd2bf0 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 11 Apr 2023 17:38:47 +0800 Subject: [PATCH 18/50] =?UTF-8?q?packer=20metainfo=E5=88=A0=E9=99=A4id2lab?= =?UTF-8?q?el=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
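The hunks below keep only ``orig_labels`` and ``biolabel2id`` in the packed metainfo. For context, a minimal sketch of how such a BIO map can be derived from the original label set; the body of the real ``get_bio_label_list`` helper is not shown in these hunks, so the tag ordering here is illustrative only:

    def get_bio_label_map(orig_labels):
        """'other' collapses to the single tag 'O'; each remaining label
        contributes a 'B-' and an 'I-' tag."""
        bio_labels = ['O']
        for label in orig_labels:
            if label == 'other':
                continue
            bio_labels += ['B-' + label.upper(), 'I-' + label.upper()]
        return {tag: idx for idx, tag in enumerate(bio_labels)}

    # get_bio_label_map(['answer', 'header', 'question', 'other'])
    # -> {'O': 0, 'B-ANSWER': 1, 'I-ANSWER': 2, 'B-HEADER': 3,
    #     'I-HEADER': 4, 'B-QUESTION': 5, 'I-QUESTION': 6}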
mmocr/datasets/preparers/packers/re_packer.py | 13 ++-------- .../datasets/preparers/packers/ser_packer.py | 25 ++++++------------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index 6647764e2..91018cb72 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -19,17 +19,8 @@ class REPacker(SERPacker): { "metainfo": { - "labels": ['answer', 'header', 'other', 'question'], - "id2label": { - "0": "O", - "1": "B-ANSWER", - "2": "I-ANSWER", - "3": "B-HEADER", - "4": "I-HEADER", - "5": "B-QUESTION", - "6": "I-QUESTION" - }, - "label2id": { + "orig_labels": ['answer', 'header', 'other', 'question'], + "biolabel2id": { "O": 0, "B-ANSWER": 1, "I-ANSWER": 2, diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 64aa997ba..2e9e0684d 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -19,17 +19,8 @@ class SERPacker(BasePacker): { "metainfo": { - "labels": ['answer', 'header', 'other', 'question'], - "id2label": { - "0": "O", - "1": "B-ANSWER", - "2": "I-ANSWER", - "3": "B-HEADER", - "4": "I-HEADER", - "5": "B-QUESTION", - "6": "I-QUESTION" - }, - "label2id": { + "orig_labels": ['answer', 'header', 'other', 'question'], + "biolabel2id": { "O": 0, "B-ANSWER": 1, "I-ANSWER": 2, @@ -156,16 +147,14 @@ def get_bio_label_list(labels): labels = [] for s in sample: labels += s['instances']['labels'] - org_label_list = list(set(labels)) - bio_label_list = get_bio_label_list(org_label_list) + orig_label_list = list(set(labels)) + bio_label_list = get_bio_label_list(orig_label_list) meta = { 'metainfo': { - 'labels': org_label_list, - 'id2label': {k: v - for k, v in enumerate(bio_label_list)}, - 'label2id': {v: k - for k, v in enumerate(bio_label_list)} + 'orig_labels': orig_label_list, + 'biolabel2id': {v: k + for k, v in enumerate(bio_label_list)} }, 'data_list': sample } From ceb66dc70dd85fb9184137e5ed5a330c52210d82 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 11 Apr 2023 17:50:28 +0800 Subject: [PATCH 19/50] =?UTF-8?q?=E4=BC=98=E5=8C=96xfund=5Fdataset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/xfund_dataset.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/mmocr/datasets/xfund_dataset.py b/mmocr/datasets/xfund_dataset.py index af050b275..e0866a54b 100644 --- a/mmocr/datasets/xfund_dataset.py +++ b/mmocr/datasets/xfund_dataset.py @@ -3,9 +3,9 @@ from typing import Callable, List, Optional, Sequence, Union from mmengine.dataset import BaseDataset -from transformers import AutoTokenizer from mmocr.registry import DATASETS +from transformers import AutoTokenizer @DATASETS.register_module() @@ -47,8 +47,8 @@ class XFUNDSERDataset(BaseDataset): """ def __init__(self, - ann_file: str = '', - tokenizer: str = '', + ann_file: str, + tokenizer: dict, metainfo: Optional[dict] = None, data_root: Optional[str] = '', data_prefix: dict = dict(img_path=''), @@ -60,9 +60,13 @@ def __init__(self, lazy_init: bool = False, max_refetch: int = 1000) -> None: - assert tokenizer != '', 'tokenizer must be specified.' 
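        # With this change the dataset expects a tokenizer cfg dict rather than
        # a plain string; an illustrative value (path taken from the test script
        # and shortened, shown only as an example):
        #   tokenizer=dict(
        #       pretrained_model_name_or_path='data/layoutlmv3-base-chinese',
        #       use_fast=True)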
- self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer, use_fast=True) + if isinstance(tokenizer, dict) and \ + tokenizer.get('pretrained_model_name_or_path', None): + self.tokenizer = AutoTokenizer.from_pretrained(**tokenizer) + else: + raise TypeError( + 'tokenizer cfg should be a `dict` and a key ' + '`pretrained_model_name_or_path` must be specified') super().__init__( ann_file=ann_file, @@ -115,6 +119,11 @@ def load_data_list(self) -> List[dict]: data_info['position_ids'] = position_ids data_info['img_path'] = img_path data_info['attention_mask'] = attention_mask + # record biolabel2id and id2biolabel + biolabel2id = self.metainfo['biolabel2id'] + data_info['biolabel2id'] = biolabel2id + id2biolabel = {v: k for k, v in biolabel2id.items()} + data_info['id2biolabel'] = id2biolabel split_text_data_list.append(data_info) start = end @@ -164,8 +173,8 @@ def parse_data_info(self, raw_data_info: dict) -> dict: height = raw_data_info['height'] norm_boxes = [self.box_norm(box, width, height) for box in boxes] - # get label2id - label2id = self.metainfo['label2id'] + # get biolabel2id + biolabel2id = self.metainfo['biolabel2id'] # tokenize texts cur_doc_input_ids, cur_doc_boxes, cur_doc_labels = [], [], [] for j in range(len(texts)): @@ -181,12 +190,12 @@ def parse_data_info(self, raw_data_info: dict) -> dict: if cur_label == 'OTHER': cur_labels = ['O'] * len(cur_input_ids) for k in range(len(cur_labels)): - cur_labels[k] = label2id[cur_labels[k]] + cur_labels[k] = biolabel2id[cur_labels[k]] else: cur_labels = [cur_label] * len(cur_input_ids) - cur_labels[0] = label2id['B-' + cur_labels[0]] + cur_labels[0] = biolabel2id['B-' + cur_labels[0]] for k in range(1, len(cur_labels)): - cur_labels[k] = label2id['I-' + cur_labels[k]] + cur_labels[k] = biolabel2id['I-' + cur_labels[k]] assert len(cur_input_ids) == len(cur_labels) cur_doc_input_ids += cur_input_ids From 2eb79c3f568483e52de7a12b3b61ead3d29de3dd Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 11 Apr 2023 17:59:03 +0800 Subject: [PATCH 20/50] =?UTF-8?q?=E6=98=8E=E7=A1=AE=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E7=9A=84metainfo=E7=B1=BB=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/transforms/formatting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmocr/datasets/transforms/formatting.py b/mmocr/datasets/transforms/formatting.py index f18b257d5..ebd4c523b 100644 --- a/mmocr/datasets/transforms/formatting.py +++ b/mmocr/datasets/transforms/formatting.py @@ -416,10 +416,10 @@ def transform(self, results: dict) -> dict: instance_data[key] = to_tensor(results[key]) data_sample.gt_instances = instance_data - img_meta = {} + img_and_text_meta = {} for key in self.meta_keys: - img_meta[key] = results[key] - data_sample.set_metainfo(img_meta) + img_and_text_meta[key] = results[key] + data_sample.set_metainfo(img_and_text_meta) packed_results['data_samples'] = data_sample return packed_results From a6bbe1220330c7e9a7dbc039b5e093a4af7074d2 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 17 Apr 2023 16:24:08 +0800 Subject: [PATCH 21/50] =?UTF-8?q?=E7=AE=80=E5=8C=96=E7=89=88layoutlmv3?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/models/ser/__init__.py | 7 +- mmocr/models/ser/backbones/layoutlmv3.py | 10 -- mmocr/models/ser/classifier/__init__.py | 4 - .../models/ser/classifier/token_classifier.py | 71 --------- .../models/ser/data_preprocessors/__init__.py | 
4 - .../data_preprocessors/data_preprocessor.py | 58 ------- mmocr/models/ser/heads/layoutlmv3_head.py | 1 - mmocr/models/ser/hf_layoutlmv3_wrapper.py | 143 ++++++++++++++++++ .../LayoutLMv3/configs/layoutlmv3_xfund_zh.py | 12 -- 9 files changed, 146 insertions(+), 164 deletions(-) delete mode 100644 mmocr/models/ser/backbones/layoutlmv3.py delete mode 100644 mmocr/models/ser/classifier/__init__.py delete mode 100644 mmocr/models/ser/classifier/token_classifier.py delete mode 100644 mmocr/models/ser/data_preprocessors/__init__.py delete mode 100644 mmocr/models/ser/data_preprocessors/data_preprocessor.py delete mode 100644 mmocr/models/ser/heads/layoutlmv3_head.py create mode 100644 mmocr/models/ser/hf_layoutlmv3_wrapper.py delete mode 100644 projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py diff --git a/mmocr/models/ser/__init__.py b/mmocr/models/ser/__init__.py index 40ddef429..2799c9dab 100644 --- a/mmocr/models/ser/__init__.py +++ b/mmocr/models/ser/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .backbones import * # NOQA -from .classifier import * # NOQA -from .data_preprocessors import * # NOQA -from .heads import * # NOQA +from .hf_layoutlmv3_wrapper import HFLayoutLMv3ForTokenClassificationWrapper + +__all__ = ['HFLayoutLMv3ForTokenClassificationWrapper'] diff --git a/mmocr/models/ser/backbones/layoutlmv3.py b/mmocr/models/ser/backbones/layoutlmv3.py deleted file mode 100644 index 016c81fe0..000000000 --- a/mmocr/models/ser/backbones/layoutlmv3.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmocr.registry import MODELS - -MODELS.register_module() - - -class LayoutLMv3: - - def __init__(self) -> None: - pass diff --git a/mmocr/models/ser/classifier/__init__.py b/mmocr/models/ser/classifier/__init__.py deleted file mode 100644 index 1f164a874..000000000 --- a/mmocr/models/ser/classifier/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .token_classifier import LayoutLMv3TokenClassifier - -__all__ = ['LayoutLMv3TokenClassifier'] diff --git a/mmocr/models/ser/classifier/token_classifier.py b/mmocr/models/ser/classifier/token_classifier.py deleted file mode 100644 index 03100df59..000000000 --- a/mmocr/models/ser/classifier/token_classifier.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Optional, Tuple, Union - -import torch -from mmengine.model import BaseModel - -from mmocr.registry import MODELS -from mmocr.utils.typing_utils import OptSERSampleList, SERSampleList - -ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, - Tuple[torch.Tensor], torch.Tensor] - - -@MODELS.register_module() -class LayoutLMv3TokenClassifier(BaseModel): - - def __init__(self, - backbone: Dict, - cls_head: Dict, - data_preprocessor: Optional[Dict] = None, - init_cfg: Optional[Dict] = None): - super().__init__( - data_preprocessor=data_preprocessor, init_cfg=init_cfg) - assert cls_head is not None, 'cls_head cannot be None!' - # self.backbone = MODELS.build(backbone) - # self.cls_head = MODELS.build(cls_head) - - def forward(self, - inputs: torch.Tensor, - data_samples: OptSERSampleList = None, - mode: str = 'tensor') -> ForwardResults: - """The unified entry for a forward process in both training and test. - - The method should accept three modes: "tensor", "predict" and "loss": - - - "tensor": Forward the whole network and return tensor or tuple of - tensor without any post-processing, same as a common nn.Module. 
- - "predict": Forward and return the predictions, which are fully - processed to a list of :obj:`SERDataSample`. - - "loss": Forward and return a dict of losses according to the given - inputs and data samples. - - Note that this method doesn't handle either back propagation or - parameter update, which are supposed to be done in :meth:`train_step`. - - Args: - inputs (torch.Tensor): The input tensor with shape - (N, C, ...) in general. - data_samples (list[:obj:`SERDataSample`], optional): A batch of - data samples that contain annotations and predictions. - Defaults to None. - mode (str): Return what kind of value. Defaults to 'tensor'. - - Returns: - The return type depends on ``mode``. - - - If ``mode="tensor"``, return a tensor or a tuple of tensor. - - If ``mode="predict"``, return a list of :obj:`SERDataSample`. - - If ``mode="loss"``, return a dict of tensor. - """ - data = self.data_preprocessor(inputs, True) - print(data) - # if mode == 'loss': - # return self.loss(inputs, data_samples) - # elif mode == 'predict': - # return self.predict(inputs, data_samples) - # elif mode == 'tensor': - # return self._forward(inputs, data_samples) - # else: - # raise RuntimeError(f'Invalid mode "{mode}". ' - # 'Only supports loss, predict and tensor mode') diff --git a/mmocr/models/ser/data_preprocessors/__init__.py b/mmocr/models/ser/data_preprocessors/__init__.py deleted file mode 100644 index 1594b51fd..000000000 --- a/mmocr/models/ser/data_preprocessors/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .data_preprocessor import LayoutLMv3DataPreprocessor - -__all__ = ['LayoutLMv3DataPreprocessor'] diff --git a/mmocr/models/ser/data_preprocessors/data_preprocessor.py b/mmocr/models/ser/data_preprocessors/data_preprocessor.py deleted file mode 100644 index 92c8cbd6d..000000000 --- a/mmocr/models/ser/data_preprocessors/data_preprocessor.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmocr.models.textdet.data_preprocessors import TextDetDataPreprocessor -from mmocr.registry import MODELS - - -@MODELS.register_module() -class LayoutLMv3DataPreprocessor(TextDetDataPreprocessor): - """Image pre-processor for LayoutLMv3. - - If you want to get the same processing result as - LayoutLMv3ImageProcessor in HuggingFace, you need to set - mean/std to [127.5, 127.5, 127.5], bgr_to_rgb = True, - and set pipeline Resize backend to `pillow`. - - Like: - - train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color'), - dict(type='Resize', - scale=(224, 224), - backend='pillow'), # backend=pillow 数值与huggingface对齐 - ... - ] - model_cfg = dict( - ... - data_preprocessor=dict( - type='LayoutLMv3DataPreprocessor', - mean=[127.5, 127.5, 127.5], - std=[127.5, 127.5, 127.5], - bgr_to_rgb=True), - ... - ) - - It provides the data pre-processing as follows - - - Collate and move data to the target device. - - Pad inputs to the maximum size of current batch with defined - ``pad_value``. The padding size can be divisible by a defined - ``pad_size_divisor`` - - Stack inputs to batch_inputs. - - Convert inputs from bgr to rgb if the shape of input is (3, H, W). - - Normalize image with defined std and mean. - - Do batch augmentations during training. - - Args: - mean (Sequence[Number], optional): The pixel mean of R, G, B channels. - Defaults to None. - std (Sequence[Number], optional): The pixel standard deviation of - R, G, B channels. Defaults to None. 
- pad_size_divisor (int): The size of padded image should be - divisible by ``pad_size_divisor``. Defaults to 1. - pad_value (Number): The padded pixel value. Defaults to 0. - bgr_to_rgb (bool): whether to convert image from BGR to RGB. - Defaults to False. - rgb_to_bgr (bool): whether to convert image from RGB to RGB. - Defaults to False. - batch_augments (list[dict], optional): Batch-level augmentations - """ diff --git a/mmocr/models/ser/heads/layoutlmv3_head.py b/mmocr/models/ser/heads/layoutlmv3_head.py deleted file mode 100644 index ef101fec6..000000000 --- a/mmocr/models/ser/heads/layoutlmv3_head.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmocr/models/ser/hf_layoutlmv3_wrapper.py b/mmocr/models/ser/hf_layoutlmv3_wrapper.py new file mode 100644 index 000000000..32b1b4ef0 --- /dev/null +++ b/mmocr/models/ser/hf_layoutlmv3_wrapper.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch +from mmengine.model import BaseModel + +from mmocr.registry import MODELS +from mmocr.utils.typing_utils import OptSERSampleList, SERSampleList +from transformers import AutoModelForTokenClassification +from transformers.modeling_outputs import TokenClassifierOutput + +ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, + Tuple[torch.Tensor], torch.Tensor] + + +@MODELS.register_module() +class HFLayoutLMv3ForTokenClassificationWrapper(BaseModel): + + def __init__(self, + classifier: dict = dict(pretrained_model_name_or_path=None), + data_preprocessor: Optional[Dict] = None, + init_cfg: Optional[Dict] = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + if isinstance(classifier, dict) and \ + classifier.get('pretrained_model_name_or_path', None): + self.model = AutoModelForTokenClassification.from_pretrained( + **classifier) + else: + raise TypeError( + 'classifier cfg should be a `dict` and a key ' + '`pretrained_model_name_or_path` must be specified') + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSERSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`SERDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`SERDataSample`], optional): A batch of + data samples that contain annotations and predictions. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`SERDataSample`. + - If ``mode="loss"``, return a dict of tensor. 
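For orientation, ``inputs`` here is the dict packed by ``PackSERInputs`` and batched by the dataloader, so a ``loss``-mode call reduces to something like the sketch below (shapes assumed: batch size N, padded sequence length L of at most 512, images resized to 224x224 as in the test script; the ids are dummies):

    import torch

    N, L = 2, 512
    inputs = dict(
        input_ids=torch.randint(0, 100, (N, L)),       # dummy token ids
        bbox=torch.randint(0, 1001, (N, L, 4)),        # 0-1000 normalized boxes
        attention_mask=torch.ones(N, L, dtype=torch.long),
        position_ids=torch.arange(L).repeat(N, 1),
        pixel_values=torch.rand(N, 3, 224, 224),
        labels=torch.randint(0, 7, (N, L)))            # BIO ids, -100 = ignored
    # outputs = self.model(**inputs)  # transformers TokenClassifierOutput
    # outputs.loss, outputs.logits    # logits shape: (N, L, num_labels)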
+ """ + # copying inputs data to the target device + inputs = self.data_preprocessor(inputs) + + if mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + elif mode == 'tensor': + return self._forward(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + def loss(self, inputs: torch.Tensor, data_samples: SERSampleList) -> Dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (torch.Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + outputs: TokenClassifierOutput = self.model(**inputs) + return outputs + + def predict(self, inputs: torch.Tensor, + data_samples: SERSampleList) -> SERSampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (torch.Tensor): Images of shape (N, C, H, W). + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. + + Returns: + list[SERDataSample]: A list of N datasamples of prediction + results. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - polygons (list[np.ndarray]): The length is num_instances. + Each element represents the polygon of the + instance, in (xn, yn) order. + """ + x = self.extract_feat(inputs) + return self.det_head.predict(x, data_samples) + + def _forward(self, + inputs: torch.Tensor, + data_samples: OptSERSampleList = None, + **kwargs) -> torch.Tensor: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. + + Returns: + Tensor or tuple[Tensor]: A tuple of features from ``det_head`` + forward. 
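Relatedly, once the wrapped model returns a ``TokenClassifierOutput``, its logits can be mapped back to BIO tags with the ``id2biolabel`` dict carried in the data sample metainfo. A small illustrative helper, assuming per-sample logits of shape (L, C):

    import torch

    def decode_bio(logits: torch.Tensor, attention_mask: torch.Tensor,
                   id2biolabel: dict) -> list:
        """Turn per-token logits (L, C) into BIO tag strings, skipping padding."""
        pred_ids = logits.argmax(dim=-1)
        return [
            id2biolabel[int(i)]
            for i, m in zip(pred_ids.tolist(), attention_mask.tolist())
            if m == 1
        ]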
+ """ + x = self.extract_feat(inputs) + return self.det_head(x, data_samples) diff --git a/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py b/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py deleted file mode 100644 index 7a74627c7..000000000 --- a/projects/LayoutLMv3/configs/layoutlmv3_xfund_zh.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = [ - '/Users/wangnu/Documents/GitHub/mmocr/' - 'configs/ser/_base_/datasets/xfund_zh.py' -] - -train_dataset = _base_.xfund_zh_ser_train -train_dataloader = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=train_dataset) From 7951200267f8319ff48fe41a8eb7d3f0e30d9947 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 17 Apr 2023 16:38:13 +0800 Subject: [PATCH 22/50] =?UTF-8?q?=E4=BC=98=E5=8C=96layoutlmv3=E9=A2=84?= =?UTF-8?q?=E5=A4=84=E7=90=86=E4=BB=A3=E7=A0=81=EF=BC=8C=E6=95=B4=E5=90=88?= =?UTF-8?q?=E5=88=B0datasets/transforms=E9=87=8C=EF=BC=8C=E6=9B=B4?= =?UTF-8?q?=E6=98=8E=E7=A1=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/transforms/__init__.py | 4 +- mmocr/datasets/transforms/formatting.py | 41 ++-- .../transforms/layoutlmv3_transforms.py | 224 ++++++++++++++++++ mmocr/datasets/xfund_dataset.py | 19 +- 4 files changed, 260 insertions(+), 28 deletions(-) create mode 100644 mmocr/datasets/transforms/layoutlmv3_transforms.py diff --git a/mmocr/datasets/transforms/__init__.py b/mmocr/datasets/transforms/__init__.py index 194305ad6..c4c258be9 100644 --- a/mmocr/datasets/transforms/__init__.py +++ b/mmocr/datasets/transforms/__init__.py @@ -2,6 +2,8 @@ from .adapters import MMDet2MMOCR, MMOCR2MMDet from .formatting import (PackKIEInputs, PackSERInputs, PackTextDetInputs, PackTextRecogInputs) +from .layoutlmv3_transforms import (ProcessImageForLayoutLMv3, + ProcessTokenForLayoutLMv3) from .loading import (InferencerLoader, LoadImageFromFile, LoadImageFromNDArray, LoadKIEAnnotations, LoadOCRAnnotations) @@ -25,5 +27,5 @@ 'MMOCR2MMDet', 'LoadImageFromFile', 'LoadImageFromNDArray', 'CropHeight', 'InferencerLoader', 'RemoveIgnored', 'ConditionApply', 'CropHeight', 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels', - 'PackSERInputs' + 'PackSERInputs', 'ProcessImageForLayoutLMv3', 'ProcessTokenForLayoutLMv3' ] diff --git a/mmocr/datasets/transforms/formatting.py b/mmocr/datasets/transforms/formatting.py index ebd4c523b..41c75f138 100644 --- a/mmocr/datasets/transforms/formatting.py +++ b/mmocr/datasets/transforms/formatting.py @@ -332,7 +332,7 @@ def __repr__(self) -> str: @TRANSFORMS.register_module() class PackSERInputs(BaseTransform): - """Pack the inputs data for Semantic Entity Recognition. + """Pack the inputs data for LayoutLMv3ForTokenClassification model. The type of outputs is `dict`: @@ -358,24 +358,25 @@ class PackSERInputs(BaseTransform): bottom/right if the batch tensor is larger than this shape. - "scale_factor": A tuple indicating the ratio of width and height of the preprocessed image to the original one. - - "ori_shape": Shape of the preprocessed image as a tuple - (h, w). + - "ori_shape": Shape of the preprocessed image as a tuple (h, w). + - "id2biolabel": Label id convert to biolabel map dict. Args: meta_keys (Sequence[str], optional): Meta keys to be converted to the metainfo of ``SERDataSample``. Defaults to ``('img_path', - 'ori_shape', 'img_shape', 'scale_factor')``. + 'ori_shape', 'img_shape', 'scale_factor', 'id2biolabel')``. 
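+
+    Example:
+        A rough sketch of one packed sample (keys follow ``ser_keys`` below;
+        the concrete shapes depend on the tokenizer and image processor
+        settings used earlier in the pipeline):
+
+        .. code-block:: python
+
+            {
+                'inputs': {
+                    'input_ids': ...,       # (seq_len, )
+                    'bbox': ...,            # (seq_len, 4)
+                    'attention_mask': ...,  # (seq_len, )
+                    'position_ids': ...,    # (seq_len, )
+                    'pixel_values': ...,    # (C, H, W)
+                    'labels': ...,          # (seq_len, )
+                },
+                'data_samples': ...  # SERDataSample carrying ``meta_keys`` info
+            }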
""" - ser_sample_keys = [ - 'input_ids', 'boxes', 'labels', 'position_ids', 'segment_ids', - 'attention_mask' + # HF LayoutLMv3ForTokenClassification model input params. + ser_keys = [ + 'input_ids', 'bbox', 'attention_mask', 'position_ids', 'pixel_values', + 'labels' ] def __init__(self, meta_keys=()): self.meta_keys = meta_keys def transform(self, results: dict) -> dict: - """Method to pack the input data. + """Method to pack SER input data. Args: results (dict): Result dict from the data pipeline. @@ -389,8 +390,8 @@ def transform(self, results: dict) -> dict: """ packed_results = dict() - if 'img' in results: - img = results['img'] + if 'pixel_values' in results: + img = results['pixel_values'] if len(img.shape) < 3: img = np.expand_dims(img, -1) # A simple trick to speedup formatting by 3-5 times when @@ -403,23 +404,23 @@ def transform(self, results: dict) -> dict: else: img = np.ascontiguousarray(img.transpose(2, 0, 1)) img = to_tensor(img) - packed_results['inputs'] = img - else: - packed_results['inputs'] = torch.FloatTensor().reshape(0, 0, 0) + results['pixel_values'] = img data_sample = SERDataSample() - instance_data = InstanceData() + # instance_data = InstanceData() - for key in self.ser_sample_keys: + inputs = {} + for key in self.ser_keys: if key not in results: continue - instance_data[key] = to_tensor(results[key]) - data_sample.gt_instances = instance_data + inputs[key] = to_tensor(results[key]) + packed_results['inputs'] = inputs + # data_sample.gt_instances = instance_data - img_and_text_meta = {} + meta = {} for key in self.meta_keys: - img_and_text_meta[key] = results[key] - data_sample.set_metainfo(img_and_text_meta) + meta[key] = results[key] + data_sample.set_metainfo(meta) packed_results['data_samples'] = data_sample return packed_results diff --git a/mmocr/datasets/transforms/layoutlmv3_transforms.py b/mmocr/datasets/transforms/layoutlmv3_transforms.py new file mode 100644 index 000000000..f3cb61ebd --- /dev/null +++ b/mmocr/datasets/transforms/layoutlmv3_transforms.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Union + +from mmcv.transforms.base import BaseTransform + +from mmocr.registry import TRANSFORMS +from transformers import AutoImageProcessor, LayoutLMv3ImageProcessor +from transformers.file_utils import PaddingStrategy +from transformers.image_processing_utils import BatchFeature +from transformers.image_utils import ChannelDimension +from transformers.tokenization_utils_base import BatchEncoding + + +@TRANSFORMS.register_module() +class ProcessImageForLayoutLMv3(BaseTransform): + """A transform to process image for LayoutLMv3, which will use HuggingFace + `AutoImageProcessor` + + Required Keys: + + - img + - img_shape + + Modified Keys: + + - img_shape + + Added Keys: + + - scale_factor + - pixel_values + + Args: + image_processor (dict): The image_processor cfg, which the key + `pretrained_model_name_or_path` must be specified. 
+ """ + + image_processor_class = (LayoutLMv3ImageProcessor) + + def __init__(self, + image_processor: dict = dict( + pretrained_model_name_or_path=None), + label_pad_token_id: int = -100) -> None: + super().__init__() + if isinstance(image_processor, dict) and \ + image_processor.get('pretrained_model_name_or_path', None): + self.image_processor = AutoImageProcessor.from_pretrained( + **image_processor) + else: + raise TypeError( + 'image_processor cfg should be a `dict` and a key ' + '`pretrained_model_name_or_path` must be specified') + + if not isinstance(self.image_processor, self.image_processor_class): + raise ValueError( + f'Received a {type(self.image_processor)} for argument ' + f'image_processor, but a {self.image_processor_class} ' + 'was expected.') + + # TODO: support apply_ocr + if self.image_processor.apply_ocr: + raise ValueError( + 'Now only support initialized the image processor ' + 'with apply_ocr set to False.') + + self.label_pad_token_id = label_pad_token_id + + def _resize_rescale_norm(self, results: dict) -> None: + """apply the image_processor to process img.""" + img = results['img'] + h, w = results['img_shape'] + + features: BatchFeature = self.image_processor( + images=img, return_tensors='np', data_format=ChannelDimension.LAST) + + # output default dims NHWC and here N=1 + pixel_values = features['pixel_values'][0] + new_h, new_w = pixel_values.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + results['pixel_values'] = pixel_values + results['img_shape'] = (new_h, new_w) + results['scale_factor'] = (w_scale, h_scale) + + def transform(self, results: dict) -> Dict: + self._resize_rescale_norm(results) + return results + + +@TRANSFORMS.register_module() +class ProcessTokenForLayoutLMv3(BaseTransform): + """A transform to process token, which will dynamically pad the inputs + received, as well as the labels. + + Part of code is modified from `https://github.com/microsoft/unilm/blob + /master/layoutlmv3/layoutlmft/data/data_collator.py` and `https:// + github.com/huggingface/transformers/blob/main/src/transformers/models/ + layoutlmv3/processing_layoutlmv3.py`. + + Required Keys: + + - tokenizer + - input_ids + - attention_mask + - labels + - bbox + - position_ids + - segment_ids(optional) + + Modified Keys: + + - input_ids + - attention_mask + - labels + - bbox + - position_ids + - segment_ids(optional) + + Args: + padding (:obj:`bool`, :obj:`str` or :class: + `~transformers.file_utils.PaddingStrategy`, + `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences + (according to the model's padding side and padding index) + among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest + sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified + with the argument :obj:`max_length` or to the maximum + acceptable input length for the model if that argument + is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No + padding (i.e., can output a batch with sequences + of different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally + padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the + provided value. This is especially useful to enable the + use of Tensor Cores on NVIDIA hardware with compute + capability >= 7.5 (Volta). 
+ label_pad_token_id (:obj:`int`, `optional`, defaults to -100): + The id to use when padding the labels (-100 will be + automatically ignore by PyTorch loss functions). + """ + + padded_input_names = ['input_ids', 'attention_mask'] + + def __init__(self, + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + label_pad_token_id: int = -100) -> None: + super().__init__() + self.padding = padding + self.max_length = max_length + self.pad_to_multiple_of = pad_to_multiple_of + self.label_pad_token_id = label_pad_token_id + + def _pad(self, results: dict) -> None: + # get tokenizer + tokenizer = results['tokenizer'] + + # There will be a warning advice: + # You're using a XLMRobertaTokenizerFast tokenizer. + # Please note that with a fast tokenizer, using the + # `__call__` method is faster than using a method to + # encode the text followed by a call to the `pad` + # method to get a padded encoding. + # But `__call__` method only supports input string text, + # which has already been encoded before this step. + features = { + k: v + for k, v in results.items() if k in self.padded_input_names + } + batch: BatchEncoding = tokenizer.pad( + encoded_inputs=features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of) + # update `input_ids` and `attention_mask` + results.update(batch) + + has_bbox_input = 'bbox' in results + has_position_input = 'position_ids' in results + has_segment_input = 'segment_ids' in results + sequence_length = len(results['input_ids']) + if tokenizer.padding_side == 'right': + results[ + 'labels'] = results['labels'] + [self.label_pad_token_id] * ( + sequence_length - len(results['labels'])) + if has_bbox_input: + results['bbox'] = results['bbox'] + [[0, 0, 0, 0]] * ( + sequence_length - len(results['bbox'])) + if has_position_input: + results['position_ids'] = results['position_ids'] + [ + tokenizer.pad_token_id + ] * ( + sequence_length - len(results['position_ids'])) + if has_segment_input: + results['segment_ids'] = results['segment_ids'] + [ + results['segment_ids'][-1] + 1 + ] * ( + sequence_length - len(results['segment_ids'])) + else: + results['labels'] = [self.label_pad_token_id] * ( + sequence_length - len(results['labels'])) + results['labels'] + if has_bbox_input: + results['bbox'] = [[0, 0, 0, 0]] * ( + sequence_length - len(results['bbox'])) + results['bbox'] + if has_position_input: + results['position_ids'] = [tokenizer.pad_token_id] * ( + sequence_length - + len(results['position_ids'])) + results['position_ids'] + if has_segment_input: + results['segment_ids'] = [results['segment_ids'][-1] + 1] * ( + sequence_length - + len(results['segment_ids'])) + results['segment_ids'] + + def transform(self, results: dict) -> Dict: + self._pad(results) + return results diff --git a/mmocr/datasets/xfund_dataset.py b/mmocr/datasets/xfund_dataset.py index e0866a54b..728eda4bc 100644 --- a/mmocr/datasets/xfund_dataset.py +++ b/mmocr/datasets/xfund_dataset.py @@ -47,8 +47,8 @@ class XFUNDSERDataset(BaseDataset): """ def __init__(self, - ann_file: str, - tokenizer: dict, + ann_file: str = '', + tokenizer: dict = dict(pretrained_model_name_or_path=None), metainfo: Optional[dict] = None, data_root: Optional[str] = '', data_prefix: dict = dict(img_path=''), @@ -113,17 +113,22 @@ def load_data_list(self) -> List[dict]: data_info = {} data_info['input_ids'] = input_ids - data_info['boxes'] = boxes + # set key=bbox in order to be consistent with + # 
HuggingFace LayoutLMv3Model input params + data_info['bbox'] = boxes data_info['labels'] = labels data_info['segment_ids'] = segment_ids data_info['position_ids'] = position_ids data_info['img_path'] = img_path data_info['attention_mask'] = attention_mask - # record biolabel2id and id2biolabel - biolabel2id = self.metainfo['biolabel2id'] - data_info['biolabel2id'] = biolabel2id - id2biolabel = {v: k for k, v in biolabel2id.items()} + # record id2biolabel + id2biolabel = { + v: k + for k, v in self.metainfo['biolabel2id'].items() + } data_info['id2biolabel'] = id2biolabel + # record tokenizer + data_info['tokenizer'] = self.tokenizer split_text_data_list.append(data_info) start = end From 3ddf7808aac00cde4c47b540767fd342f4b0e661 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 17 Apr 2023 16:42:22 +0800 Subject: [PATCH 23/50] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../configs/_base_/default_runtime.py | 41 ++++++++++ .../configs/ser/layoutlmv3_xfund_zh.py | 61 +++++++++++++++ projects/LayoutLMv3/scripts/run_ser.sh | 6 ++ projects/LayoutLMv3/test.py | 74 ++++++++++++------- 4 files changed, 157 insertions(+), 25 deletions(-) create mode 100644 projects/LayoutLMv3/configs/_base_/default_runtime.py create mode 100644 projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py create mode 100644 projects/LayoutLMv3/scripts/run_ser.sh diff --git a/projects/LayoutLMv3/configs/_base_/default_runtime.py b/projects/LayoutLMv3/configs/_base_/default_runtime.py new file mode 100644 index 000000000..81480273b --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/default_runtime.py @@ -0,0 +1,41 @@ +default_scope = 'mmocr' +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +randomness = dict(seed=None) + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=5), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=20), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffer=dict(type='SyncBuffersHook'), + visualization=dict( + type='VisualizationHook', + interval=1, + enable=False, + show=False, + draw_gt=False, + draw_pred=False), +) + +# Logging +log_level = 'INFO' +log_processor = dict(type='LogProcessor', window_size=10, by_epoch=True) + +load_from = None +resume = False + +# Evaluation +val_evaluator = dict(type='HmeanIOUMetric') +test_evaluator = val_evaluator + +# Visualization +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TextDetLocalVisualizer', + name='visualizer', + vis_backends=vis_backends) diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py new file mode 100644 index 000000000..63d334280 --- /dev/null +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py @@ -0,0 +1,61 @@ +_base_ = [ + '/Users/wangnu/Documents/GitHub/mmocr/' + 'configs/ser/_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py' +] + +# specify a pretrained model +pretrained_model = '/Users/wangnu/Documents' +'/GitHub/mmocr/data/layoutlmv3-base-chinese' +# set classes +classes = ('answer', 'header', 'question', 'other') + +# optimizer +max_epochs = 10 +optim_wrapper = dict(type='OptimWrapper', optimizer=dict(type='Adam', lr=1e-3)) +param_scheduler = [ + dict(type='PolyLR', power=0.9, 
end=max_epochs), +] +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +train_dataset = _base_.xfund_zh_ser_train +# specify a tokenizer for the dataset +train_dataset['tokenizer'] = dict( + pretrained_model_name_or_path=pretrained_model, use_fast=True) +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict( + type='ProcessImageForLayoutLMv3', + image_processor=dict( + pretrained_model_name_or_path=pretrained_model, + size=(224, 224), + apply_ocr=False)), + dict( + type='ProcessTokenForLayoutLMv3', padding='max_length', + max_length=512), + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'id2biolabel')) +] +train_dataset.pipeline = train_pipeline +# set collate_fn='default_collate' for the dataloader +collate_fn = dict(type='default_collate') +train_dataloader = dict( + batch_size=1, + num_workers=1, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=collate_fn, + dataset=train_dataset) + +model = dict( + type='HFLayoutLMv3ForTokenClassificationWrapper', + classifier=dict( + pretrained_model_name_or_path=pretrained_model, + num_labels=len(classes) * 2 - 1), + data_preprocessor=None) + +val_evaluator = None +test_evaluator = None diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh new file mode 100644 index 000000000..05770d4a1 --- /dev/null +++ b/projects/LayoutLMv3/scripts/run_ser.sh @@ -0,0 +1,6 @@ +config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py' + +export TOKENIZERS_PARALLELISM=false + +python tools/train.py \ +${config} \ diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py index cbb757da1..c6ca4ae44 100644 --- a/projects/LayoutLMv3/test.py +++ b/projects/LayoutLMv3/test.py @@ -2,57 +2,81 @@ from mmengine.config import Config from mmengine.dataset.utils import COLLATE_FUNCTIONS -from mmengine.registry import init_default_scope +from mmengine.registry import DATA_SAMPLERS, init_default_scope from torch.utils.data import DataLoader from mmocr.registry import DATASETS, MODELS if __name__ == '__main__': cfg_path = '/Users/wangnu/Documents/GitHub/mmocr/projects/' \ - 'LayoutLMv3/configs/layoutlmv3_xfund_zh.py' + 'LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py' cfg = Config.fromfile(cfg_path) init_default_scope(cfg.get('default_scope', 'mmocr')) + pretrained_model = '/Users/wangnu/Documents/GitHub' + '/mmocr/data/layoutlmv3-base-chinese' + dataset_cfg = cfg.train_dataset - dataset_cfg['tokenizer'] = \ - '/Users/wangnu/Documents/GitHub/mmocr/data/layoutlmv3-base-chinese' + dataset_cfg['tokenizer'] = dict( + pretrained_model_name_or_path=pretrained_model, use_fast=True) train_pipeline = [ dict(type='LoadImageFromFile', color_type='color'), - dict(type='Resize', scale=(224, 224), - backend='pillow'), # backend=pillow 数值与huggingface对齐 + dict( + type='ProcessImageForLayoutLMv3', + image_processor=dict( + pretrained_model_name_or_path=pretrained_model, + size=(224, 224), + apply_ocr=False)), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512), dict( type='PackSERInputs', - meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'id2biolabel')) ] dataset_cfg['pipeline'] = train_pipeline - train_dataloader_cfg = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - 
sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dataset_cfg) + train_dataset = DATASETS.build(dataset_cfg) model_cfg = dict( - type='LayoutLMv3TokenClassifier', - backbone=dict(), - cls_head=dict(), - data_preprocessor=dict( - type='LayoutLMv3DataPreprocessor', - mean=[127.5, 127.5, 127.5], - std=[127.5, 127.5, 127.5], - bgr_to_rgb=True)) + type='HFLayoutLMv3ForTokenClassificationWrapper', + classifier=dict( + pretrained_model_name_or_path=pretrained_model, num_labels=7), + data_preprocessor=None) - train_dataset = DATASETS.build(dataset_cfg) - collate_fn_cfg = dict(type='pseudo_collate') + collate_fn_cfg = dict(type='default_collate') collate_fn_type = collate_fn_cfg.pop('type') collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type) collate_fn = partial(collate_fn, **collate_fn_cfg) - train_dataloader = DataLoader(dataset=train_dataset, collate_fn=collate_fn) + + sampler_cfg = dict( + type='DefaultSampler', dataset=train_dataset, shuffle=True) + sampler = DATA_SAMPLERS.build(sampler_cfg) + + from mmengine.dataset.utils import worker_init_fn as default_worker_init_fn + init_fn = partial( + default_worker_init_fn, + num_workers=2, + rank=0, + seed=301967075, + disable_subprocess_warning=False) + + train_dataloader = DataLoader( + batch_size=1, + dataset=train_dataset, + pin_memory=True, + persistent_workers=True, + sampler=sampler, + collate_fn=collate_fn, + num_workers=2, + worker_init_fn=init_fn) model = MODELS.build(model_cfg) for idx, data_batch in enumerate(train_dataloader): - result = model.forward(data_batch) + print(idx) + result = model.forward(**data_batch, mode='loss') break print('Done') From 84be2640d9148e9cb5d9e5770c75227929fc545b Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 18 Apr 2023 14:36:45 +0800 Subject: [PATCH 24/50] =?UTF-8?q?=E9=87=8D=E6=9E=84xfund=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86mmocr=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/re/_base_/datasets/xfund_zh.py | 4 +- configs/ser/_base_/datasets/xfund_zh.py | 4 +- .../xfund_config_generator.py | 110 ++++++------------ mmocr/datasets/preparers/packers/re_packer.py | 16 +-- .../datasets/preparers/packers/ser_packer.py | 42 +------ 5 files changed, 44 insertions(+), 132 deletions(-) diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py index 4a44301dd..06fb11c09 100644 --- a/configs/re/_base_/datasets/xfund_zh.py +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -1,13 +1,13 @@ xfund_zh_re_data_root = 'data/xfund/zh' xfund_zh_re_train = dict( - type='XFUNDREDataset', + type='XFUNDDataset', data_root=xfund_zh_re_data_root, ann_file='re_train.json', pipeline=None) xfund_zh_re_test = dict( - type='XFUNDREDataset', + type='XFUNDDataset', data_root=xfund_zh_re_data_root, ann_file='re_test.json', test_mode=True, diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py index 40bbce4de..e790a7bf6 100644 --- a/configs/ser/_base_/datasets/xfund_zh.py +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -1,13 +1,13 @@ xfund_zh_ser_data_root = 'data/xfund/zh' xfund_zh_ser_train = dict( - type='XFUNDSERDataset', + type='XFUNDDataset', data_root=xfund_zh_ser_data_root, ann_file='ser_train.json', pipeline=None) xfund_zh_ser_test = dict( - type='XFUNDSERDataset', + type='XFUNDDataset', data_root=xfund_zh_ser_data_root, ann_file='ser_test.json', test_mode=True, diff --git a/mmocr/datasets/preparers/config_generators/xfund_config_generator.py 
b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py index ca80375bc..0bc243505 100644 --- a/mmocr/datasets/preparers/config_generators/xfund_config_generator.py +++ b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py @@ -35,22 +35,22 @@ class XFUNDSERConfigGenerator(BaseDatasetConfigGenerator): config_path (str): Path to the configs. Defaults to 'configs/'. """ - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='ser_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='ser_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + if '/' in dataset_name: dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( data_root=data_root, task='ser', @@ -87,7 +87,7 @@ def _gen_dataset_config(self) -> str: cfg = '' for key_name, ann_dict in self.anns.items(): cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'XFUNDSERDataset\',\n' + cfg += ' type=\'XFUNDDataset\',\n' cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' if ann_dict['split'] in ['test', 'val']: @@ -100,48 +100,28 @@ def _gen_dataset_config(self) -> str: class XFUNDREConfigGenerator(BaseDatasetConfigGenerator): """XFUND dataset Relation Extraction task config generator. - Args: - data_root (str): The root path of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='re_train.json', dataset_postfix='')]``. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='re_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. 
+ The main difference with `XFUNDSERConfigGenerator` is: + - train_anns/val_anns/test_anns default file name: + f'{ser or re}_{train or test}.json' + - the value of self.task: 'ser' or 're' """ - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='re_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='re_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='re_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='re_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + if '/' in dataset_name: dataset_name = '_'.join(dataset_name.split('/')) + super().__init__( data_root=data_root, task='re', @@ -154,31 +134,11 @@ def __init__( ) def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Args: - ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps - a config variable name (such as icdar2015_textrecog_train) to - its corresponding annotation information dict. Each dict - contains following keys: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults - to None. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - Returns: - str: The generated dataset config. - """ + """Same as `XFUNDSERConfigGenerator._gen_dataset_config()`""" cfg = '' for key_name, ann_dict in self.anns.items(): cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'XFUNDREDataset\',\n' + cfg += ' type=\'XFUNDDataset\',\n' cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' if ann_dict['split'] in ['test', 'val']: diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py index 91018cb72..62dca972c 100644 --- a/mmocr/datasets/preparers/packers/re_packer.py +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -12,24 +12,12 @@ @DATA_PACKERS.register_module() class REPacker(SERPacker): """Relation Extraction packer. It is used to pack the parsed annotation - info to. + info to MMOCR format. .. code-block:: python { - "metainfo": - { - "orig_labels": ['answer', 'header', 'other', 'question'], - "biolabel2id": { - "O": 0, - "B-ANSWER": 1, - "I-ANSWER": 2, - "B-HEADER": 3, - "I-HEADER": 4, - "B-QUESTION": 5, - "I-QUESTION": 6 - } - }, + "metainfo": {}, "data_list": [ { diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py index 2e9e0684d..798cfc4a2 100644 --- a/mmocr/datasets/preparers/packers/ser_packer.py +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -12,24 +12,12 @@ @DATA_PACKERS.register_module() class SERPacker(BasePacker): """Semantic Entity Recognition packer. It is used to pack the parsed - annotation info to. + annotation info to MMOCR format. .. 
code-block:: python { - "metainfo": - { - "orig_labels": ['answer', 'header', 'other', 'question'], - "biolabel2id": { - "O": 0, - "B-ANSWER": 1, - "I-ANSWER": 2, - "B-HEADER": 3, - "I-HEADER": 4, - "B-QUESTION": 5, - "I-QUESTION": 6 - } - }, + "metainfo": {}, "data_list": [ { @@ -133,29 +121,5 @@ def add_meta(self, sample: List) -> Dict: Returns: Dict: A dict contains the meta information and samples. """ - - def get_bio_label_list(labels): - bio_label_list = [] - for label in labels: - if label == 'other': - bio_label_list.insert(0, 'O') - else: - bio_label_list.append(f'B-{label.upper()}') - bio_label_list.append(f'I-{label.upper()}') - return bio_label_list - - labels = [] - for s in sample: - labels += s['instances']['labels'] - orig_label_list = list(set(labels)) - bio_label_list = get_bio_label_list(orig_label_list) - - meta = { - 'metainfo': { - 'orig_labels': orig_label_list, - 'biolabel2id': {v: k - for k, v in enumerate(bio_label_list)} - }, - 'data_list': sample - } + meta = {'metainfo': {}, 'data_list': sample} return meta From 2767fcce6782747fb2c39a04c4264da5f5cfca56 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 15:37:07 +0800 Subject: [PATCH 25/50] =?UTF-8?q?=E7=AE=80=E5=8C=96XFUNDDataset=EF=BC=8C?= =?UTF-8?q?=E4=B8=8D=E5=86=8D=E6=8C=89ser/re=E4=BB=BB=E5=8A=A1=E5=8C=BA?= =?UTF-8?q?=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/__init__.py | 4 +- mmocr/datasets/xfund_dataset.py | 294 +++++--------------------------- 2 files changed, 49 insertions(+), 249 deletions(-) diff --git a/mmocr/datasets/__init__.py b/mmocr/datasets/__init__.py index ab00e0328..f7e746149 100644 --- a/mmocr/datasets/__init__.py +++ b/mmocr/datasets/__init__.py @@ -7,9 +7,9 @@ from .samplers import * # NOQA from .transforms import * # NOQA from .wildreceipt_dataset import WildReceiptDataset -from .xfund_dataset import XFUNDSERDataset +from .xfund_dataset import XFUNDDataset __all__ = [ 'IcdarDataset', 'OCRDataset', 'RecogLMDBDataset', 'RecogTextDataset', - 'WildReceiptDataset', 'ConcatDataset', 'XFUNDSERDataset' + 'WildReceiptDataset', 'ConcatDataset', 'XFUNDDataset' ] diff --git a/mmocr/datasets/xfund_dataset.py b/mmocr/datasets/xfund_dataset.py index 728eda4bc..5a7fda46b 100644 --- a/mmocr/datasets/xfund_dataset.py +++ b/mmocr/datasets/xfund_dataset.py @@ -1,257 +1,57 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os -from typing import Callable, List, Optional, Sequence, Union - from mmengine.dataset import BaseDataset from mmocr.registry import DATASETS -from transformers import AutoTokenizer @DATASETS.register_module() -class XFUNDSERDataset(BaseDataset): - """XFUND Dataset for Semantic Entity Recognition task. part of code is - modified from https://github.com/microsoft/unilm/blob/master/layoutlmv3/lay - outlmft/data/xfund.py. +class XFUNDDataset(BaseDataset): + """XFUND Dataset for Semantic Entity Recognition and Relation Extraction + task. + + The annotation format is shown as follows. + + .. 
code-block:: none + + { + "metainfo":{}, + "data_list": + [ + { + "img_path": "data/xfund/zh/imgs/train/zh_train_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "linkings": [[0, 1], [2, 3], ...], (RE task will have) + "ids": [0, 1, ...], (RE task will have) + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + }, + ] + } Args: - ann_file (str): Annotation file path. Defaults to ''. - tokenizer (str): The pre-trained tokenizer you want to use. - Defaults to ''. - metainfo (dict, optional): Meta information for dataset, such as class - information. Defaults to None. - data_root (str): The root directory for ``data_prefix`` and - ``ann_file``. Defaults to ''. - data_prefix (dict): Prefix for training data. Defaults to - ``dict(img_path='')``. - filter_cfg (dict, optional): Config for filter data. Defaults to None. - indices (int or Sequence[int], optional): Support using first few - data in annotation file to facilitate training/testing on a smaller - dataset. Defaults to None which means using all ``data_infos``. - serialize_data (bool, optional): Whether to hold memory using - serialized objects, when enabled, data loader workers can use - shared RAM from master process instead of making a copy. Defaults - to True. - pipeline (list, optional): Processing pipeline. Defaults to []. - test_mode (bool, optional): ``test_mode=True`` means in test phase. - Defaults to False. - lazy_init (bool, optional): Whether to load annotation during - instantiation. In some cases, such as visualization, only the meta - information of the dataset is needed, which is not necessary to - load annotation file. ``RecogLMDBDataset`` can skip load - annotations to save time by set ``lazy_init=False``. - Defaults to False. - max_refetch (int, optional): If ``RecogLMDBdataset.prepare_data`` get a - None img. The maximum extra number of cycles to get a valid - image. Defaults to 1000. 
+ The same as OCRDataset """ - - def __init__(self, - ann_file: str = '', - tokenizer: dict = dict(pretrained_model_name_or_path=None), - metainfo: Optional[dict] = None, - data_root: Optional[str] = '', - data_prefix: dict = dict(img_path=''), - filter_cfg: Optional[dict] = None, - indices: Optional[Union[int, Sequence[int]]] = None, - serialize_data: bool = True, - pipeline: List[Union[dict, Callable]] = [], - test_mode: bool = False, - lazy_init: bool = False, - max_refetch: int = 1000) -> None: - - if isinstance(tokenizer, dict) and \ - tokenizer.get('pretrained_model_name_or_path', None): - self.tokenizer = AutoTokenizer.from_pretrained(**tokenizer) - else: - raise TypeError( - 'tokenizer cfg should be a `dict` and a key ' - '`pretrained_model_name_or_path` must be specified') - - super().__init__( - ann_file=ann_file, - metainfo=metainfo, - data_root=data_root, - data_prefix=data_prefix, - filter_cfg=filter_cfg, - indices=indices, - serialize_data=serialize_data, - pipeline=pipeline, - test_mode=test_mode, - lazy_init=lazy_init, - max_refetch=max_refetch) - - def load_data_list(self) -> List[dict]: - data_list = super().load_data_list() - - # split text to several slices because of over-length - split_text_data_list = [] - for i in range(len(data_list)): - start = 0 - cur_iter = 0 - while start < len(data_list[i]['input_ids']): - end = min(start + 510, len(data_list[i]['input_ids'])) - # get input_ids - input_ids = [self.tokenizer.cls_token_id] + \ - data_list[i]['input_ids'][start:end] + \ - [self.tokenizer.sep_token_id] - # get boxes - boxes = [[0, 0, 0, 0]] + \ - data_list[i]['boxes'][start:end] + \ - [[1000, 1000, 1000, 1000]] - # get labels - labels = [-100] + data_list[i]['labels'][start:end] + [-100] - # get segment_ids - segment_ids = self.get_segment_ids(boxes) - # get position_ids - position_ids = self.get_position_ids(segment_ids) - # get img_path - img_path = os.path.join(self.data_root, - data_list[i]['img_path']) - # get attention_mask - attention_mask = [1] * len(input_ids) - - data_info = {} - data_info['input_ids'] = input_ids - # set key=bbox in order to be consistent with - # HuggingFace LayoutLMv3Model input params - data_info['bbox'] = boxes - data_info['labels'] = labels - data_info['segment_ids'] = segment_ids - data_info['position_ids'] = position_ids - data_info['img_path'] = img_path - data_info['attention_mask'] = attention_mask - # record id2biolabel - id2biolabel = { - v: k - for k, v in self.metainfo['biolabel2id'].items() - } - data_info['id2biolabel'] = id2biolabel - # record tokenizer - data_info['tokenizer'] = self.tokenizer - split_text_data_list.append(data_info) - - start = end - cur_iter += 1 - - return split_text_data_list - - def parse_data_info(self, raw_data_info: dict) -> dict: - """Parse raw data information, tokenize texts and normalize boxes. - - raw_data_info - { - "img_path": "imgs\\test\\zh_val_0.jpg", - "height": 3508, - "width": 2480, - "instances": - { - "texts": ["汇丰晋信", "受理时间:", ...], - "boxes": [[104, 114, 530, 175], - [126, 267, 266, 305], ...], - "labels": ["other", "question", ...], - "words": [[...], [...], ...] - } - } - will be modified to data_info - { - "img_path": "imgs\\test\\zh_val_0.jpg", - "input_ids": [6, 47360, 49222, 124321, 5070, 6, ...], - "boxes": [[41, 32, 213, 49], - [41, 32, 213, 49], - [41, 32, 213, 49], - [41, 32, 213, 49], - [41, 32, 213, 49], - [50, 76, 107, 86], ...], - "labels": [0, 0, 0, 0, 0, 1, ...] - } - The length of `texts`、`boxes` and `labels` will increase. 
- The `words` annotations are not used here. - """ - instances = raw_data_info['instances'] - texts = instances['texts'] - boxes = instances['boxes'] - labels = instances['labels'] - - # norm boxes - width = raw_data_info['width'] - height = raw_data_info['height'] - norm_boxes = [self.box_norm(box, width, height) for box in boxes] - - # get biolabel2id - biolabel2id = self.metainfo['biolabel2id'] - # tokenize texts - cur_doc_input_ids, cur_doc_boxes, cur_doc_labels = [], [], [] - for j in range(len(texts)): - cur_input_ids = self.tokenizer( - texts[j], - truncation=False, - add_special_tokens=False, - return_attention_mask=False)['input_ids'] - if len(cur_input_ids) == 0: - continue - # generate bio label - cur_label = labels[j].upper() - if cur_label == 'OTHER': - cur_labels = ['O'] * len(cur_input_ids) - for k in range(len(cur_labels)): - cur_labels[k] = biolabel2id[cur_labels[k]] - else: - cur_labels = [cur_label] * len(cur_input_ids) - cur_labels[0] = biolabel2id['B-' + cur_labels[0]] - for k in range(1, len(cur_labels)): - cur_labels[k] = biolabel2id['I-' + cur_labels[k]] - assert len(cur_input_ids) == len(cur_labels) - - cur_doc_input_ids += cur_input_ids - cur_doc_boxes += [norm_boxes[j]] * len(cur_input_ids) - cur_doc_labels += cur_labels - assert len(cur_doc_input_ids) == len(cur_doc_boxes) == len( - cur_doc_labels) - assert len(cur_doc_input_ids) > 0 - - data_info = {} - data_info['img_path'] = raw_data_info['img_path'] - data_info['input_ids'] = cur_doc_input_ids - data_info['boxes'] = cur_doc_boxes - data_info['labels'] = cur_doc_labels - - return data_info - - def box_norm(self, box, width, height): - - def clip(min_num, num, max_num): - return min(max(num, min_num), max_num) - - x0, y0, x1, y1 = box - x0 = clip(0, int((x0 / width) * 1000), 1000) - y0 = clip(0, int((y0 / height) * 1000), 1000) - x1 = clip(0, int((x1 / width) * 1000), 1000) - y1 = clip(0, int((y1 / height) * 1000), 1000) - assert x1 >= x0 - assert y1 >= y0 - return [x0, y0, x1, y1] - - def get_segment_ids(self, boxes): - segment_ids = [] - for i in range(len(boxes)): - if i == 0: - segment_ids.append(0) - else: - if boxes[i - 1] == boxes[i]: - segment_ids.append(segment_ids[-1]) - else: - segment_ids.append(segment_ids[-1] + 1) - return segment_ids - - def get_position_ids(self, segment_ids): - position_ids = [] - for i in range(len(segment_ids)): - if i == 0: - position_ids.append(2) - else: - if segment_ids[i] == segment_ids[i - 1]: - position_ids.append(position_ids[-1] + 1) - else: - position_ids.append(2) - return position_ids From 4b4b3438c74e0ba07dc49a8053776056eb10ee58 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 15:41:40 +0800 Subject: [PATCH 26/50] =?UTF-8?q?=E5=B0=86=E5=8E=9F=E6=9C=AC=E5=9C=A8XFUND?= =?UTF-8?q?Dataset=E5=86=85=E5=81=9A=E7=9A=84=E9=A2=84=E5=A4=84=E7=90=86?= =?UTF-8?q?=E5=85=A8=E9=83=A8=E7=A7=BB=E5=88=B0pipeline=E4=B8=AD=EF=BC=8C?= =?UTF-8?q?=E9=87=8D=E6=9E=84=E9=A2=84=E5=A4=84=E7=90=86=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E4=B8=BALoadProcessorFromPretrainedModel=20/=20ProcessImageFor?= =?UTF-8?q?LayoutLMv3=20/=20ProcessTokenForLayoutLMv3=20/=20ConvertBIOLabe?= =?UTF-8?q?lForSER=20=E5=9B=9B=E4=B8=AA=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/transforms/__init__.py | 7 +- mmocr/datasets/transforms/formatting.py | 12 +- .../transforms/layoutlmv3_transforms.py | 340 ++++++++++-------- 3 files changed, 208 insertions(+), 151 deletions(-) diff --git a/mmocr/datasets/transforms/__init__.py 
b/mmocr/datasets/transforms/__init__.py index c4c258be9..e72031dfc 100644 --- a/mmocr/datasets/transforms/__init__.py +++ b/mmocr/datasets/transforms/__init__.py @@ -2,7 +2,9 @@ from .adapters import MMDet2MMOCR, MMOCR2MMDet from .formatting import (PackKIEInputs, PackSERInputs, PackTextDetInputs, PackTextRecogInputs) -from .layoutlmv3_transforms import (ProcessImageForLayoutLMv3, +from .layoutlmv3_transforms import (ConvertBIOLabelForSER, + LoadProcessorFromPretrainedModel, + ProcessImageForLayoutLMv3, ProcessTokenForLayoutLMv3) from .loading import (InferencerLoader, LoadImageFromFile, LoadImageFromNDArray, LoadKIEAnnotations, @@ -27,5 +29,6 @@ 'MMOCR2MMDet', 'LoadImageFromFile', 'LoadImageFromNDArray', 'CropHeight', 'InferencerLoader', 'RemoveIgnored', 'ConditionApply', 'CropHeight', 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels', - 'PackSERInputs', 'ProcessImageForLayoutLMv3', 'ProcessTokenForLayoutLMv3' + 'PackSERInputs', 'ProcessImageForLayoutLMv3', 'ProcessTokenForLayoutLMv3', + 'LoadProcessorFromPretrainedModel', 'ConvertBIOLabelForSER' ] diff --git a/mmocr/datasets/transforms/formatting.py b/mmocr/datasets/transforms/formatting.py index 41c75f138..845b2c17f 100644 --- a/mmocr/datasets/transforms/formatting.py +++ b/mmocr/datasets/transforms/formatting.py @@ -368,8 +368,7 @@ class PackSERInputs(BaseTransform): """ # HF LayoutLMv3ForTokenClassification model input params. ser_keys = [ - 'input_ids', 'bbox', 'attention_mask', 'position_ids', 'pixel_values', - 'labels' + 'input_ids', 'bbox', 'attention_mask', 'pixel_values', 'labels' ] def __init__(self, meta_keys=()): @@ -407,15 +406,18 @@ def transform(self, results: dict) -> dict: results['pixel_values'] = img data_sample = SERDataSample() - # instance_data = InstanceData() + gt_label = LabelData() inputs = {} for key in self.ser_keys: if key not in results: continue - inputs[key] = to_tensor(results[key]) + value = to_tensor(results[key]) + if key == 'labels': + gt_label.item = value + inputs[key] = value packed_results['inputs'] = inputs - # data_sample.gt_instances = instance_data + data_sample.gt_label = gt_label meta = {} for key in self.meta_keys: diff --git a/mmocr/datasets/transforms/layoutlmv3_transforms.py b/mmocr/datasets/transforms/layoutlmv3_transforms.py index f3cb61ebd..cfa299fc6 100644 --- a/mmocr/datasets/transforms/layoutlmv3_transforms.py +++ b/mmocr/datasets/transforms/layoutlmv3_transforms.py @@ -4,22 +4,70 @@ from mmcv.transforms.base import BaseTransform from mmocr.registry import TRANSFORMS -from transformers import AutoImageProcessor, LayoutLMv3ImageProcessor +from transformers import LayoutLMv3ImageProcessor, LayoutXLMTokenizerFast from transformers.file_utils import PaddingStrategy from transformers.image_processing_utils import BatchFeature from transformers.image_utils import ChannelDimension -from transformers.tokenization_utils_base import BatchEncoding +from transformers.tokenization_utils_base import (BatchEncoding, + TruncationStrategy) + + +@TRANSFORMS.register_module() +class LoadProcessorFromPretrainedModel(BaseTransform): + """A transform to load image_processor/text_tokenizer from pretrained + model, which will use HuggingFace `LayoutLMv3ImageProcessor` and + `LayoutXLMTokenizerFast` + + Added Keys: + + - image_processor + - tokeinzer + + Args: + pretrained_model_name_or_path (str): The name or path of huggingface + pretrained model, which must be specified. + image_processor (dict): The specific parameters for image_processor. 
+ tokenizer (dict): The specific parameters for tokenizer. + """ + + def __init__( + self, + pretrained_model_name_or_path: str, + image_processor: dict = dict(), + tokenizer: dict = dict() + ) -> None: + super().__init__() + assert pretrained_model_name_or_path != '' + self.image_processor = LayoutLMv3ImageProcessor.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + **image_processor) + # TODO: support apply_ocr + if self.image_processor.apply_ocr: + raise ValueError( + 'Now only support initialized the image processor ' + 'with apply_ocr set to False.') + + # https://huggingface.co/microsoft/layoutlmv3-base-chinese/discussions/3 + # use LayoutXLMTokenizerFast instead of LayoutLMv3TokenizerFast + self.tokenizer = LayoutXLMTokenizerFast.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + **tokenizer) + + def transform(self, results: dict) -> Dict: + results['image_processor'] = self.image_processor + results['tokenizer'] = self.tokenizer + return results @TRANSFORMS.register_module() class ProcessImageForLayoutLMv3(BaseTransform): - """A transform to process image for LayoutLMv3, which will use HuggingFace - `AutoImageProcessor` + """A transform to process image for LayoutLMv3. Required Keys: - img - img_shape + - image_processor Modified Keys: @@ -29,48 +77,18 @@ class ProcessImageForLayoutLMv3(BaseTransform): - scale_factor - pixel_values - - Args: - image_processor (dict): The image_processor cfg, which the key - `pretrained_model_name_or_path` must be specified. """ - image_processor_class = (LayoutLMv3ImageProcessor) - - def __init__(self, - image_processor: dict = dict( - pretrained_model_name_or_path=None), - label_pad_token_id: int = -100) -> None: + def __init__(self) -> None: super().__init__() - if isinstance(image_processor, dict) and \ - image_processor.get('pretrained_model_name_or_path', None): - self.image_processor = AutoImageProcessor.from_pretrained( - **image_processor) - else: - raise TypeError( - 'image_processor cfg should be a `dict` and a key ' - '`pretrained_model_name_or_path` must be specified') - - if not isinstance(self.image_processor, self.image_processor_class): - raise ValueError( - f'Received a {type(self.image_processor)} for argument ' - f'image_processor, but a {self.image_processor_class} ' - 'was expected.') - - # TODO: support apply_ocr - if self.image_processor.apply_ocr: - raise ValueError( - 'Now only support initialized the image processor ' - 'with apply_ocr set to False.') - - self.label_pad_token_id = label_pad_token_id def _resize_rescale_norm(self, results: dict) -> None: - """apply the image_processor to process img.""" + """apply the image_processor to img.""" img = results['img'] h, w = results['img_shape'] - features: BatchFeature = self.image_processor( + image_processor = results['image_processor'] + features: BatchFeature = image_processor( images=img, return_tensors='np', data_format=ChannelDimension.LAST) # output default dims NHWC and here N=1 @@ -89,136 +107,170 @@ def transform(self, results: dict) -> Dict: @TRANSFORMS.register_module() class ProcessTokenForLayoutLMv3(BaseTransform): - """A transform to process token, which will dynamically pad the inputs - received, as well as the labels. - - Part of code is modified from `https://github.com/microsoft/unilm/blob - /master/layoutlmv3/layoutlmft/data/data_collator.py` and `https:// - github.com/huggingface/transformers/blob/main/src/transformers/models/ - layoutlmv3/processing_layoutlmv3.py`. 
+ """A transform to process texts for LayoutLMv3, Required Keys: - tokenizer - - input_ids - - attention_mask - - labels - - bbox - - position_ids - - segment_ids(optional) + - width + - height + - instances + - texts + - boxes - Modified Keys: + Added Keys: - input_ids - attention_mask - - labels - bbox - - position_ids - - segment_ids(optional) + - word_ids Args: - padding (:obj:`bool`, :obj:`str` or :class: - `~transformers.file_utils.PaddingStrategy`, - `optional`, defaults to :obj:`True`): - Select a strategy to pad the returned sequences - (according to the model's padding side and padding index) - among: - * :obj:`True` or :obj:`'longest'`: Pad to the longest - sequence in the batch (or no padding if only a - single sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified - with the argument :obj:`max_length` or to the maximum - acceptable input length for the model if that argument - is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No - padding (i.e., can output a batch with sequences - of different lengths). - max_length (:obj:`int`, `optional`): - Maximum length of the returned list and optionally - padding length (see above). - pad_to_multiple_of (:obj:`int`, `optional`): - If set will pad the sequence to a multiple of the - provided value. This is especially useful to enable the - use of Tensor Cores on NVIDIA hardware with compute - capability >= 7.5 (Volta). - label_pad_token_id (:obj:`int`, `optional`, defaults to -100): - The id to use when padding the labels (-100 will be - automatically ignore by PyTorch loss functions). + Refer to the parameters of the corresponding tokenizer """ - padded_input_names = ['input_ids', 'attention_mask'] - def __init__(self, - padding: Union[bool, str, PaddingStrategy] = True, + padding: Union[bool, str, PaddingStrategy] = False, max_length: Optional[int] = None, - pad_to_multiple_of: Optional[int] = None, - label_pad_token_id: int = -100) -> None: + truncation: Union[bool, str, TruncationStrategy] = None, + pad_to_multiple_of: Optional[int] = None) -> None: super().__init__() self.padding = padding self.max_length = max_length + self.truncation = truncation self.pad_to_multiple_of = pad_to_multiple_of - self.label_pad_token_id = label_pad_token_id - def _pad(self, results: dict) -> None: - # get tokenizer + def _tokenize(self, results: dict) -> None: tokenizer = results['tokenizer'] - # There will be a warning advice: - # You're using a XLMRobertaTokenizerFast tokenizer. - # Please note that with a fast tokenizer, using the - # `__call__` method is faster than using a method to - # encode the text followed by a call to the `pad` - # method to get a padded encoding. - # But `__call__` method only supports input string text, - # which has already been encoded before this step. 
- features = { - k: v - for k, v in results.items() if k in self.padded_input_names - } - batch: BatchEncoding = tokenizer.pad( - encoded_inputs=features, + instances = results['instances'] + texts = instances['texts'] + boxes = instances['boxes'] + + tokenized_inputs: BatchEncoding = tokenizer( + text=texts, + boxes=boxes, padding=self.padding, max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of) - # update `input_ids` and `attention_mask` - results.update(batch) - - has_bbox_input = 'bbox' in results - has_position_input = 'position_ids' in results - has_segment_input = 'segment_ids' in results - sequence_length = len(results['input_ids']) - if tokenizer.padding_side == 'right': - results[ - 'labels'] = results['labels'] + [self.label_pad_token_id] * ( - sequence_length - len(results['labels'])) - if has_bbox_input: - results['bbox'] = results['bbox'] + [[0, 0, 0, 0]] * ( - sequence_length - len(results['bbox'])) - if has_position_input: - results['position_ids'] = results['position_ids'] + [ - tokenizer.pad_token_id - ] * ( - sequence_length - len(results['position_ids'])) - if has_segment_input: - results['segment_ids'] = results['segment_ids'] + [ - results['segment_ids'][-1] + 1 - ] * ( - sequence_length - len(results['segment_ids'])) - else: - results['labels'] = [self.label_pad_token_id] * ( - sequence_length - len(results['labels'])) + results['labels'] - if has_bbox_input: - results['bbox'] = [[0, 0, 0, 0]] * ( - sequence_length - len(results['bbox'])) + results['bbox'] - if has_position_input: - results['position_ids'] = [tokenizer.pad_token_id] * ( - sequence_length - - len(results['position_ids'])) + results['position_ids'] - if has_segment_input: - results['segment_ids'] = [results['segment_ids'][-1] + 1] * ( - sequence_length - - len(results['segment_ids'])) + results['segment_ids'] + truncation=self.truncation, + pad_to_multiple_of=self.pad_to_multiple_of, + add_special_tokens=True, + return_tensors='np', + return_attention_mask=True, + return_offsets_mapping=True) + + # By default, the pipeline processes one sample + # at a time, so set batch_index = 0. 
+ batch_index = 0 + # record input_ids/attention_mask/bbox + for k in ['input_ids', 'attention_mask', 'bbox']: + results[k] = tokenized_inputs[k][batch_index] + # record word_ids + results['word_ids'] = tokenized_inputs.encodings[batch_index].word_ids + + def _norm_boxes(self, results: dict) -> None: + + def box_norm(box, width, height): + + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + + instances = results['instances'] + boxes = instances['boxes'] + + # norm boxes + width = results['width'] + height = results['height'] + norm_boxes = [box_norm(box, width, height) for box in boxes] + + results['instances']['boxes'] = norm_boxes + + def transform(self, results: dict) -> Dict: + self._norm_boxes(results) + self._tokenize(results) + return results + + +@TRANSFORMS.register_module() +class ConvertBIOLabelForSER(BaseTransform): + """A transform to convert BIO format labels for SER task, + + Required Keys: + + - tokenizer + - word_ids + - instances + - labels + + Added Keys: + + - labels + + Args: + classes (Union[tuple, list]): dataset classes + only_label_first_subword (bool): Whether or not to only label + the first subword, in case word labels are provided. + """ + + def __init__(self, + classes: Union[tuple, list], + only_label_first_subword: bool = False) -> None: + super().__init__() + self.biolabel2id = self._generate_biolabel2id_map(classes) + self.only_label_first_subword = only_label_first_subword + + def _generate_biolabel2id_map(self, classes: Union[tuple, list]) -> Dict: + bio_label_list = [] + classes = sorted([c.upper() for c in classes]) + for c in classes: + if c == 'OTHER': + bio_label_list.insert(0, c) + else: + bio_label_list.append(f'B-{c}') + bio_label_list.append(f'I-{c}') + biolabel2id_map = { + bio_label: idx + for idx, bio_label in enumerate(bio_label_list) + } + return biolabel2id_map + + def _convert(self, results: dict) -> None: + tokenizer = results['tokenizer'] + + instances = results['instances'] + labels = [label.upper() for label in instances['labels']] + word_ids = results['word_ids'] + + biolabel_ids = [] + pre_word_id = None + for cur_word_id in word_ids: + if cur_word_id is not None: + if cur_word_id != pre_word_id: + biolabel_name = f'B-{labels[cur_word_id]}' \ + if labels[cur_word_id] != 'OTHER' else 'OTHER' + elif self.only_label_first_subword: + biolabel_name = 'OTHER' + else: + biolabel_name = f'I-{labels[cur_word_id]}' \ + if labels[cur_word_id] != 'OTHER' else 'OTHER' + # convert biolabel to id + biolabel_ids.append(self.biolabel2id[biolabel_name]) + else: + biolabel_ids.append(tokenizer.pad_token_label) + pre_word_id = cur_word_id + + # record biolabel_ids + results['labels'] = biolabel_ids def transform(self, results: dict) -> Dict: - self._pad(results) + self._convert(results) return results From 8399f949cb021564256b28f8e53832cd106b0a8c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 16:06:29 +0800 Subject: [PATCH 27/50] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/models/ser/__init__.py | 3 +- mmocr/models/ser/hf_layoutlmv3_wrapper.py | 24 ++++++--- 
mmocr/models/ser/ser_postprocessor.py | 26 +++++++++ .../configs/_base_/datasets/xfund_zh.py | 14 +++++ .../configs/ser/layoutlmv3_xfund_zh.py | 43 ++++++++------- projects/LayoutLMv3/scripts/run_ser.sh | 1 + projects/LayoutLMv3/test.py | 54 +++++++++---------- 7 files changed, 104 insertions(+), 61 deletions(-) create mode 100644 mmocr/models/ser/ser_postprocessor.py create mode 100644 projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py diff --git a/mmocr/models/ser/__init__.py b/mmocr/models/ser/__init__.py index 2799c9dab..4188c5950 100644 --- a/mmocr/models/ser/__init__.py +++ b/mmocr/models/ser/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .hf_layoutlmv3_wrapper import HFLayoutLMv3ForTokenClassificationWrapper +from .ser_postprocessor import SERPostprocessor -__all__ = ['HFLayoutLMv3ForTokenClassificationWrapper'] +__all__ = ['HFLayoutLMv3ForTokenClassificationWrapper', 'SERPostprocessor'] diff --git a/mmocr/models/ser/hf_layoutlmv3_wrapper.py b/mmocr/models/ser/hf_layoutlmv3_wrapper.py index 32b1b4ef0..0dc31f11b 100644 --- a/mmocr/models/ser/hf_layoutlmv3_wrapper.py +++ b/mmocr/models/ser/hf_layoutlmv3_wrapper.py @@ -17,20 +17,27 @@ class HFLayoutLMv3ForTokenClassificationWrapper(BaseModel): def __init__(self, - classifier: dict = dict(pretrained_model_name_or_path=None), + layoutlmv3_token_classifier: dict = dict( + pretrained_model_name_or_path=None), data_preprocessor: Optional[Dict] = None, + postprocessor: Optional[Dict] = None, init_cfg: Optional[Dict] = None): super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) - if isinstance(classifier, dict) and \ - classifier.get('pretrained_model_name_or_path', None): + if isinstance(layoutlmv3_token_classifier, dict) and \ + layoutlmv3_token_classifier.get( + 'pretrained_model_name_or_path', None): self.model = AutoModelForTokenClassification.from_pretrained( - **classifier) + **layoutlmv3_token_classifier) else: raise TypeError( - 'classifier cfg should be a `dict` and a key ' + 'layoutlmv3_token_classifier cfg should be a `dict` and a key ' '`pretrained_model_name_or_path` must be specified') + if postprocessor is not None: + assert isinstance(postprocessor, dict) + self.postprocessor = MODELS.build(postprocessor) + def forward(self, inputs: torch.Tensor, data_samples: OptSERSampleList = None, @@ -90,7 +97,7 @@ def loss(self, inputs: torch.Tensor, data_samples: SERSampleList) -> Dict: dict[str, Tensor]: A dictionary of loss components. """ outputs: TokenClassifierOutput = self.model(**inputs) - return outputs + return outputs['loss'] def predict(self, inputs: torch.Tensor, data_samples: SERSampleList) -> SERSampleList: @@ -119,8 +126,9 @@ def predict(self, inputs: torch.Tensor, Each element represents the polygon of the instance, in (xn, yn) order. """ - x = self.extract_feat(inputs) - return self.det_head.predict(x, data_samples) + outputs: TokenClassifierOutput = self.model(**inputs) + logits = outputs['logits'] + return self.postprocessor(logits, data_samples) def _forward(self, inputs: torch.Tensor, diff --git a/mmocr/models/ser/ser_postprocessor.py b/mmocr/models/ser/ser_postprocessor.py new file mode 100644 index 000000000..be812dfbc --- /dev/null +++ b/mmocr/models/ser/ser_postprocessor.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence + +import torch +import torch.nn as nn +from mmengine.structures import LabelData + +from mmocr.registry import MODELS +from mmocr.structures import SERDataSample + + +@MODELS.register_module() +class SERPostprocessor(nn.Module): + """PostProcessor for SER.""" + + def __call__(self, outputs: torch.Tensor, + data_samples: Sequence[SERDataSample] + ) -> Sequence[SERDataSample]: + outputs = outputs.cpu().detach() + max_value, max_idx = torch.max(outputs, -1) + for batch_idx in range(outputs.size(0)): + pred_text = LabelData() + pred_text.score = max_value[batch_idx] + pred_text.item = max_idx[batch_idx] + data_samples[batch_idx].pred_text = pred_text + return data_samples diff --git a/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py b/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..e790a7bf6 --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_ser_data_root = 'data/xfund/zh' + +xfund_zh_ser_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_train.json', + pipeline=None) + +xfund_zh_ser_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py index 63d334280..6b5fbf737 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py @@ -1,11 +1,8 @@ -_base_ = [ - '/Users/wangnu/Documents/GitHub/mmocr/' - 'configs/ser/_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py' -] +_base_ = ['../_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py'] # specify a pretrained model -pretrained_model = '/Users/wangnu/Documents' -'/GitHub/mmocr/data/layoutlmv3-base-chinese' +pretrained_model = '/Users/wangnu/Documents/GitHub' \ + '/mmocr/data/layoutlmv3-base-chinese' # set classes classes = ('answer', 'header', 'question', 'other') @@ -19,31 +16,33 @@ type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) train_dataset = _base_.xfund_zh_ser_train -# specify a tokenizer for the dataset -train_dataset['tokenizer'] = dict( - pretrained_model_name_or_path=pretrained_model, use_fast=True) train_pipeline = [ dict(type='LoadImageFromFile', color_type='color'), dict( - type='ProcessImageForLayoutLMv3', - image_processor=dict( - pretrained_model_name_or_path=pretrained_model, - size=(224, 224), - apply_ocr=False)), + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False), + tokenizer=dict()), + dict(type='ProcessImageForLayoutLMv3'), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512, + truncation=True), dict( - type='ProcessTokenForLayoutLMv3', padding='max_length', - max_length=512), + type='ConvertBIOLabelForSER', + classes=classes, + only_label_first_subword=True), dict( type='PackSERInputs', - meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', - 'id2biolabel')) + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) ] train_dataset.pipeline = train_pipeline # set collate_fn='default_collate' for the dataloader collate_fn = dict(type='default_collate') train_dataloader = dict( - batch_size=1, - num_workers=1, + batch_size=2, + num_workers=8, pin_memory=True, persistent_workers=True, 
sampler=dict(type='DefaultSampler', shuffle=True), @@ -52,10 +51,10 @@ model = dict( type='HFLayoutLMv3ForTokenClassificationWrapper', - classifier=dict( + layoutlmv3_token_classifier=dict( pretrained_model_name_or_path=pretrained_model, num_labels=len(classes) * 2 - 1), - data_preprocessor=None) + postprocessor=dict(type='SERPostprocessor')) val_evaluator = None test_evaluator = None diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh index 05770d4a1..aa03883c2 100644 --- a/projects/LayoutLMv3/scripts/run_ser.sh +++ b/projects/LayoutLMv3/scripts/run_ser.sh @@ -1,6 +1,7 @@ config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py' export TOKENIZERS_PARALLELISM=false +export OMP_NUM_THREADS=1 python tools/train.py \ ${config} \ diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py index c6ca4ae44..f5c31adb9 100644 --- a/projects/LayoutLMv3/test.py +++ b/projects/LayoutLMv3/test.py @@ -8,42 +8,45 @@ from mmocr.registry import DATASETS, MODELS if __name__ == '__main__': - cfg_path = '/Users/wangnu/Documents/GitHub/mmocr/projects/' \ - 'LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py' + cfg_path = '/Users/wangnu/Documents/GitHub/mmocr' \ + '/configs/ser/_base_/datasets/xfund_zh.py' cfg = Config.fromfile(cfg_path) init_default_scope(cfg.get('default_scope', 'mmocr')) - pretrained_model = '/Users/wangnu/Documents/GitHub' - '/mmocr/data/layoutlmv3-base-chinese' + pretrained_model = '/Users/wangnu/Documents/GitHub/' \ + 'mmocr/data/layoutlmv3-base-chinese' + classes = ('answer', 'header', 'question', 'other') - dataset_cfg = cfg.train_dataset - dataset_cfg['tokenizer'] = dict( - pretrained_model_name_or_path=pretrained_model, use_fast=True) + dataset_cfg = cfg.xfund_zh_ser_train train_pipeline = [ dict(type='LoadImageFromFile', color_type='color'), dict( - type='ProcessImageForLayoutLMv3', - image_processor=dict( - pretrained_model_name_or_path=pretrained_model, - size=(224, 224), - apply_ocr=False)), + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False), + tokenizer=dict()), + dict(type='ProcessImageForLayoutLMv3'), dict( type='ProcessTokenForLayoutLMv3', padding='max_length', - max_length=512), + max_length=512, + truncation=True), + dict( + type='ConvertBIOLabelForSER', + classes=classes, + only_label_first_subword=True), dict( type='PackSERInputs', - meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', - 'id2biolabel')) + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) ] dataset_cfg['pipeline'] = train_pipeline train_dataset = DATASETS.build(dataset_cfg) model_cfg = dict( type='HFLayoutLMv3ForTokenClassificationWrapper', - classifier=dict( + layoutlmv3_token_classifier=dict( pretrained_model_name_or_path=pretrained_model, num_labels=7), - data_preprocessor=None) + postprocessor=dict(type='SERPostprocessor')) collate_fn_cfg = dict(type='default_collate') collate_fn_type = collate_fn_cfg.pop('type') @@ -54,29 +57,20 @@ type='DefaultSampler', dataset=train_dataset, shuffle=True) sampler = DATA_SAMPLERS.build(sampler_cfg) - from mmengine.dataset.utils import worker_init_fn as default_worker_init_fn - init_fn = partial( - default_worker_init_fn, - num_workers=2, - rank=0, - seed=301967075, - disable_subprocess_warning=False) - train_dataloader = DataLoader( - batch_size=1, + batch_size=2, dataset=train_dataset, pin_memory=True, persistent_workers=True, sampler=sampler, 
collate_fn=collate_fn, - num_workers=2, - worker_init_fn=init_fn) + num_workers=8) model = MODELS.build(model_cfg) for idx, data_batch in enumerate(train_dataloader): - print(idx) - result = model.forward(**data_batch, mode='loss') + # result = model.forward(**data_batch, mode='loss') + result = model.forward(**data_batch, mode='predict') break print('Done') From bda67425a4f12e06fee2c327dea9b511107e4ba7 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 16:36:08 +0800 Subject: [PATCH 28/50] =?UTF-8?q?=E8=B7=91=E9=80=9Atrain.py=E8=AE=AD?= =?UTF-8?q?=E7=BB=83=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/models/ser/hf_layoutlmv3_wrapper.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mmocr/models/ser/hf_layoutlmv3_wrapper.py b/mmocr/models/ser/hf_layoutlmv3_wrapper.py index 0dc31f11b..efc8bcbef 100644 --- a/mmocr/models/ser/hf_layoutlmv3_wrapper.py +++ b/mmocr/models/ser/hf_layoutlmv3_wrapper.py @@ -71,9 +71,6 @@ def forward(self, - If ``mode="predict"``, return a list of :obj:`SERDataSample`. - If ``mode="loss"``, return a dict of tensor. """ - # copying inputs data to the target device - inputs = self.data_preprocessor(inputs) - if mode == 'loss': return self.loss(inputs, data_samples) elif mode == 'predict': @@ -97,7 +94,7 @@ def loss(self, inputs: torch.Tensor, data_samples: SERSampleList) -> Dict: dict[str, Tensor]: A dictionary of loss components. """ outputs: TokenClassifierOutput = self.model(**inputs) - return outputs['loss'] + return {'ce_loss': outputs['loss']} def predict(self, inputs: torch.Tensor, data_samples: SERSampleList) -> SERSampleList: From 44c68b1a03a456cebe6f8572bb8c3f50dfa27d21 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 17:06:14 +0800 Subject: [PATCH 29/50] =?UTF-8?q?=E4=BF=AE=E6=94=B9SERDataSample=E5=BD=A2?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/structures/ser_data_sample.py | 91 ++++++++++++++++++----------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/mmocr/structures/ser_data_sample.py b/mmocr/structures/ser_data_sample.py index 10c91a17a..7f2318e7b 100644 --- a/mmocr/structures/ser_data_sample.py +++ b/mmocr/structures/ser_data_sample.py @@ -1,61 +1,82 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .kie_data_sample import KIEDataSample +from mmengine.structures import BaseDataElement, LabelData -class SERDataSample(KIEDataSample): - """A data structure interface of MMOCR. They are used as interfaces between - different components. +class SERDataSample(BaseDataElement): + """A data structure interface of MMOCR for Semantic Entity Recognition. + They are used as interfaces between different components. The attributes in ``SERDataSample`` are divided into two parts: - - ``gt_instances``(InstanceData): Ground truth of instance annotations. - - ``pred_instances``(InstanceData): Instances of model predictions. + - ``gt_label``(LabelData): Ground truth label. + - ``pred_label``(LabelData): predictions label. Examples: >>> import torch >>> import numpy as np - >>> from mmengine.structures import InstanceData + >>> from mmengine.structures import LabelData >>> from mmocr.data import SERDataSample - >>> # gt_instances + >>> # gt_label >>> data_sample = SERDataSample() >>> img_meta = dict(img_shape=(800, 1196, 3), ... 
pad_shape=(800, 1216, 3)) - >>> gt_instances = InstanceData(metainfo=img_meta) - >>> gt_instances.bboxes = torch.rand((5, 4)) - >>> gt_instances.labels = torch.rand((5,)) - >>> data_sample.gt_instances = gt_instances - >>> assert 'img_shape' in data_sample.gt_instances.metainfo_keys() - >>> len(data_sample.gt_instances) - 5 + >>> gt_label = LabelData(metainfo=img_meta) + >>> gt_label.item = 'mmocr' + >>> data_sample.gt_label = gt_label + >>> assert 'img_shape' in data_sample.gt_label.metainfo_keys() >>> print(data_sample) ) at 0x7f21fb1b9880> - >>> # pred_instances - >>> pred_instances = InstanceData(metainfo=img_meta) - >>> pred_instances.bboxes = torch.rand((5, 4)) - >>> pred_instances.scores = torch.rand((5,)) - >>> data_sample = SERDataSample(pred_instances=pred_instances) - >>> assert 'pred_instances' in data_sample + >>> # pred_label + >>> pred_label = LabelData(metainfo=img_meta) + >>> pred_label.item = 'mmocr' + >>> data_sample = SERDataSample(pred_label=pred_label) + >>> assert 'pred_label' in data_sample >>> data_sample = SERDataSample() - >>> gt_instances_data = dict( - ... bboxes=torch.rand(2, 4), - ... labels=torch.rand(2)) - >>> gt_instances = InstanceData(**gt_instances_data) - >>> data_sample.gt_instances = gt_instances - >>> assert 'gt_instances' in data_sample + >>> gt_label_data = dict(item='mmocr') + >>> gt_label = LabelData(**gt_label_data) + >>> data_sample.gt_label = gt_label + >>> assert 'gt_label' in data_sample + >>> assert 'item' in data_sample.gt_label """ + + @property + def gt_label(self) -> LabelData: + """LabelData: ground truth label. + """ + return self._gt_label + + @gt_label.setter + def gt_label(self, value: LabelData) -> None: + """gt_label setter.""" + self.set_field(value, '_gt_label', dtype=LabelData) + + @gt_label.deleter + def gt_label(self) -> None: + """gt_label deleter.""" + del self._gt_label + + @property + def pred_label(self) -> LabelData: + """LabelData: prediction label. 
+ """ + return self._pred_label + + @pred_label.setter + def pred_label(self, value: LabelData) -> None: + """pred_label setter.""" + self.set_field(value, '_pred_label', dtype=LabelData) + + @pred_label.deleter + def pred_label(self) -> None: + """pred_label deleter.""" + del self._pred_label From 023b0cf7cb048439dc00da2901bb6f30d92f7cb7 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 19 Apr 2023 18:00:43 +0800 Subject: [PATCH 30/50] =?UTF-8?q?=E4=BF=AE=E6=94=B9SERPostprocessor?= =?UTF-8?q?=E4=B8=80=E4=B8=AA=E5=91=BD=E5=90=8D=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/models/ser/ser_postprocessor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mmocr/models/ser/ser_postprocessor.py b/mmocr/models/ser/ser_postprocessor.py index be812dfbc..2fdc4158a 100644 --- a/mmocr/models/ser/ser_postprocessor.py +++ b/mmocr/models/ser/ser_postprocessor.py @@ -19,8 +19,8 @@ def __call__(self, outputs: torch.Tensor, outputs = outputs.cpu().detach() max_value, max_idx = torch.max(outputs, -1) for batch_idx in range(outputs.size(0)): - pred_text = LabelData() - pred_text.score = max_value[batch_idx] - pred_text.item = max_idx[batch_idx] - data_samples[batch_idx].pred_text = pred_text + pred_label = LabelData() + pred_label.score = max_value[batch_idx] + pred_label.item = max_idx[batch_idx] + data_samples[batch_idx].pred_label = pred_label return data_samples From a05a2e1f691ec5cfd5ffb92f7184a166f2842607 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sat, 29 Apr 2023 00:35:25 +0800 Subject: [PATCH 31/50] =?UTF-8?q?=E6=95=B4=E7=90=86config=E7=9B=AE?= =?UTF-8?q?=E5=BD=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../configs/_base_/default_runtime.py | 12 +- .../_base_/schedules/schedule_adamw_1k.py | 11 ++ .../ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py | 138 ++++++++++++++++++ .../configs/ser/layoutlmv3_xfund_zh.py | 60 -------- 4 files changed, 154 insertions(+), 67 deletions(-) create mode 100644 projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py create mode 100644 projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py delete mode 100644 projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py diff --git a/projects/LayoutLMv3/configs/_base_/default_runtime.py b/projects/LayoutLMv3/configs/_base_/default_runtime.py index 81480273b..d080a0015 100644 --- a/projects/LayoutLMv3/configs/_base_/default_runtime.py +++ b/projects/LayoutLMv3/configs/_base_/default_runtime.py @@ -8,9 +8,9 @@ default_hooks = dict( timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=5), + logger=dict(type='LoggerHook', interval=5, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=20), + checkpoint=dict(type='CheckpointHook', interval=20, by_epoch=False), sampler_seed=dict(type='DistSamplerSeedHook'), sync_buffer=dict(type='SyncBuffersHook'), visualization=dict( @@ -24,18 +24,16 @@ # Logging log_level = 'INFO' -log_processor = dict(type='LogProcessor', window_size=10, by_epoch=True) +log_processor = dict(type='LogProcessor', window_size=10, by_epoch=False) load_from = None resume = False # Evaluation -val_evaluator = dict(type='HmeanIOUMetric') +val_evaluator = dict(type='SeqevalMetric') test_evaluator = val_evaluator # Visualization vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( - type='TextDetLocalVisualizer', - name='visualizer', - 
vis_backends=vis_backends) + type='SERLocalVisualizer', name='visualizer', vis_backends=vis_backends) diff --git a/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py b/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py new file mode 100644 index 000000000..33bc695c4 --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py @@ -0,0 +1,11 @@ +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=7e-5, weight_decay=0.01)) +train_cfg = dict(type='IterBasedTrainLoop', max_iters=1000, val_interval=100) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# learning policy +param_scheduler = [ + dict(type='OneCycleLR', eta_max=7e-5, by_epoch=False, total_steps=1000), +] diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py new file mode 100644 index 000000000..3f883bda3 --- /dev/null +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py @@ -0,0 +1,138 @@ +_base_ = [ + '../_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_adamw_1k.py' +] + +# ================== Frequently modified parameters ================== +hf_pretrained_model = '/Users/wangnu/Documents/GitHub' \ + '/mmocr/data/layoutlmv3-base-chinese' +dataset_name = 'xfund_zh' +class_name = ('answer', 'header', 'question', 'other') +max_iters = 1000 +val_interval = 100 +lr = 7e-5 +train_batch_size_per_gpu = 8 +train_num_workers = 8 +test_batch_size_per_gpu = 1 # can't batch inference now +test_num_workers = 8 +only_label_first_subword = True # select label process strategy +# ==================================================================== +# =========================== schedule =============================== +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=val_interval) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01)) +param_scheduler = [ + dict( + type='OneCycleLR', + eta_max=lr, + by_epoch=False, + total_steps=max_iters, + three_phase=True, + final_div_factor=4), +] +# ==================================================================== +# =========================== Dataset ================================ +train_dataset = _base_.xfund_zh_ser_train +test_dataset = _base_.xfund_zh_ser_test +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict( + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=hf_pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False)), + dict(type='ProcessImageForLayoutLMv3'), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512, + truncation=True), + dict( + type='ConvertBIOLabelForSER', + classes=class_name, + only_label_first_subword=only_label_first_subword), + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) +] +test_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict( + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=hf_pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False)), + dict(type='ProcessImageForLayoutLMv3'), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512, + truncation=True), + dict( + type='ConvertBIOLabelForSER', + classes=class_name, 
+ only_label_first_subword=only_label_first_subword), + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'truncation_word_ids', 'instances')) +] +train_dataset.pipeline = train_pipeline +test_dataset.pipeline = test_pipeline +# ==================================================================== +# ========================= Dataloader =============================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + collate_fn=dict(type='long_text_data_collate', training=True), + dataset=train_dataset) +val_dataloader = dict( + batch_size=test_batch_size_per_gpu, + num_workers=test_num_workers, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='long_text_data_collate', training=False), + dataset=test_dataset) +test_dataloader = val_dataloader +# ==================================================================== +# ============================ Model ================================= +model = dict( + type='HFLayoutLMv3ForTokenClassificationWrapper', + layoutlmv3_token_classifier=dict( + pretrained_model_name_or_path=hf_pretrained_model, + num_labels=len(class_name) * 2 - 1), + loss_processor=dict(type='ComputeLossAfterLabelSmooth'), + postprocessor=dict(type='SERPostprocessor', classes=class_name)) +# ==================================================================== +# ========================= Evaluation =============================== +val_evaluator = dict(type='SeqevalMetric', prefix=dataset_name) +test_evaluator = val_evaluator +# ==================================================================== +# ======================= Visualization ============================== +vis_backends = [dict(type='TensorboardVisBackend')] +visualizer = dict( + type='SERLocalVisualizer', name='visualizer', vis_backends=vis_backends) +# ==================================================================== +# ============================= Hook ================================= +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict( + type='CheckpointHook', + interval=500, + save_best=f'{dataset_name}/f1', + rule='greater'), + visualization=dict( + type='VisualizationHook', + interval=10, + enable=True, + show=False, + draw_gt=True, + draw_pred=True), +) +# ==================================================================== diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py deleted file mode 100644 index 6b5fbf737..000000000 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py +++ /dev/null @@ -1,60 +0,0 @@ -_base_ = ['../_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py'] - -# specify a pretrained model -pretrained_model = '/Users/wangnu/Documents/GitHub' \ - '/mmocr/data/layoutlmv3-base-chinese' -# set classes -classes = ('answer', 'header', 'question', 'other') - -# optimizer -max_epochs = 10 -optim_wrapper = dict(type='OptimWrapper', optimizer=dict(type='Adam', lr=1e-3)) -param_scheduler = [ - dict(type='PolyLR', power=0.9, end=max_epochs), -] -train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) - -train_dataset = _base_.xfund_zh_ser_train -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color'), - dict( - type='LoadProcessorFromPretrainedModel', - 
pretrained_model_name_or_path=pretrained_model, - image_processor=dict(size=(224, 224), apply_ocr=False), - tokenizer=dict()), - dict(type='ProcessImageForLayoutLMv3'), - dict( - type='ProcessTokenForLayoutLMv3', - padding='max_length', - max_length=512, - truncation=True), - dict( - type='ConvertBIOLabelForSER', - classes=classes, - only_label_first_subword=True), - dict( - type='PackSERInputs', - meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) -] -train_dataset.pipeline = train_pipeline -# set collate_fn='default_collate' for the dataloader -collate_fn = dict(type='default_collate') -train_dataloader = dict( - batch_size=2, - num_workers=8, - pin_memory=True, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - collate_fn=collate_fn, - dataset=train_dataset) - -model = dict( - type='HFLayoutLMv3ForTokenClassificationWrapper', - layoutlmv3_token_classifier=dict( - pretrained_model_name_or_path=pretrained_model, - num_labels=len(classes) * 2 - 1), - postprocessor=dict(type='SERPostprocessor')) - -val_evaluator = None -test_evaluator = None From 366477318bcf3743b04e109033eb6b94b7677d61 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sat, 29 Apr 2023 01:23:22 +0800 Subject: [PATCH 32/50] =?UTF-8?q?=E6=B7=BB=E5=8A=A0SER=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E7=9A=84=E8=AF=84=E4=BC=B0=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/evaluation/__init__.py | 1 + .../LayoutLMv3/evaluation/metrics/__init__.py | 3 ++ .../evaluation/metrics/seqeval_metric.py | 41 +++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 projects/LayoutLMv3/evaluation/__init__.py create mode 100644 projects/LayoutLMv3/evaluation/metrics/__init__.py create mode 100644 projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py diff --git a/projects/LayoutLMv3/evaluation/__init__.py b/projects/LayoutLMv3/evaluation/__init__.py new file mode 100644 index 000000000..e9f2df5e3 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/__init__.py @@ -0,0 +1 @@ +from .metrics import * # NOQA diff --git a/projects/LayoutLMv3/evaluation/metrics/__init__.py b/projects/LayoutLMv3/evaluation/metrics/__init__.py new file mode 100644 index 000000000..d3d029f46 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/metrics/__init__.py @@ -0,0 +1,3 @@ +from .seqeval_metric import SeqevalMetric + +__all__ = ['SeqevalMetric'] diff --git a/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py b/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py new file mode 100644 index 000000000..7ee774266 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py @@ -0,0 +1,41 @@ +from typing import Any, Optional, Sequence + +from mmengine.evaluator import BaseMetric +from seqeval.metrics import (accuracy_score, f1_score, precision_score, + recall_score) + +from mmocr.registry import METRICS + + +@METRICS.register_module() +class SeqevalMetric(BaseMetric): + + default_prefix: Optional[str] = 'ser' + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_labels = data_sample.get('pred_label').get('item') + gt_labels = data_sample.get('gt_label').get('item') + + result = dict(pred_labels=pred_labels, gt_labels=gt_labels) + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: 
+ preds = [] + gts = [] + for result in results: + preds.append(result['pred_labels']) + gts.append(result['gt_labels']) + + result = { + 'precision': precision_score(gts, preds), + 'recall': recall_score(gts, preds), + 'f1': f1_score(gts, preds), + 'accuracy': accuracy_score(gts, preds) + } + return result From 6c1f5bea2f182d17e94bfdd12c590249e04dde80 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sat, 29 Apr 2023 17:22:14 +0800 Subject: [PATCH 33/50] =?UTF-8?q?=E4=BC=98=E5=8C=96PackSERInputs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datasets/transforms/formatting.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 projects/LayoutLMv3/datasets/transforms/formatting.py diff --git a/projects/LayoutLMv3/datasets/transforms/formatting.py b/projects/LayoutLMv3/datasets/transforms/formatting.py new file mode 100644 index 000000000..0ceab6bb8 --- /dev/null +++ b/projects/LayoutLMv3/datasets/transforms/formatting.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.transforms import to_tensor +from mmcv.transforms.base import BaseTransform +from mmengine.structures import LabelData + +from mmocr.registry import TRANSFORMS +from mmocr.structures import SERDataSample + + +@TRANSFORMS.register_module() +class PackSERInputs(BaseTransform): + """Pack the inputs data for LayoutLMv3ForTokenClassification model. + + The type of outputs is `dict`: + + - inputs: Data for model forwarding. Five components will be included: + + - input_ids, whose shape is (truncation_number, 512). + - bbox, whose shape is (truncation_number, 512, 4). + - attention_mask, whose shape is (truncation_number, 512). + - pixel_values, whose shape is (truncation_number, 3, 224, 224). + - labels, whose shape is (truncation_number, 512). + + - data_samples: Two components of ``SERDataSample`` will be updated: + + - gt_instances (InstanceData): Depending on annotations, a subset of the + following keys will be updated: + + - bboxes (torch.Tensor((N, 4), dtype=torch.float32)): The groundtruth + of bounding boxes in the form of [x1, y1, x2, y2]. Renamed from + 'gt_bboxes'. + - labels (torch.LongTensor(N)): The labels of instances. + Renamed from 'gt_bboxes_labels'. + - texts (list[str]): The groundtruth texts. Renamed from 'gt_texts'. + + - metainfo (dict): 'metainfo' is always populated. The contents of the + 'metainfo' depends on ``meta_keys``. By default it includes: + + - "img_path": Path to the image file. + - "img_shape": Shape of the image input to the network as a tuple + (h, w). Note that the image may be zero-padded afterward on the + bottom/right if the batch tensor is larger than this shape. + - "scale_factor": A tuple indicating the ratio of width and height + of the preprocessed image to the original one. + - "ori_shape": Shape of the preprocessed image as a tuple (h, w). + + Args: + meta_keys (Sequence[str], optional): Meta keys to be converted to + the metainfo of ``SERDataSample``. Defaults to ``('img_path', + 'ori_shape', 'img_shape', 'scale_factor')``. + """ + # HF LayoutLMv3ForTokenClassification model input params. + ser_keys = [ + 'input_ids', 'bbox', 'attention_mask', 'pixel_values', 'labels' + ] + + def __init__(self, meta_keys=()): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack SER input data. + + Args: + results (dict): Result dict from the data pipeline. 
+ + Returns: + dict: + + - 'inputs' (obj:`dict`): Data for model forwarding. + - 'data_samples' (obj:`SERDataSample`): The annotation info of the + sample. + """ + + packed_results = dict() + truncation_number = results['truncation_number'] + + if 'pixel_values' in results: + img = results['pixel_values'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # A simple trick to speedup formatting by 3-5 times when + # OMP_NUM_THREADS != 1 + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if img.flags.c_contiguous: + img = to_tensor(img) + img = img.permute(2, 0, 1).contiguous() + else: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + results['pixel_values'] = torch.cat( + [img.unsqueeze(0)] * truncation_number, dim=0) + + # pack `inputs` + inputs = {} + for key in self.ser_keys: + if key not in results: + continue + value = to_tensor(results[key]) + inputs[key] = value + packed_results['inputs'] = inputs + + # pack `data_samples` + data_samples = [] + for truncation_idx in range(truncation_number): + data_sample = SERDataSample() + gt_label = LabelData() + assert 'labels' in results, 'key `labels` not in results.' + value = to_tensor(results['labels'][truncation_idx]) + gt_label.item = value + data_sample.gt_label = gt_label + meta = {} + for key in self.meta_keys: + meta[key] = results[key] + data_sample.set_metainfo(meta) + data_samples.append(data_sample) + packed_results['data_samples'] = data_samples + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str From d21a1816053b9714869ecef7eeeed3940d89c9e9 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 1 May 2023 23:02:57 +0800 Subject: [PATCH 34/50] =?UTF-8?q?=E5=B0=86=E6=95=B0=E6=8D=AE=E5=A4=84?= =?UTF-8?q?=E7=90=86=E9=83=A8=E5=88=86=E4=BB=A3=E7=A0=81=E7=A7=BB=E5=8A=A8?= =?UTF-8?q?=E5=88=B0project=E4=B8=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/datasets/__init__.py | 3 +- mmocr/datasets/transforms/__init__.py | 51 +++-- mmocr/datasets/transforms/formatting.py | 105 +-------- mmocr/models/__init__.py | 1 - mmocr/models/ser/ser_postprocessor.py | 26 --- projects/LayoutLMv3/README.md | 143 ++++++++++++ projects/LayoutLMv3/__init__.py | 4 + .../ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py | 4 +- projects/LayoutLMv3/datasets/__init__.py | 5 + .../datasets/transforms/__init__.py | 10 + .../transforms/layoutlmv3_transforms.py | 127 +++++------ projects/LayoutLMv3/datasets/utils.py | 63 ++++++ .../LayoutLMv3}/datasets/xfund_dataset.py | 0 .../LayoutLMv3/models}/__init__.py | 6 +- .../models}/hf_layoutlmv3_wrapper.py | 18 +- projects/LayoutLMv3/models/loss_processor.py | 18 ++ .../LayoutLMv3/models/ser_postprocessor.py | 84 +++++++ projects/LayoutLMv3/scripts/run_ser.sh | 5 +- projects/LayoutLMv3/test.py | 76 ------- projects/LayoutLMv3/tools/train.py | 116 ++++++++++ projects/LayoutLMv3/utils/bio_label_utils.py | 14 ++ projects/LayoutLMv3/visualization/__init__.py | 3 + .../visualization/ser_visualizer.py | 211 ++++++++++++++++++ 23 files changed, 792 insertions(+), 301 deletions(-) delete mode 100644 mmocr/models/ser/ser_postprocessor.py create mode 100644 projects/LayoutLMv3/__init__.py create mode 100644 projects/LayoutLMv3/datasets/__init__.py create mode 100644 projects/LayoutLMv3/datasets/transforms/__init__.py rename {mmocr => projects/LayoutLMv3}/datasets/transforms/layoutlmv3_transforms.py (71%) create 
mode 100644 projects/LayoutLMv3/datasets/utils.py rename {mmocr => projects/LayoutLMv3}/datasets/xfund_dataset.py (100%) rename {mmocr/models/ser => projects/LayoutLMv3/models}/__init__.py (50%) rename {mmocr/models/ser => projects/LayoutLMv3/models}/hf_layoutlmv3_wrapper.py (91%) create mode 100644 projects/LayoutLMv3/models/loss_processor.py create mode 100644 projects/LayoutLMv3/models/ser_postprocessor.py delete mode 100644 projects/LayoutLMv3/test.py create mode 100755 projects/LayoutLMv3/tools/train.py create mode 100644 projects/LayoutLMv3/utils/bio_label_utils.py create mode 100644 projects/LayoutLMv3/visualization/__init__.py create mode 100644 projects/LayoutLMv3/visualization/ser_visualizer.py diff --git a/mmocr/datasets/__init__.py b/mmocr/datasets/__init__.py index f7e746149..54a9ea7f0 100644 --- a/mmocr/datasets/__init__.py +++ b/mmocr/datasets/__init__.py @@ -7,9 +7,8 @@ from .samplers import * # NOQA from .transforms import * # NOQA from .wildreceipt_dataset import WildReceiptDataset -from .xfund_dataset import XFUNDDataset __all__ = [ 'IcdarDataset', 'OCRDataset', 'RecogLMDBDataset', 'RecogTextDataset', - 'WildReceiptDataset', 'ConcatDataset', 'XFUNDDataset' + 'WildReceiptDataset', 'ConcatDataset' ] diff --git a/mmocr/datasets/transforms/__init__.py b/mmocr/datasets/transforms/__init__.py index e72031dfc..e4131184b 100644 --- a/mmocr/datasets/transforms/__init__.py +++ b/mmocr/datasets/transforms/__init__.py @@ -1,11 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .adapters import MMDet2MMOCR, MMOCR2MMDet -from .formatting import (PackKIEInputs, PackSERInputs, PackTextDetInputs, - PackTextRecogInputs) -from .layoutlmv3_transforms import (ConvertBIOLabelForSER, - LoadProcessorFromPretrainedModel, - ProcessImageForLayoutLMv3, - ProcessTokenForLayoutLMv3) +from .formatting import PackKIEInputs, PackTextDetInputs, PackTextRecogInputs from .loading import (InferencerLoader, LoadImageFromFile, LoadImageFromNDArray, LoadKIEAnnotations, LoadOCRAnnotations) @@ -20,15 +15,37 @@ from .wrappers import ConditionApply, ImgAugWrapper, TorchVisionWrapper __all__ = [ - 'LoadOCRAnnotations', 'RandomRotate', 'ImgAugWrapper', 'SourceImagePad', - 'TextDetRandomCropFlip', 'PyramidRescale', 'TorchVisionWrapper', 'Resize', - 'RandomCrop', 'TextDetRandomCrop', 'RandomCrop', 'PackTextDetInputs', - 'PackTextRecogInputs', 'RescaleToHeight', 'PadToWidth', - 'ShortScaleAspectJitter', 'RandomFlip', 'BoundedScaleAspectJitter', - 'PackKIEInputs', 'LoadKIEAnnotations', 'FixInvalidPolygon', 'MMDet2MMOCR', - 'MMOCR2MMDet', 'LoadImageFromFile', 'LoadImageFromNDArray', 'CropHeight', - 'InferencerLoader', 'RemoveIgnored', 'ConditionApply', 'CropHeight', - 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels', - 'PackSERInputs', 'ProcessImageForLayoutLMv3', 'ProcessTokenForLayoutLMv3', - 'LoadProcessorFromPretrainedModel', 'ConvertBIOLabelForSER' + 'LoadOCRAnnotations', + 'RandomRotate', + 'ImgAugWrapper', + 'SourceImagePad', + 'TextDetRandomCropFlip', + 'PyramidRescale', + 'TorchVisionWrapper', + 'Resize', + 'RandomCrop', + 'TextDetRandomCrop', + 'RandomCrop', + 'PackTextDetInputs', + 'PackTextRecogInputs', + 'RescaleToHeight', + 'PadToWidth', + 'ShortScaleAspectJitter', + 'RandomFlip', + 'BoundedScaleAspectJitter', + 'PackKIEInputs', + 'LoadKIEAnnotations', + 'FixInvalidPolygon', + 'MMDet2MMOCR', + 'MMOCR2MMDet', + 'LoadImageFromFile', + 'LoadImageFromNDArray', + 'CropHeight', + 'InferencerLoader', + 'RemoveIgnored', + 'ConditionApply', + 'CropHeight', + 'TextRecogGeneralAug', + 
'ImageContentJitter', + 'ReversePixels', ] diff --git a/mmocr/datasets/transforms/formatting.py b/mmocr/datasets/transforms/formatting.py index 845b2c17f..b9b71437a 100644 --- a/mmocr/datasets/transforms/formatting.py +++ b/mmocr/datasets/transforms/formatting.py @@ -6,7 +6,7 @@ from mmengine.structures import InstanceData, LabelData from mmocr.registry import TRANSFORMS -from mmocr.structures import (KIEDataSample, SERDataSample, TextDetDataSample, +from mmocr.structures import (KIEDataSample, TextDetDataSample, TextRecogDataSample) @@ -328,106 +328,3 @@ def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(meta_keys={self.meta_keys})' return repr_str - - -@TRANSFORMS.register_module() -class PackSERInputs(BaseTransform): - """Pack the inputs data for LayoutLMv3ForTokenClassification model. - - The type of outputs is `dict`: - - - inputs: image converted to tensor, whose shape is (C, H, W). - - data_samples: Two components of ``SERDataSample`` will be updated: - - - gt_instances (InstanceData): Depending on annotations, a subset of the - following keys will be updated: - - - bboxes (torch.Tensor((N, 4), dtype=torch.float32)): The groundtruth - of bounding boxes in the form of [x1, y1, x2, y2]. Renamed from - 'gt_bboxes'. - - labels (torch.LongTensor(N)): The labels of instances. - Renamed from 'gt_bboxes_labels'. - - texts (list[str]): The groundtruth texts. Renamed from 'gt_texts'. - - - metainfo (dict): 'metainfo' is always populated. The contents of the - 'metainfo' depends on ``meta_keys``. By default it includes: - - - "img_path": Path to the image file. - - "img_shape": Shape of the image input to the network as a tuple - (h, w). Note that the image may be zero-padded afterward on the - bottom/right if the batch tensor is larger than this shape. - - "scale_factor": A tuple indicating the ratio of width and height - of the preprocessed image to the original one. - - "ori_shape": Shape of the preprocessed image as a tuple (h, w). - - "id2biolabel": Label id convert to biolabel map dict. - - Args: - meta_keys (Sequence[str], optional): Meta keys to be converted to - the metainfo of ``SERDataSample``. Defaults to ``('img_path', - 'ori_shape', 'img_shape', 'scale_factor', 'id2biolabel')``. - """ - # HF LayoutLMv3ForTokenClassification model input params. - ser_keys = [ - 'input_ids', 'bbox', 'attention_mask', 'pixel_values', 'labels' - ] - - def __init__(self, meta_keys=()): - self.meta_keys = meta_keys - - def transform(self, results: dict) -> dict: - """Method to pack SER input data. - - Args: - results (dict): Result dict from the data pipeline. - - Returns: - dict: - - - 'inputs' (obj:`torch.Tensor`): Data for model forwarding. - - 'data_samples' (obj:`DetDataSample`): The annotation info of the - sample. 
- """ - - packed_results = dict() - if 'pixel_values' in results: - img = results['pixel_values'] - if len(img.shape) < 3: - img = np.expand_dims(img, -1) - # A simple trick to speedup formatting by 3-5 times when - # OMP_NUM_THREADS != 1 - # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 - # for more details - if img.flags.c_contiguous: - img = to_tensor(img) - img = img.permute(2, 0, 1).contiguous() - else: - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - img = to_tensor(img) - results['pixel_values'] = img - - data_sample = SERDataSample() - gt_label = LabelData() - - inputs = {} - for key in self.ser_keys: - if key not in results: - continue - value = to_tensor(results[key]) - if key == 'labels': - gt_label.item = value - inputs[key] = value - packed_results['inputs'] = inputs - data_sample.gt_label = gt_label - - meta = {} - for key in self.meta_keys: - meta[key] = results[key] - data_sample.set_metainfo(meta) - packed_results['data_samples'] = data_sample - - return packed_results - - def __repr__(self) -> str: - repr_str = self.__class__.__name__ - repr_str += f'(meta_keys={self.meta_keys})' - return repr_str diff --git a/mmocr/models/__init__.py b/mmocr/models/__init__.py index 9f57d5007..abea668b3 100644 --- a/mmocr/models/__init__.py +++ b/mmocr/models/__init__.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .common import * # NOQA from .kie import * # NOQA -from .ser import * # NOQA from .textdet import * # NOQA from .textrecog import * # NOQA diff --git a/mmocr/models/ser/ser_postprocessor.py b/mmocr/models/ser/ser_postprocessor.py deleted file mode 100644 index 2fdc4158a..000000000 --- a/mmocr/models/ser/ser_postprocessor.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Sequence - -import torch -import torch.nn as nn -from mmengine.structures import LabelData - -from mmocr.registry import MODELS -from mmocr.structures import SERDataSample - - -@MODELS.register_module() -class SERPostprocessor(nn.Module): - """PostProcessor for SER.""" - - def __call__(self, outputs: torch.Tensor, - data_samples: Sequence[SERDataSample] - ) -> Sequence[SERDataSample]: - outputs = outputs.cpu().detach() - max_value, max_idx = torch.max(outputs, -1) - for batch_idx in range(outputs.size(0)): - pred_label = LabelData() - pred_label.score = max_value[batch_idx] - pred_label.item = max_idx[batch_idx] - data_samples[batch_idx].pred_label = pred_label - return data_samples diff --git a/projects/LayoutLMv3/README.md b/projects/LayoutLMv3/README.md index e69de29bb..c2a22bfb2 100644 --- a/projects/LayoutLMv3/README.md +++ b/projects/LayoutLMv3/README.md @@ -0,0 +1,143 @@ +# Dummy ResNet Wrapper + +> This is a README template for community `projects/`. + +> All the fields in this README are **mandatory** for others to understand what you have achieved in this implementation. If you still feel unclear about the requirements, please read our [contribution guide](https://mmocr.readthedocs.io/en/dev-1.x/notes/contribution_guide.html), [projects FAQ](../faq.md), or approach us in [Discussions](https://github.com/open-mmlab/mmocr/discussions). + +## Description + +> Share any information you would like others to know. For example: +> +> Author: @xxx. +> +> This is an implementation of \[XXX\]. + +This project implements a dummy ResNet wrapper, which literally does nothing new but prints "hello world" during initialization. 
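
For orientation, the SER configs in this series derive `num_labels = len(classes) * 2 - 1` from the XFUND class set `('answer', 'header', 'question', 'other')`: the `other` class collapses to a single `O` tag and every remaining class receives `B-`/`I-` tags. The sketch below mirrors that mapping outside the `ConvertBIOLabelForSER` transform; `build_biolabel2id` is a hypothetical helper name used only for illustration and is not part of the codebase.

```python
# Minimal sketch of the BIO label space assumed by the SER configs
# (7 labels for 4 classes, since 'other' maps to a single 'O' tag).
def build_biolabel2id(classes, other_label='other'):
    """Hypothetical helper mirroring ConvertBIOLabelForSER's label mapping."""
    bio_labels = ['O']
    for name in sorted(c for c in classes if c != other_label):
        bio_labels.extend([f'B-{name}', f'I-{name}'])
    return {label: idx for idx, label in enumerate(bio_labels)}


if __name__ == '__main__':
    classes = ('answer', 'header', 'question', 'other')
    print(build_biolabel2id(classes))
    # {'O': 0, 'B-answer': 1, 'I-answer': 2, 'B-header': 3, 'I-header': 4,
    #  'B-question': 5, 'I-question': 6}
```

The same ordering (sorted class names, `O` at index 0) is what the transform's `_generate_biolabel2id_map` produces, which is why the config can size the classification head as `len(classes) * 2 - 1`.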
+ +## Usage + +> For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. + +### Prerequisites + +- Python 3.7 +- PyTorch 1.6 or higher +- [MIM](https://github.com/open-mmlab/mim) +- [MMOCR](https://github.com/open-mmlab/mmocr) + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `example_project/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +# Linux +export PYTHONPATH=`pwd`:$PYTHONPATH +# Windows PowerShell +$env:PYTHONPATH=Get-Location +``` + +### Training commands + +In MMOCR's root directory, run the following command to train the model: + +```bash +mim train mmocr configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py --work-dir work_dirs/dummy_mae/ +``` + +To train on multiple GPUs, e.g. 8 GPUs, run the following command: + +```bash +mim train mmocr configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py --work-dir work_dirs/dummy_mae/ --launcher pytorch --gpus 8 +``` + +### Testing commands + +In MMOCR's root directory, run the following command to test the model: + +```bash +mim test mmocr configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py --work-dir work_dirs/dummy_mae/ --checkpoint ${CHECKPOINT_PATH} +``` + +## Results + +> List the results as usually done in other model's README. [Example](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet/README.md#results-and-models) +> +> You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project. + +| Method | Backbone | Pretrained Model | Training set | Test set | #epoch | Test size | Precision | Recall | Hmean | Download | +| :---------------------------------------------------------------: | :---------: | :--------------: | :-------------: | :------------: | :----: | :-------: | :-------: | :----: | :----: | :----------------------: | +| [DBNet_dummy](configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py) | DummyResNet | - | ICDAR2015 Train | ICDAR2015 Test | 1200 | 736 | 0.8853 | 0.7583 | 0.8169 | [model](<>) \| [log](<>) | + +## Citation + +> You may remove this section if not applicable. + +```bibtex +@software{MMOCR_Contributors_OpenMMLab_Text_Detection_2020, +author = {{MMOCR Contributors}}, +license = {Apache-2.0}, +month = {8}, +title = {{OpenMMLab Text Detection, Recognition and Understanding Toolbox}}, +url = {https://github.com/open-mmlab/mmocr}, +version = {0.3.0}, +year = {2020} +} +``` + +## Checklist + +Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. + +> The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR. +> +> OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone. 
+> +> Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed. +> +> A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + > The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmocr.registry.MODELS` and configurable via a config file. + + - [ ] Basic docstrings & proper citation + + > Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) + + - [ ] Test-time correctness + + > If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. + + - [ ] A full README + + > As this template does. + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + > If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + > Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmocr/blob/76637a290507f151215d299707c57cea5120976e/mmocr/utils/polygon_utils.py#L80-L96) + + - [ ] Unit tests + + > Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmocr/blob/76637a290507f151215d299707c57cea5120976e/tests/test_utils/test_polygon_utils.py#L97-L106) + + - [ ] Code polishing + + > Refactor your code according to reviewer's comment. + + - [ ] Metafile.yml + + > It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet/metafile.yml) + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + > In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
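
A note on the evaluation side of this series: `SeqevalMetric` collects one BIO tag sequence per sample in `process()` and hands the aggregated lists to `seqeval` in `compute_metrics()`. The snippet below is a minimal sketch of that entity-level scoring, assuming the `seqeval` package is installed; the tag sequences are illustrative only.

```python
# Minimal sketch of the entity-level scoring done by SeqevalMetric,
# assuming the `seqeval` package is available. Tag sequences are illustrative.
from seqeval.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

gts = [['B-question', 'I-question', 'O', 'B-answer', 'I-answer']]
preds = [['B-question', 'I-question', 'O', 'B-answer', 'O']]

# Only the 'question' entity is matched exactly, so precision/recall/f1
# come out to 0.5 here, while token-level accuracy is 4/5 = 0.8.
print(precision_score(gts, preds))
print(recall_score(gts, preds))
print(f1_score(gts, preds))
print(accuracy_score(gts, preds))
```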
diff --git a/projects/LayoutLMv3/__init__.py b/projects/LayoutLMv3/__init__.py new file mode 100644 index 000000000..8c347c22d --- /dev/null +++ b/projects/LayoutLMv3/__init__.py @@ -0,0 +1,4 @@ +from .datasets import * # NOQA +from .evaluation import * # NOQA +from .models import * # NOQA +from .visualization import * # NOQA diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py index 3f883bda3..a9fc3f896 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py @@ -11,7 +11,7 @@ max_iters = 1000 val_interval = 100 lr = 7e-5 -train_batch_size_per_gpu = 8 +train_batch_size_per_gpu = 2 train_num_workers = 8 test_batch_size_per_gpu = 1 # can't batch inference now test_num_workers = 8 @@ -115,7 +115,7 @@ test_evaluator = val_evaluator # ==================================================================== # ======================= Visualization ============================== -vis_backends = [dict(type='TensorboardVisBackend')] +vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='SERLocalVisualizer', name='visualizer', vis_backends=vis_backends) # ==================================================================== diff --git a/projects/LayoutLMv3/datasets/__init__.py b/projects/LayoutLMv3/datasets/__init__.py new file mode 100644 index 000000000..691c8198a --- /dev/null +++ b/projects/LayoutLMv3/datasets/__init__.py @@ -0,0 +1,5 @@ +from .transforms import * # NOQA +from .utils import long_text_data_collate +from .xfund_dataset import XFUNDDataset + +__all__ = ['XFUNDDataset', 'long_text_data_collate'] diff --git a/projects/LayoutLMv3/datasets/transforms/__init__.py b/projects/LayoutLMv3/datasets/transforms/__init__.py new file mode 100644 index 000000000..c837267e0 --- /dev/null +++ b/projects/LayoutLMv3/datasets/transforms/__init__.py @@ -0,0 +1,10 @@ +from .formatting import PackSERInputs +from .layoutlmv3_transforms import (ConvertBIOLabelForSER, + LoadProcessorFromPretrainedModel, + ProcessImageForLayoutLMv3, + ProcessTokenForLayoutLMv3) + +__all__ = [ + 'LoadProcessorFromPretrainedModel', 'ProcessImageForLayoutLMv3', + 'ProcessTokenForLayoutLMv3', 'ConvertBIOLabelForSER', 'PackSERInputs' +] diff --git a/mmocr/datasets/transforms/layoutlmv3_transforms.py b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py similarity index 71% rename from mmocr/datasets/transforms/layoutlmv3_transforms.py rename to projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py index cfa299fc6..2bed95708 100644 --- a/mmocr/datasets/transforms/layoutlmv3_transforms.py +++ b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from mmcv.transforms.base import BaseTransform from mmocr.registry import TRANSFORMS +from projects.LayoutLMv3.utils.bio_label_utils import \ + find_other_label_name_of_biolabel from transformers import LayoutLMv3ImageProcessor, LayoutXLMTokenizerFast from transformers.file_utils import PaddingStrategy from transformers.image_processing_utils import BatchFeature @@ -123,7 +125,8 @@ class ProcessTokenForLayoutLMv3(BaseTransform): - input_ids - attention_mask - bbox - - word_ids + - truncation_number + - truncation_word_ids Args: Refer to the parameters of the corresponding tokenizer @@ -140,6 +143,20 @@ def __init__(self, self.truncation = truncation self.pad_to_multiple_of = pad_to_multiple_of + def box_norm(self, box, width, height) -> List: + + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + def _tokenize(self, results: dict) -> None: tokenizer = results['tokenizer'] @@ -147,9 +164,14 @@ def _tokenize(self, results: dict) -> None: texts = instances['texts'] boxes = instances['boxes'] + # norm boxes + width = results['width'] + height = results['height'] + norm_boxes = [self.box_norm(box, width, height) for box in boxes] + tokenized_inputs: BatchEncoding = tokenizer( text=texts, - boxes=boxes, + boxes=norm_boxes, padding=self.padding, max_length=self.max_length, truncation=self.truncation, @@ -157,45 +179,20 @@ def _tokenize(self, results: dict) -> None: add_special_tokens=True, return_tensors='np', return_attention_mask=True, - return_offsets_mapping=True) + return_overflowing_tokens=True) - # By default, the pipeline processes one sample - # at a time, so set batch_index = 0. 
- batch_index = 0 + truncation_number = tokenized_inputs['input_ids'].shape[0] + results['truncation_number'] = truncation_number # record input_ids/attention_mask/bbox for k in ['input_ids', 'attention_mask', 'bbox']: - results[k] = tokenized_inputs[k][batch_index] - # record word_ids - results['word_ids'] = tokenized_inputs.encodings[batch_index].word_ids - - def _norm_boxes(self, results: dict) -> None: - - def box_norm(box, width, height): - - def clip(min_num, num, max_num): - return min(max(num, min_num), max_num) - - x0, y0, x1, y1 = box - x0 = clip(0, int((x0 / width) * 1000), 1000) - y0 = clip(0, int((y0 / height) * 1000), 1000) - x1 = clip(0, int((x1 / width) * 1000), 1000) - y1 = clip(0, int((y1 / height) * 1000), 1000) - assert x1 >= x0 - assert y1 >= y0 - return [x0, y0, x1, y1] - - instances = results['instances'] - boxes = instances['boxes'] - - # norm boxes - width = results['width'] - height = results['height'] - norm_boxes = [box_norm(box, width, height) for box in boxes] - - results['instances']['boxes'] = norm_boxes + results[k] = tokenized_inputs[k] + # record truncation_word_ids + results['truncation_word_ids'] = [ + tokenized_inputs.encodings[batch_index].word_ids + for batch_index in range(truncation_number) + ] def transform(self, results: dict) -> Dict: - self._norm_boxes(results) self._tokenize(results) return results @@ -207,7 +204,7 @@ class ConvertBIOLabelForSER(BaseTransform): Required Keys: - tokenizer - - word_ids + - truncation_word_ids - instances - labels @@ -225,15 +222,15 @@ def __init__(self, classes: Union[tuple, list], only_label_first_subword: bool = False) -> None: super().__init__() + self.other_label_name = find_other_label_name_of_biolabel(classes) self.biolabel2id = self._generate_biolabel2id_map(classes) self.only_label_first_subword = only_label_first_subword def _generate_biolabel2id_map(self, classes: Union[tuple, list]) -> Dict: bio_label_list = [] - classes = sorted([c.upper() for c in classes]) - for c in classes: - if c == 'OTHER': - bio_label_list.insert(0, c) + for c in sorted(classes): + if c == self.other_label_name: + bio_label_list.insert(0, 'O') else: bio_label_list.append(f'B-{c}') bio_label_list.append(f'I-{c}') @@ -247,29 +244,33 @@ def _convert(self, results: dict) -> None: tokenizer = results['tokenizer'] instances = results['instances'] - labels = [label.upper() for label in instances['labels']] - word_ids = results['word_ids'] - - biolabel_ids = [] - pre_word_id = None - for cur_word_id in word_ids: - if cur_word_id is not None: - if cur_word_id != pre_word_id: - biolabel_name = f'B-{labels[cur_word_id]}' \ - if labels[cur_word_id] != 'OTHER' else 'OTHER' - elif self.only_label_first_subword: - biolabel_name = 'OTHER' + labels = [label for label in instances['labels']] + + batch_biolabel_ids = [] + for truncation_word_ids in results['truncation_word_ids']: + biolabel_ids = [] + pre_word_id = None + for cur_word_id in truncation_word_ids: + if cur_word_id is not None: + if cur_word_id != pre_word_id: + biolabel_name = f'B-{labels[cur_word_id]}' \ + if labels[cur_word_id] != \ + self.other_label_name else 'O' + elif self.only_label_first_subword: + biolabel_name = 'O' + else: + biolabel_name = f'I-{labels[cur_word_id]}' \ + if labels[cur_word_id] != \ + self.other_label_name else 'O' + # convert biolabel to id + biolabel_ids.append(self.biolabel2id[biolabel_name]) else: - biolabel_name = f'I-{labels[cur_word_id]}' \ - if labels[cur_word_id] != 'OTHER' else 'OTHER' - # convert biolabel to id - 
biolabel_ids.append(self.biolabel2id[biolabel_name]) - else: - biolabel_ids.append(tokenizer.pad_token_label) - pre_word_id = cur_word_id + biolabel_ids.append(tokenizer.pad_token_label) + pre_word_id = cur_word_id + batch_biolabel_ids.append(biolabel_ids) - # record biolabel_ids - results['labels'] = biolabel_ids + # record batch_biolabel_ids + results['labels'] = batch_biolabel_ids def transform(self, results: dict) -> Dict: self._convert(results) diff --git a/projects/LayoutLMv3/datasets/utils.py b/projects/LayoutLMv3/datasets/utils.py new file mode 100644 index 000000000..55775b70e --- /dev/null +++ b/projects/LayoutLMv3/datasets/utils.py @@ -0,0 +1,63 @@ +from typing import Any, Mapping, Sequence + +import torch +from mmengine.dataset.utils import COLLATE_FUNCTIONS +from mmengine.structures import BaseDataElement + + +@COLLATE_FUNCTIONS.register_module() +def long_text_data_collate(data_batch: Sequence, training: bool = True) -> Any: + """This code is referenced from + ``mmengine.dataset.utils.default_collate``""" + data_item = data_batch[0] + data_item_type = type(data_item) + + if isinstance(data_item, (BaseDataElement, str, bytes)): + return data_batch + elif isinstance(data_item, tuple) and hasattr(data_item, '_fields'): + # named_tuple + return data_item_type(*(long_text_data_collate(samples) + for samples in zip(*data_batch))) + elif isinstance(data_item, list): + flattened_data_batch = [ + sub_item for item in data_batch for sub_item in item + ] + if training: + return flattened_data_batch[:len(data_batch)] + else: + return flattened_data_batch + elif isinstance(data_item, Sequence): + # check to make sure that the data_itements in batch have + # consistent size + it = iter(data_batch) + data_item_size = len(next(it)) + if not all(len(data_item) == data_item_size for data_item in it): + raise RuntimeError( + 'each data_itement in list of batch should be of equal size') + transposed = list(zip(*data_batch)) + + if isinstance(data_item, tuple): + return [long_text_data_collate(samples) + for samples in transposed] # Compat with Pytorch. + else: + try: + return data_item_type([ + long_text_data_collate(samples) for samples in transposed + ]) + except TypeError: + # The sequence type may not support `__init__(iterable)` + # (e.g., `range`). + return [ + long_text_data_collate(samples) for samples in transposed + ] + elif isinstance(data_item, Mapping): + return data_item_type({ + key: long_text_data_collate([d[key] for d in data_batch]) + for key in data_item + }) + else: + concat_data_batch = torch.concat(data_batch, dim=0) + if training: + return concat_data_batch[:len(data_batch)] + else: + return concat_data_batch diff --git a/mmocr/datasets/xfund_dataset.py b/projects/LayoutLMv3/datasets/xfund_dataset.py similarity index 100% rename from mmocr/datasets/xfund_dataset.py rename to projects/LayoutLMv3/datasets/xfund_dataset.py diff --git a/mmocr/models/ser/__init__.py b/projects/LayoutLMv3/models/__init__.py similarity index 50% rename from mmocr/models/ser/__init__.py rename to projects/LayoutLMv3/models/__init__.py index 4188c5950..e84b0b2ac 100644 --- a/mmocr/models/ser/__init__.py +++ b/projects/LayoutLMv3/models/__init__.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
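Because `return_overflowing_tokens=True` can split one long document into several 512-token segments, each dataset item now yields tensors whose first dimension is the segment count, so the default collate no longer applies. `long_text_data_collate` therefore concatenates along dim 0 and, during training, keeps only the first `len(data_batch)` segments so the effective batch size stays fixed. A simplified, self-contained illustration of that tensor branch (toy shapes only, not the registered function):

import torch

def toy_collate(batch, training=True):
    out = {}
    for key in batch[0]:
        # each value has shape (num_truncated_segments, max_length)
        cat = torch.cat([sample[key] for sample in batch], dim=0)
        out[key] = cat[:len(batch)] if training else cat
    return out

batch = [{'input_ids': torch.zeros(1, 512, dtype=torch.long)},
         {'input_ids': torch.zeros(3, 512, dtype=torch.long)}]
print(toy_collate(batch)['input_ids'].shape)                  # torch.Size([2, 512])
print(toy_collate(batch, training=False)['input_ids'].shape)  # torch.Size([4, 512])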
from .hf_layoutlmv3_wrapper import HFLayoutLMv3ForTokenClassificationWrapper +from .loss_processor import ComputeLossAfterLabelSmooth from .ser_postprocessor import SERPostprocessor -__all__ = ['HFLayoutLMv3ForTokenClassificationWrapper', 'SERPostprocessor'] +__all__ = [ + 'HFLayoutLMv3ForTokenClassificationWrapper', 'SERPostprocessor', + 'ComputeLossAfterLabelSmooth' +] diff --git a/mmocr/models/ser/hf_layoutlmv3_wrapper.py b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py similarity index 91% rename from mmocr/models/ser/hf_layoutlmv3_wrapper.py rename to projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py index efc8bcbef..e5dbcdce7 100644 --- a/mmocr/models/ser/hf_layoutlmv3_wrapper.py +++ b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py @@ -6,7 +6,7 @@ from mmocr.registry import MODELS from mmocr.utils.typing_utils import OptSERSampleList, SERSampleList -from transformers import AutoModelForTokenClassification +from transformers import LayoutLMv3ForTokenClassification from transformers.modeling_outputs import TokenClassifierOutput ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, @@ -19,6 +19,7 @@ class HFLayoutLMv3ForTokenClassificationWrapper(BaseModel): def __init__(self, layoutlmv3_token_classifier: dict = dict( pretrained_model_name_or_path=None), + loss_processor: Optional[Dict] = None, data_preprocessor: Optional[Dict] = None, postprocessor: Optional[Dict] = None, init_cfg: Optional[Dict] = None): @@ -27,13 +28,17 @@ def __init__(self, if isinstance(layoutlmv3_token_classifier, dict) and \ layoutlmv3_token_classifier.get( 'pretrained_model_name_or_path', None): - self.model = AutoModelForTokenClassification.from_pretrained( + self.model = LayoutLMv3ForTokenClassification.from_pretrained( **layoutlmv3_token_classifier) else: raise TypeError( 'layoutlmv3_token_classifier cfg should be a `dict` and a key ' '`pretrained_model_name_or_path` must be specified') + if loss_processor is not None: + assert isinstance(loss_processor, dict) + self.loss_processor = MODELS.build(loss_processor) + if postprocessor is not None: assert isinstance(postprocessor, dict) self.postprocessor = MODELS.build(postprocessor) @@ -93,8 +98,9 @@ def loss(self, inputs: torch.Tensor, data_samples: SERSampleList) -> Dict: Returns: dict[str, Tensor]: A dictionary of loss components. """ + labels = inputs.pop('labels') outputs: TokenClassifierOutput = self.model(**inputs) - return {'ce_loss': outputs['loss']} + return self.loss_processor(outputs, labels) def predict(self, inputs: torch.Tensor, data_samples: SERSampleList) -> SERSampleList: @@ -124,8 +130,7 @@ def predict(self, inputs: torch.Tensor, instance, in (xn, yn) order. """ outputs: TokenClassifierOutput = self.model(**inputs) - logits = outputs['logits'] - return self.postprocessor(logits, data_samples) + return self.postprocessor(outputs['logits'], data_samples) def _forward(self, inputs: torch.Tensor, @@ -144,5 +149,4 @@ def _forward(self, Tensor or tuple[Tensor]: A tuple of features from ``det_head`` forward. 
""" - x = self.extract_feat(inputs) - return self.det_head(x, data_samples) + return self.model(**inputs) diff --git a/projects/LayoutLMv3/models/loss_processor.py b/projects/LayoutLMv3/models/loss_processor.py new file mode 100644 index 000000000..a9ac2b563 --- /dev/null +++ b/projects/LayoutLMv3/models/loss_processor.py @@ -0,0 +1,18 @@ +from mmocr.registry import MODELS +from transformers.trainer_pt_utils import LabelSmoother + + +@MODELS.register_module() +class ComputeLossAfterLabelSmooth(LabelSmoother): + """Compute loss after label-smoothing. + + Args: + epsilon (`float`, *optional*, defaults to 0.1): + The label smoothing factor. + ignore_index (`int`, *optional*, defaults to -100): + The index in the labels to ignore when computing the loss. + """ + + def __call__(self, model_output, labels, shift_labels=False): + loss = super().__call__(model_output, labels, shift_labels) + return {'loss': loss} diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py new file mode 100644 index 000000000..947312b92 --- /dev/null +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, Sequence, Union + +import torch +import torch.nn as nn +from mmengine.structures import LabelData + +from mmocr.registry import MODELS +from mmocr.structures import SERDataSample +from projects.LayoutLMv3.utils.bio_label_utils import \ + find_other_label_name_of_biolabel + + +@MODELS.register_module() +class SERPostprocessor(nn.Module): + """PostProcessor for SER.""" + + def __init__(self, + classes: Union[tuple, list], + ignore_index: int = -100) -> None: + super().__init__() + self.other_label_name = find_other_label_name_of_biolabel(classes) + self.id2biolabel = self._generate_id2biolabel_map(classes) + self.ignore_index = ignore_index + self.softmax = nn.Softmax(dim=-1) + + def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: + bio_label_list = [] + classes = sorted([c for c in classes]) + for c in classes: + if c == self.other_label_name: + bio_label_list.insert(0, 'O') + else: + bio_label_list.append(f'B-{c}') + bio_label_list.append(f'I-{c}') + id2biolabel_map = { + idx: bio_label + for idx, bio_label in enumerate(bio_label_list) + } + return id2biolabel_map + + def __call__(self, outputs: torch.Tensor, + data_samples: Sequence[SERDataSample] + ) -> Sequence[SERDataSample]: + # convert outputs dim from (truncation_num, max_length, label_num) + # to (truncation_num * max_length, label_num) + outputs = outputs.cpu().detach() + truncation_num = outputs.size(0) + outputs = torch.reshape(outputs, (-1, outputs.size(-1))) + # merge gt label ids from data_samples + gt_label_ids = [ + data_samples[truncation_idx].gt_label.item + for truncation_idx in range(truncation_num) + ] + gt_label_ids = torch.cat(gt_label_ids, dim=0).cpu().detach().numpy() + # get pred label ids/scores from outputs + probs = self.softmax(outputs) + max_value, max_idx = torch.max(probs, -1) + pred_label_ids = max_idx.numpy() + pred_label_scores = max_value.numpy() + # select valid token and convert iid to biolabel + gt_biolabels = [ + self.id2biolabel[g] for (g, p) in zip(gt_label_ids, pred_label_ids) + if g != self.ignore_index + ] + pred_biolabels = [ + self.id2biolabel[p] for (g, p) in zip(gt_label_ids, pred_label_ids) + if g != self.ignore_index + ] + pred_biolabel_scores = [ + s for (g, s) in zip(gt_label_ids, pred_label_scores) + if g != self.ignore_index + ] + # 
record pred_label + pred_label = LabelData() + pred_label.item = pred_biolabels + pred_label.score = pred_biolabel_scores + # merge several truncation data_sample to one data_sample + merged_data_sample = copy.deepcopy(data_samples[0]) + merged_data_sample.pred_label = pred_label + # update merged gt_label + merged_data_sample.gt_label.item = gt_biolabels + return [merged_data_sample] diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh index aa03883c2..2b1135ccb 100644 --- a/projects/LayoutLMv3/scripts/run_ser.sh +++ b/projects/LayoutLMv3/scripts/run_ser.sh @@ -1,7 +1,8 @@ -config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_xfund_zh.py' +config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py' export TOKENIZERS_PARALLELISM=false export OMP_NUM_THREADS=1 +export PYTHONPATH='/Users/wangnu/Documents/GitHub/mmocr' -python tools/train.py \ +python ./tools/train.py \ ${config} \ diff --git a/projects/LayoutLMv3/test.py b/projects/LayoutLMv3/test.py deleted file mode 100644 index f5c31adb9..000000000 --- a/projects/LayoutLMv3/test.py +++ /dev/null @@ -1,76 +0,0 @@ -from functools import partial - -from mmengine.config import Config -from mmengine.dataset.utils import COLLATE_FUNCTIONS -from mmengine.registry import DATA_SAMPLERS, init_default_scope -from torch.utils.data import DataLoader - -from mmocr.registry import DATASETS, MODELS - -if __name__ == '__main__': - cfg_path = '/Users/wangnu/Documents/GitHub/mmocr' \ - '/configs/ser/_base_/datasets/xfund_zh.py' - cfg = Config.fromfile(cfg_path) - init_default_scope(cfg.get('default_scope', 'mmocr')) - - pretrained_model = '/Users/wangnu/Documents/GitHub/' \ - 'mmocr/data/layoutlmv3-base-chinese' - classes = ('answer', 'header', 'question', 'other') - - dataset_cfg = cfg.xfund_zh_ser_train - train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color'), - dict( - type='LoadProcessorFromPretrainedModel', - pretrained_model_name_or_path=pretrained_model, - image_processor=dict(size=(224, 224), apply_ocr=False), - tokenizer=dict()), - dict(type='ProcessImageForLayoutLMv3'), - dict( - type='ProcessTokenForLayoutLMv3', - padding='max_length', - max_length=512, - truncation=True), - dict( - type='ConvertBIOLabelForSER', - classes=classes, - only_label_first_subword=True), - dict( - type='PackSERInputs', - meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) - ] - dataset_cfg['pipeline'] = train_pipeline - train_dataset = DATASETS.build(dataset_cfg) - - model_cfg = dict( - type='HFLayoutLMv3ForTokenClassificationWrapper', - layoutlmv3_token_classifier=dict( - pretrained_model_name_or_path=pretrained_model, num_labels=7), - postprocessor=dict(type='SERPostprocessor')) - - collate_fn_cfg = dict(type='default_collate') - collate_fn_type = collate_fn_cfg.pop('type') - collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type) - collate_fn = partial(collate_fn, **collate_fn_cfg) - - sampler_cfg = dict( - type='DefaultSampler', dataset=train_dataset, shuffle=True) - sampler = DATA_SAMPLERS.build(sampler_cfg) - - train_dataloader = DataLoader( - batch_size=2, - dataset=train_dataset, - pin_memory=True, - persistent_workers=True, - sampler=sampler, - collate_fn=collate_fn, - num_workers=8) - - model = MODELS.build(model_cfg) - - for idx, data_batch in enumerate(train_dataloader): - # result = model.forward(**data_batch, mode='loss') - result = model.forward(**data_batch, mode='predict') - break - - print('Done') 
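The SER task turns the four XFUND entity classes into a BIO tag set: every class except the `other` placeholder gets a `B-`/`I-` pair, while `other` collapses into a single `O` tag, which matches the `num_labels=7` used when building the token classifier in the removed test script. A short sketch of the mapping that `_generate_id2biolabel_map` above (and its `biolabel2id` counterpart in `ConvertBIOLabelForSER`) produces, using the class tuple from that script:

classes = ('answer', 'header', 'question', 'other')  # XFUND SER classes
other_label = 'other'
bio_labels = []
for c in sorted(classes):
    if c == other_label:
        bio_labels.insert(0, 'O')           # words labelled 'other' become plain 'O'
    else:
        bio_labels += [f'B-{c}', f'I-{c}']  # begin/inside tags for each entity class
id2biolabel = dict(enumerate(bio_labels))
print(id2biolabel)
# {0: 'O', 1: 'B-answer', 2: 'I-answer', 3: 'B-header', 4: 'I-header',
#  5: 'B-question', 6: 'I-question'}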
diff --git a/projects/LayoutLMv3/tools/train.py b/projects/LayoutLMv3/tools/train.py new file mode 100755 index 000000000..79c3888bb --- /dev/null +++ b/projects/LayoutLMv3/tools/train.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import logging +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.registry import RUNNERS +from mmengine.runner import Runner + +from projects.LayoutLMv3 import * # NOQA + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a model') + parser.add_argument('config', help='Train config file path') + parser.add_argument('--work-dir', help='The dir to save logs and models') + parser.add_argument( + '--resume', action='store_true', help='Whether to resume checkpoint.') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='Enable automatic-mixed-precision training') + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='Whether to scale the learning rate automatically. It requires ' + '`auto_scale_lr` in config, and `base_batch_size` in `auto_scale_lr`') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='Override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='Job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. 
+ parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # enable automatic-mixed-precision training + if args.amp: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + if args.resume: + cfg.resume = True + + # enable automatically scaling LR + if args.auto_scale_lr: + if 'auto_scale_lr' in cfg and \ + 'base_batch_size' in cfg.auto_scale_lr: + cfg.auto_scale_lr.enable = True + else: + raise RuntimeError('Can not find "auto_scale_lr" or ' + '"auto_scale_lr.base_batch_size" in your' + ' configuration file.') + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/projects/LayoutLMv3/utils/bio_label_utils.py b/projects/LayoutLMv3/utils/bio_label_utils.py new file mode 100644 index 000000000..0e5c94e53 --- /dev/null +++ b/projects/LayoutLMv3/utils/bio_label_utils.py @@ -0,0 +1,14 @@ +from typing import List, Tuple, Union + + +def find_other_label_name_of_biolabel(classes: Union[List[str], Tuple[str]]): + """Find the original name of BIO label `O` + + Args: + classes (List[str]): The list or tuple of class_names. + """ + valid_other_label_names = ('other', 'Other', 'OTHER') + for c in classes: + if c in valid_other_label_names: + return c + return None diff --git a/projects/LayoutLMv3/visualization/__init__.py b/projects/LayoutLMv3/visualization/__init__.py new file mode 100644 index 000000000..fa9a62c1d --- /dev/null +++ b/projects/LayoutLMv3/visualization/__init__.py @@ -0,0 +1,3 @@ +from .ser_visualizer import SERLocalVisualizer + +__all__ = ['SERLocalVisualizer'] diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py new file mode 100644 index 000000000..d22855001 --- /dev/null +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
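Before the visualizer below, note how the new `find_other_label_name_of_biolabel` helper behaves: it returns whichever spelling of the background class ('other', 'Other' or 'OTHER') is present in the class list, or None when there is none, so downstream code no longer has to force-uppercase labels. A tiny usage sketch (it assumes the repository root is on PYTHONPATH, as the project scripts arrange):

from projects.LayoutLMv3.utils.bio_label_utils import \
    find_other_label_name_of_biolabel

print(find_other_label_name_of_biolabel(('answer', 'header', 'question', 'other')))  # 'other'
print(find_other_label_name_of_biolabel(('answer', 'header', 'question')))           # None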
+from typing import Dict, List, Optional, Tuple, Union + +import mmcv +import numpy as np +import torch +from mmdet.visualization.palette import _get_adaptive_scales +from mmengine.structures import LabelData + +from mmocr.registry import VISUALIZERS +from mmocr.structures import SERDataSample +from mmocr.visualization.base_visualizer import BaseLocalVisualizer + + +@VISUALIZERS.register_module() +class SERLocalVisualizer(BaseLocalVisualizer): + """The MMOCR Semantic Entity Recognition Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): The origin image to draw. The format + should be RGB. Defaults to None. + with_poly (bool): Whether to draw polygons. Defaults to True. + with_bbox (bool): Whether to draw bboxes. Defaults to False. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + bbox_color (Union[str, tuple, list[str], list[tuple]]): The + colors of bboxes. ``colors`` can have the same + length with lines or just single value. If ``colors`` is single + value, all the lines will have the same colors. Refer to + `matplotlib.colors` for full list of formats that are accepted. + Defaults to 'b'. + label_color (Union[str, tuple, list[str], list[tuple]]): The + colors of gt/pred label. ``colors`` can have + the same length with lines or just single value. If ``colors`` + is single value, all the lines will have the same colors. Refer + to `matplotlib.colors` for full list of formats that are accepted. + Defaults to 'g'. + line_width (int, float): The linewidth of lines. Defaults to 2. + alpha (float): The transparency of bboxes or polygons. Defaults to 0.8. + """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + with_poly: bool = True, + with_bbox: bool = False, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + bbox_color: Union[str, Tuple, List[str], List[Tuple]] = 'b', + label_color: Union[str, Tuple, List[str], List[Tuple]] = 'g', + line_width: Union[int, float] = 2, + alpha: float = 0.8) -> None: + super().__init__( + name=name, + image=image, + vis_backends=vis_backends, + save_dir=save_dir) + self.with_poly = with_poly + self.with_bbox = with_bbox + self.bbox_color = bbox_color + self.label_color = label_color + self.line_width = line_width + self.alpha = alpha + + def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, + torch.Tensor], + word_ids: List[int], gt_labels: Optional[LabelData], + pred_labels: Optional[LabelData]) -> np.ndarray: + """Draw bboxes and polygons on image. + + Args: + image (np.ndarray): The origin image to draw. + bboxes (Union[np.ndarray, torch.Tensor]): The bboxes to draw. + word_ids (List[int]): The word id of tokens. + gt_labels (Optional[LabelData]): The gt LabelData. + pred_labels (Optional[LabelData]): The pred LabelData. + Returns: + np.ndarray: The image with bboxes and gt/pred labels drawn. 
+ """ + # draw bboxes + if bboxes is not None and self.with_bbox: + image = self.get_bboxes_image( + image, + bboxes, + colors=self.bbox_color, + line_width=self.line_width, + alpha=self.alpha) + + # draw gt/pred labels + if gt_labels is not None and pred_labels is not None: + gt_tokens_biolabel = gt_labels.item + gt_words_label = [] + pred_tokens_biolabel = pred_labels.item + pred_words_label = [] + + if 'score' in pred_labels: + pred_tokens_biolabel_score = pred_labels.score + pred_words_label_score = [] + else: + pred_tokens_biolabel_score = None + pred_words_label_score = None + + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: + gt_words_label_name = gt_tokens_biolabel[idx][2:] \ + if gt_tokens_biolabel[idx] != 'O' else 'other' + gt_words_label.append(gt_words_label_name) + pred_words_label_name = pred_tokens_biolabel[idx][2:] \ + if pred_tokens_biolabel[idx] != 'O' else 'other' + pred_words_label.append(pred_words_label_name) + if pred_tokens_biolabel_score is not None: + pred_words_label_score.append( + pred_tokens_biolabel_score[idx]) + pre_word_id = cur_word_id + assert len(gt_words_label) == len(bboxes) + assert len(pred_words_label) == len(bboxes) + + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + positions = bboxes[:, :2] - self.line_width + + self.set_image(image) + for i, (pos, gt, pred) in enumerate( + zip(positions, gt_words_label, pred_words_label)): + if pred_words_label_score is not None: + score = round(float(pred_words_label_score[i]) * 100, 1) + label_text = f'{gt} | {pred}({score})' + else: + label_text = f'{gt} | {pred}' + + self.draw_texts( + label_text, + pos, + color=self.label_color if gt == pred else 'r', + font_sizes=int(13 * scales[i])) + + return self.get_image() + + def add_datasample(self, + name: str, + image: np.ndarray, + data_sample: Optional[SERDataSample] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. This is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`TextDetDataSample`, optional): + TextDetDataSample which contains gt and prediction. Defaults + to None. + draw_gt (bool): Whether to draw GT TextDetDataSample. + Defaults to True. + draw_pred (bool): Whether to draw Predicted TextDetDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0. 
+ """ + cat_images = [] + if data_sample is not None: + bboxes = np.array(data_sample.instances.get('boxes', None)) + # here need to flatten truncation_word_ids + word_ids = [ + word_id for word_ids in data_sample.truncation_word_ids + for word_id in word_ids[1:-1] + ] + gt_label = data_sample.gt_label if \ + draw_gt and 'gt_label' in data_sample else None + pred_label = data_sample.pred_label if \ + draw_pred and 'pred_label' in data_sample else None + draw_img = self._draw_instances(image.copy(), bboxes, word_ids, + gt_label, pred_label) + cat_images.append(draw_img) + cat_images = self._cat_image(cat_images, axis=1) + if cat_images is None: + cat_images = image + if show: + self.show(cat_images, win_name=name, wait_time=wait_time) + else: + self.add_image(name, cat_images, step) + + if out_file is not None: + mmcv.imwrite(cat_images[..., ::-1], out_file) + + self.set_image(cat_images) + return self.get_image() From 40cfe65fcdf69fc43e94a2a498e12864db11c69b Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 1 May 2023 23:35:35 +0800 Subject: [PATCH 35/50] fix an error --- mmocr/datasets/transforms/__init__.py | 42 ++++++--------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/mmocr/datasets/transforms/__init__.py b/mmocr/datasets/transforms/__init__.py index e4131184b..61a15ec96 100644 --- a/mmocr/datasets/transforms/__init__.py +++ b/mmocr/datasets/transforms/__init__.py @@ -15,37 +15,13 @@ from .wrappers import ConditionApply, ImgAugWrapper, TorchVisionWrapper __all__ = [ - 'LoadOCRAnnotations', - 'RandomRotate', - 'ImgAugWrapper', - 'SourceImagePad', - 'TextDetRandomCropFlip', - 'PyramidRescale', - 'TorchVisionWrapper', - 'Resize', - 'RandomCrop', - 'TextDetRandomCrop', - 'RandomCrop', - 'PackTextDetInputs', - 'PackTextRecogInputs', - 'RescaleToHeight', - 'PadToWidth', - 'ShortScaleAspectJitter', - 'RandomFlip', - 'BoundedScaleAspectJitter', - 'PackKIEInputs', - 'LoadKIEAnnotations', - 'FixInvalidPolygon', - 'MMDet2MMOCR', - 'MMOCR2MMDet', - 'LoadImageFromFile', - 'LoadImageFromNDArray', - 'CropHeight', - 'InferencerLoader', - 'RemoveIgnored', - 'ConditionApply', - 'CropHeight', - 'TextRecogGeneralAug', - 'ImageContentJitter', - 'ReversePixels', + 'LoadOCRAnnotations', 'RandomRotate', 'ImgAugWrapper', 'SourceImagePad', + 'TextDetRandomCropFlip', 'PyramidRescale', 'TorchVisionWrapper', 'Resize', + 'RandomCrop', 'TextDetRandomCrop', 'RandomCrop', 'PackTextDetInputs', + 'PackTextRecogInputs', 'RescaleToHeight', 'PadToWidth', + 'ShortScaleAspectJitter', 'RandomFlip', 'BoundedScaleAspectJitter', + 'PackKIEInputs', 'LoadKIEAnnotations', 'FixInvalidPolygon', 'MMDet2MMOCR', + 'MMOCR2MMDet', 'LoadImageFromFile', 'LoadImageFromNDArray', 'CropHeight', + 'InferencerLoader', 'RemoveIgnored', 'ConditionApply', 'CropHeight', + 'TextRecogGeneralAug', 'ImageContentJitter', 'ReversePixels' ] From d1f43e76a9fc774665589afaaed949e633d47402 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 1 May 2023 23:58:18 +0800 Subject: [PATCH 36/50] =?UTF-8?q?=E5=B0=86ser=5Fdata=5Fsample=E7=A7=BB?= =?UTF-8?q?=E5=88=B0projects=E9=87=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mmocr/structures/__init__.py | 3 +-- mmocr/utils/__init__.py | 7 +++---- mmocr/utils/typing_utils.py | 4 +--- projects/LayoutLMv3/datasets/transforms/formatting.py | 2 +- projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py | 3 ++- projects/LayoutLMv3/models/ser_postprocessor.py | 2 +- projects/LayoutLMv3/structures/__init__.py | 3 +++ 
.../LayoutLMv3}/structures/ser_data_sample.py | 0 projects/LayoutLMv3/utils/typing_utils.py | 6 ++++++ projects/LayoutLMv3/visualization/ser_visualizer.py | 2 +- 10 files changed, 19 insertions(+), 13 deletions(-) create mode 100644 projects/LayoutLMv3/structures/__init__.py rename {mmocr => projects/LayoutLMv3}/structures/ser_data_sample.py (100%) create mode 100644 projects/LayoutLMv3/utils/typing_utils.py diff --git a/mmocr/structures/__init__.py b/mmocr/structures/__init__.py index 2d8b78857..2b71ac262 100644 --- a/mmocr/structures/__init__.py +++ b/mmocr/structures/__init__.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from .kie_data_sample import KIEDataSample -from .ser_data_sample import SERDataSample from .textdet_data_sample import TextDetDataSample from .textrecog_data_sample import TextRecogDataSample from .textspotting_data_sample import TextSpottingDataSample __all__ = [ 'TextDetDataSample', 'TextRecogDataSample', 'KIEDataSample', - 'TextSpottingDataSample', 'SERDataSample' + 'TextSpottingDataSample' ] diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index afdfee26a..3e4fb6fb2 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -28,9 +28,8 @@ LabelList, MultiConfig, OptConfigType, OptDetSampleList, OptInitConfigType, OptInstanceList, OptKIESampleList, OptLabelList, - OptMultiConfig, OptRecSampleList, OptSERSampleList, - OptTensor, RangeType, RecForwardResults, - RecSampleList, SERSampleList) + OptMultiConfig, OptRecSampleList, OptTensor, + RangeType, RecForwardResults, RecSampleList) __all__ = [ 'collect_env', 'is_3dlist', 'is_type_list', 'is_none_or_type', 'equal_len', @@ -51,5 +50,5 @@ 'is_archive', 'check_integrity', 'list_files', 'get_md5', 'InstanceList', 'LabelList', 'OptInstanceList', 'OptLabelList', 'RangeType', 'remove_pipeline_elements', 'bezier2poly', 'poly2bezier', - 'track_parallel_progress_multi_args', 'SERSampleList', 'OptSERSampleList' + 'track_parallel_progress_multi_args' ] diff --git a/mmocr/utils/typing_utils.py b/mmocr/utils/typing_utils.py index 45cbc649b..592fb36e7 100644 --- a/mmocr/utils/typing_utils.py +++ b/mmocr/utils/typing_utils.py @@ -9,7 +9,7 @@ from mmengine.structures import InstanceData, LabelData from mmocr import digit_version -from mmocr.structures import (KIEDataSample, SERDataSample, TextDetDataSample, +from mmocr.structures import (KIEDataSample, TextDetDataSample, TextRecogDataSample, TextSpottingDataSample) # Config @@ -29,11 +29,9 @@ RecSampleList = List[TextRecogDataSample] DetSampleList = List[TextDetDataSample] KIESampleList = List[KIEDataSample] -SERSampleList = List[SERDataSample] OptRecSampleList = Optional[RecSampleList] OptDetSampleList = Optional[DetSampleList] OptKIESampleList = Optional[KIESampleList] -OptSERSampleList = Optional[SERSampleList] OptE2ESampleList = Optional[E2ESampleList] OptTensor = Optional[torch.Tensor] diff --git a/projects/LayoutLMv3/datasets/transforms/formatting.py b/projects/LayoutLMv3/datasets/transforms/formatting.py index 0ceab6bb8..36b0f3ee0 100644 --- a/projects/LayoutLMv3/datasets/transforms/formatting.py +++ b/projects/LayoutLMv3/datasets/transforms/formatting.py @@ -6,7 +6,7 @@ from mmengine.structures import LabelData from mmocr.registry import TRANSFORMS -from mmocr.structures import SERDataSample +from projects.LayoutLMv3.structures import SERDataSample @TRANSFORMS.register_module() diff --git a/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py index e5dbcdce7..9c2d71e84 100644 
--- a/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py +++ b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py @@ -5,7 +5,8 @@ from mmengine.model import BaseModel from mmocr.registry import MODELS -from mmocr.utils.typing_utils import OptSERSampleList, SERSampleList +from projects.LayoutLMv3.utils.typing_utils import (OptSERSampleList, + SERSampleList) from transformers import LayoutLMv3ForTokenClassification from transformers.modeling_outputs import TokenClassifierOutput diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py index 947312b92..370173301 100644 --- a/projects/LayoutLMv3/models/ser_postprocessor.py +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -7,7 +7,7 @@ from mmengine.structures import LabelData from mmocr.registry import MODELS -from mmocr.structures import SERDataSample +from projects.LayoutLMv3.structures import SERDataSample from projects.LayoutLMv3.utils.bio_label_utils import \ find_other_label_name_of_biolabel diff --git a/projects/LayoutLMv3/structures/__init__.py b/projects/LayoutLMv3/structures/__init__.py new file mode 100644 index 000000000..729b26f57 --- /dev/null +++ b/projects/LayoutLMv3/structures/__init__.py @@ -0,0 +1,3 @@ +from .ser_data_sample import SERDataSample + +__all__ = ['SERDataSample'] diff --git a/mmocr/structures/ser_data_sample.py b/projects/LayoutLMv3/structures/ser_data_sample.py similarity index 100% rename from mmocr/structures/ser_data_sample.py rename to projects/LayoutLMv3/structures/ser_data_sample.py diff --git a/projects/LayoutLMv3/utils/typing_utils.py b/projects/LayoutLMv3/utils/typing_utils.py new file mode 100644 index 000000000..fa555e74c --- /dev/null +++ b/projects/LayoutLMv3/utils/typing_utils.py @@ -0,0 +1,6 @@ +from typing import List, Optional + +from projects.LayoutLMv3.structures import SERDataSample + +SERSampleList = List[SERDataSample] +OptSERSampleList = Optional[SERSampleList] diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py index d22855001..6a26d27b5 100644 --- a/projects/LayoutLMv3/visualization/ser_visualizer.py +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -8,8 +8,8 @@ from mmengine.structures import LabelData from mmocr.registry import VISUALIZERS -from mmocr.structures import SERDataSample from mmocr.visualization.base_visualizer import BaseLocalVisualizer +from projects.LayoutLMv3.structures import SERDataSample @VISUALIZERS.register_module() From 50fa7f925fdd198e95dcb3307940fd29fd0df176 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 8 May 2023 11:08:31 +0800 Subject: [PATCH 37/50] =?UTF-8?q?=E8=A7=84=E8=8C=83xfund=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E5=87=86=E5=A4=87=E8=84=9A=E6=9C=AC=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/scripts/prepare_dataset.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/LayoutLMv3/scripts/prepare_dataset.sh b/projects/LayoutLMv3/scripts/prepare_dataset.sh index e385e2abe..6c74a9e45 100644 --- a/projects/LayoutLMv3/scripts/prepare_dataset.sh +++ b/projects/LayoutLMv3/scripts/prepare_dataset.sh @@ -2,6 +2,7 @@ PROJ_ROOT=$(pwd) DATASET_ZOO_PATH=${PROJ_ROOT}/dataset_zoo NPROC=8 TASKS=('ser' 're') +SPLITS=('train' 'test') # DATASET_NAME=('xfund/de' 'xfund/es' 'xfund/fr' 'xfund/jt' 'xfund/ja' 'xfund/pt' 'xfund/zh') DATASET_NAME=('xfund/zh') @@ -11,7 +12,7 @@ do ${DATASET_NAME[@]} \ --nproc 
${NPROC} \ --task ${TASK} \ - --splits train test \ - --overwrite-cfg \ - --dataset-zoo-path ${DATASET_ZOO_PATH} + --splits ${SPLITS[@]} \ + --dataset-zoo-path ${DATASET_ZOO_PATH} \ + --overwrite-cfg done From a04cd518a37e26047e8a4f2054370a934d433643 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 8 May 2023 13:10:06 +0800 Subject: [PATCH 38/50] =?UTF-8?q?[Fix]=E8=A7=A3=E5=86=B3=E6=8E=A8=E7=90=86?= =?UTF-8?q?=E6=97=B6=E5=AD=98=E5=9C=A8=E7=9A=84=E4=B8=80=E4=B8=AAbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/datasets/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/projects/LayoutLMv3/datasets/utils.py b/projects/LayoutLMv3/datasets/utils.py index 55775b70e..68fb5693a 100644 --- a/projects/LayoutLMv3/datasets/utils.py +++ b/projects/LayoutLMv3/datasets/utils.py @@ -16,7 +16,7 @@ def long_text_data_collate(data_batch: Sequence, training: bool = True) -> Any: return data_batch elif isinstance(data_item, tuple) and hasattr(data_item, '_fields'): # named_tuple - return data_item_type(*(long_text_data_collate(samples) + return data_item_type(*(long_text_data_collate(samples, training) for samples in zip(*data_batch))) elif isinstance(data_item, list): flattened_data_batch = [ @@ -37,22 +37,26 @@ def long_text_data_collate(data_batch: Sequence, training: bool = True) -> Any: transposed = list(zip(*data_batch)) if isinstance(data_item, tuple): - return [long_text_data_collate(samples) - for samples in transposed] # Compat with Pytorch. + return [ + long_text_data_collate(samples, training) + for samples in transposed + ] # Compat with Pytorch. else: try: return data_item_type([ - long_text_data_collate(samples) for samples in transposed + long_text_data_collate(samples, training) + for samples in transposed ]) except TypeError: # The sequence type may not support `__init__(iterable)` # (e.g., `range`). 
return [ - long_text_data_collate(samples) for samples in transposed + long_text_data_collate(samples, training) + for samples in transposed ] elif isinstance(data_item, Mapping): return data_item_type({ - key: long_text_data_collate([d[key] for d in data_batch]) + key: long_text_data_collate([d[key] for d in data_batch], training) for key in data_item }) else: From 81b8f869554ed37822eec52744d89751bc83c6ca Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 8 May 2023 13:11:27 +0800 Subject: [PATCH 39/50] =?UTF-8?q?=E4=BD=BF=E7=94=A8custom=5Fimports?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E8=87=AA=E5=AE=9A=E4=B9=89=E6=A8=A1=E5=9D=97?= =?UTF-8?q?=E7=9A=84=E5=AF=BC=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/__init__.py | 4 - .../ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py | 8 ++ projects/LayoutLMv3/scripts/run_ser.sh | 2 +- projects/LayoutLMv3/tools/train.py | 116 ------------------ 4 files changed, 9 insertions(+), 121 deletions(-) delete mode 100644 projects/LayoutLMv3/__init__.py delete mode 100755 projects/LayoutLMv3/tools/train.py diff --git a/projects/LayoutLMv3/__init__.py b/projects/LayoutLMv3/__init__.py deleted file mode 100644 index 8c347c22d..000000000 --- a/projects/LayoutLMv3/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .datasets import * # NOQA -from .evaluation import * # NOQA -from .models import * # NOQA -from .visualization import * # NOQA diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py index a9fc3f896..66e0047be 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py @@ -136,3 +136,11 @@ draw_pred=True), ) # ==================================================================== +# ========================= Custom imports =========================== +custom_imports = dict( + imports=[ + 'projects.LayoutLMv3.datasets', 'projects.LayoutLMv3.evaluation', + 'projects.LayoutLMv3.models', 'projects.LayoutLMv3.visualization' + ], + allow_failed_imports=False) +# ==================================================================== diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh index 2b1135ccb..f5d75ca84 100644 --- a/projects/LayoutLMv3/scripts/run_ser.sh +++ b/projects/LayoutLMv3/scripts/run_ser.sh @@ -4,5 +4,5 @@ export TOKENIZERS_PARALLELISM=false export OMP_NUM_THREADS=1 export PYTHONPATH='/Users/wangnu/Documents/GitHub/mmocr' -python ./tools/train.py \ +python tools/train.py \ ${config} \ diff --git a/projects/LayoutLMv3/tools/train.py b/projects/LayoutLMv3/tools/train.py deleted file mode 100755 index 79c3888bb..000000000 --- a/projects/LayoutLMv3/tools/train.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
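With the project-level `__init__.py` and the bundled `tools/train.py` gone, module registration relies entirely on the `custom_imports` block added to the config, so the stock MMOCR `tools/train.py` works as long as the repository root is on PYTHONPATH (which `run_ser.sh` exports). Roughly, `allow_failed_imports=False` amounts to this eager check (illustrative sketch only):

import importlib

for mod in ('projects.LayoutLMv3.datasets', 'projects.LayoutLMv3.evaluation',
            'projects.LayoutLMv3.models', 'projects.LayoutLMv3.visualization'):
    # fail immediately with ImportError if the repo root is missing from
    # PYTHONPATH, rather than surfacing later as a registry lookup error
    importlib.import_module(mod)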
-import argparse -import logging -import os -import os.path as osp - -from mmengine.config import Config, DictAction -from mmengine.logging import print_log -from mmengine.registry import RUNNERS -from mmengine.runner import Runner - -from projects.LayoutLMv3 import * # NOQA - - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a model') - parser.add_argument('config', help='Train config file path') - parser.add_argument('--work-dir', help='The dir to save logs and models') - parser.add_argument( - '--resume', action='store_true', help='Whether to resume checkpoint.') - parser.add_argument( - '--amp', - action='store_true', - default=False, - help='Enable automatic-mixed-precision training') - parser.add_argument( - '--auto-scale-lr', - action='store_true', - help='Whether to scale the learning rate automatically. It requires ' - '`auto_scale_lr` in config, and `base_batch_size` in `auto_scale_lr`') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='Override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') - parser.add_argument( - '--launcher', - choices=['none', 'pytorch', 'slurm', 'mpi'], - default='none', - help='Job launcher') - # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` - # will pass the `--local-rank` parameter to `tools/train.py` instead - # of `--local_rank`. - parser.add_argument('--local_rank', '--local-rank', type=int, default=0) - args = parser.parse_args() - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(args.local_rank) - - return args - - -def main(): - args = parse_args() - - # load config - cfg = Config.fromfile(args.config) - cfg.launcher = args.launcher - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - - # work_dir is determined in this priority: CLI > segment in file > filename - if args.work_dir is not None: - # update configs according to CLI args if args.work_dir is not None - cfg.work_dir = args.work_dir - elif cfg.get('work_dir', None) is None: - # use config filename as default work_dir if cfg.work_dir is None - cfg.work_dir = osp.join('./work_dirs', - osp.splitext(osp.basename(args.config))[0]) - # enable automatic-mixed-precision training - if args.amp: - optim_wrapper = cfg.optim_wrapper.type - if optim_wrapper == 'AmpOptimWrapper': - print_log( - 'AMP training is already enabled in your config.', - logger='current', - level=logging.WARNING) - else: - assert optim_wrapper == 'OptimWrapper', ( - '`--amp` is only supported when the optimizer wrapper type is ' - f'`OptimWrapper` but got {optim_wrapper}.') - cfg.optim_wrapper.type = 'AmpOptimWrapper' - cfg.optim_wrapper.loss_scale = 'dynamic' - - if args.resume: - cfg.resume = True - - # enable automatically scaling LR - if args.auto_scale_lr: - if 'auto_scale_lr' in cfg and \ - 'base_batch_size' in cfg.auto_scale_lr: - cfg.auto_scale_lr.enable = True - else: - raise RuntimeError('Can not find "auto_scale_lr" or ' - '"auto_scale_lr.base_batch_size" in your' - ' configuration file.') - - # build the runner from config - if 'runner_type' not in cfg: - # build the default runner - runner = Runner.from_cfg(cfg) - else: - # build customized runner from the registry - # if 'runner_type' 
is set in the cfg - runner = RUNNERS.build(cfg) - - # start training - runner.train() - - -if __name__ == '__main__': - main() From 059e203d9c721868b0a64529ad5f5555215cc3ce Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 8 May 2023 13:12:32 +0800 Subject: [PATCH 40/50] =?UTF-8?q?=E4=BC=98=E5=8C=96SER=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E5=8F=AF=E8=A7=86=E5=8C=96=E6=95=88=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../visualization/ser_visualizer.py | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py index 6a26d27b5..e2e2834df 100644 --- a/projects/LayoutLMv3/visualization/ser_visualizer.py +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -20,8 +20,8 @@ class SERLocalVisualizer(BaseLocalVisualizer): name (str): Name of the instance. Defaults to 'visualizer'. image (np.ndarray, optional): The origin image to draw. The format should be RGB. Defaults to None. - with_poly (bool): Whether to draw polygons. Defaults to True. - with_bbox (bool): Whether to draw bboxes. Defaults to False. + with_poly (bool): Whether to draw polygons. Defaults to False. + with_bbox (bool): Whether to draw bboxes. Defaults to True. vis_backends (list, optional): Visual backend config list. Defaults to None. save_dir (str, optional): Save file dir for all storage backends. @@ -45,8 +45,8 @@ class SERLocalVisualizer(BaseLocalVisualizer): def __init__(self, name: str = 'visualizer', image: Optional[np.ndarray] = None, - with_poly: bool = True, - with_bbox: bool = False, + with_poly: bool = False, + with_bbox: bool = True, vis_backends: Optional[Dict] = None, save_dir: Optional[str] = None, bbox_color: Union[str, Tuple, List[str], List[Tuple]] = 'b', @@ -67,19 +67,21 @@ def __init__(self, def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, torch.Tensor], - word_ids: List[int], gt_labels: Optional[LabelData], + word_ids: Optional[List[int]], + gt_labels: Optional[LabelData], pred_labels: Optional[LabelData]) -> np.ndarray: """Draw bboxes and polygons on image. Args: image (np.ndarray): The origin image to draw. bboxes (Union[np.ndarray, torch.Tensor]): The bboxes to draw. - word_ids (List[int]): The word id of tokens. + word_ids (Optional[List[int]]): The word id of tokens. gt_labels (Optional[LabelData]): The gt LabelData. pred_labels (Optional[LabelData]): The pred LabelData. Returns: np.ndarray: The image with bboxes and gt/pred labels drawn. 
""" + self.set_image(image) # draw bboxes if bboxes is not None and self.with_bbox: image = self.get_bboxes_image( @@ -123,9 +125,8 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, areas = (bboxes[:, 3] - bboxes[:, 1]) * ( bboxes[:, 2] - bboxes[:, 0]) scales = _get_adaptive_scales(areas) - positions = bboxes[:, :2] - self.line_width + positions = (bboxes[:, :2] + bboxes[:, 2:]) // 2 - self.set_image(image) for i, (pos, gt, pred) in enumerate( zip(positions, gt_words_label, pred_words_label)): if pred_words_label_score is not None: @@ -137,8 +138,10 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, self.draw_texts( label_text, pos, - color=self.label_color if gt == pred else 'r', - font_sizes=int(13 * scales[i])) + colors=self.label_color if gt == pred else 'r', + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') return self.get_image() @@ -193,9 +196,22 @@ def add_datasample(self, draw_gt and 'gt_label' in data_sample else None pred_label = data_sample.pred_label if \ draw_pred and 'pred_label' in data_sample else None - draw_img = self._draw_instances(image.copy(), bboxes, word_ids, - gt_label, pred_label) - cat_images.append(draw_img) + # draw original image with bboxes + orig_img_with_bboxes = self._draw_instances( + image=image.copy(), + bboxes=bboxes, + word_ids=None, + gt_labels=None, + pred_labels=None) + cat_images.append(orig_img_with_bboxes) + empty_img = np.full_like(image, 255) + empty_img_with_label = self._draw_instances( + image=empty_img, + bboxes=bboxes, + word_ids=word_ids, + gt_labels=gt_label, + pred_labels=pred_label) + cat_images.append(empty_img_with_label) cat_images = self._cat_image(cat_images, axis=1) if cat_images is None: cat_images = image From f0a03acadc918dd911d21d8a475e9d638b498cf8 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 25 May 2023 12:41:41 +0800 Subject: [PATCH 41/50] =?UTF-8?q?=E8=A7=84=E8=8C=83=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=91=BD=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...xfund_zh_1gpu_bs8.py => layoutlmv3_1k_xfund_zh_1xbs8.py} | 6 +++--- projects/LayoutLMv3/scripts/run_ser.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename projects/LayoutLMv3/configs/ser/{layoutlmv3_1k_xfund_zh_1gpu_bs8.py => layoutlmv3_1k_xfund_zh_1xbs8.py} (96%) diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py similarity index 96% rename from projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py rename to projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py index 66e0047be..76cacde9b 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py @@ -13,7 +13,7 @@ lr = 7e-5 train_batch_size_per_gpu = 2 train_num_workers = 8 -test_batch_size_per_gpu = 1 # can't batch inference now +test_batch_size_per_gpu = 1 # can't batch infer now test_num_workers = 8 only_label_first_subword = True # select label process strategy # ==================================================================== @@ -89,7 +89,7 @@ pin_memory=True, persistent_workers=True, sampler=dict(type='InfiniteSampler', shuffle=True), - collate_fn=dict(type='long_text_data_collate', training=True), + collate_fn=dict(type='ser_collate', training=True), dataset=train_dataset) val_dataloader = dict( 
batch_size=test_batch_size_per_gpu, @@ -97,7 +97,7 @@ pin_memory=True, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), - collate_fn=dict(type='long_text_data_collate', training=False), + collate_fn=dict(type='ser_collate', training=False), dataset=test_dataset) test_dataloader = val_dataloader # ==================================================================== diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh index f5d75ca84..12b0f8271 100644 --- a/projects/LayoutLMv3/scripts/run_ser.sh +++ b/projects/LayoutLMv3/scripts/run_ser.sh @@ -1,4 +1,4 @@ -config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1gpu_bs8.py' +config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py' export TOKENIZERS_PARALLELISM=false export OMP_NUM_THREADS=1 From d9a3a5e8f68a7a6cd42af781c6e5cc9fdf7e0bd5 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 25 May 2023 12:44:47 +0800 Subject: [PATCH 42/50] =?UTF-8?q?=E5=8C=96=E7=B9=81=E4=B8=BA=E7=AE=80?= =?UTF-8?q?=EF=BC=8C=E4=BC=98=E5=8C=96=E4=B9=8B=E5=89=8D=E5=9F=BA=E4=BA=8E?= =?UTF-8?q?default=5Fcollate=E7=9A=84long=5Ftext=5Fdata=5Fcollate=E4=B8=BA?= =?UTF-8?q?=E6=9B=B4=E6=98=8E=E7=A1=AE=E6=98=93=E7=90=86=E8=A7=A3=E7=9A=84?= =?UTF-8?q?ser=5Fcollate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/datasets/__init__.py | 4 +- projects/LayoutLMv3/datasets/utils.py | 115 +++++++++++------------ 2 files changed, 58 insertions(+), 61 deletions(-) diff --git a/projects/LayoutLMv3/datasets/__init__.py b/projects/LayoutLMv3/datasets/__init__.py index 691c8198a..b218a79fc 100644 --- a/projects/LayoutLMv3/datasets/__init__.py +++ b/projects/LayoutLMv3/datasets/__init__.py @@ -1,5 +1,5 @@ from .transforms import * # NOQA -from .utils import long_text_data_collate +from .utils import ser_collate from .xfund_dataset import XFUNDDataset -__all__ = ['XFUNDDataset', 'long_text_data_collate'] +__all__ = ['XFUNDDataset', 'ser_collate'] diff --git a/projects/LayoutLMv3/datasets/utils.py b/projects/LayoutLMv3/datasets/utils.py index 68fb5693a..9526a1994 100644 --- a/projects/LayoutLMv3/datasets/utils.py +++ b/projects/LayoutLMv3/datasets/utils.py @@ -1,67 +1,64 @@ -from typing import Any, Mapping, Sequence +from typing import Dict, Sequence import torch from mmengine.dataset.utils import COLLATE_FUNCTIONS -from mmengine.structures import BaseDataElement @COLLATE_FUNCTIONS.register_module() -def long_text_data_collate(data_batch: Sequence, training: bool = True) -> Any: - """This code is referenced from - ``mmengine.dataset.utils.default_collate``""" - data_item = data_batch[0] - data_item_type = type(data_item) +def ser_collate(data_batch: Sequence, training: bool = True) -> Dict: + """A collate function designed for SER. 
- if isinstance(data_item, (BaseDataElement, str, bytes)): - return data_batch - elif isinstance(data_item, tuple) and hasattr(data_item, '_fields'): - # named_tuple - return data_item_type(*(long_text_data_collate(samples, training) - for samples in zip(*data_batch))) - elif isinstance(data_item, list): - flattened_data_batch = [ - sub_item for item in data_batch for sub_item in item - ] - if training: - return flattened_data_batch[:len(data_batch)] - else: - return flattened_data_batch - elif isinstance(data_item, Sequence): - # check to make sure that the data_itements in batch have - # consistent size - it = iter(data_batch) - data_item_size = len(next(it)) - if not all(len(data_item) == data_item_size for data_item in it): - raise RuntimeError( - 'each data_itement in list of batch should be of equal size') - transposed = list(zip(*data_batch)) + Args: + data_batch (Sequence): Data sampled from dataset. + Like: + [ + { + 'inputs': {'input_ids': ..., 'bbox': ..., ...}, + 'data_samples': ['SERDataSample_1'] + }, + { + 'inputs': {'input_ids': ..., 'bbox': ..., ...}, + 'data_samples': ['SERDataSample_1', 'SERDataSample_2', ...] + }, + ... + ] + training (bool): whether training process or not. - if isinstance(data_item, tuple): - return [ - long_text_data_collate(samples, training) - for samples in transposed - ] # Compat with Pytorch. - else: - try: - return data_item_type([ - long_text_data_collate(samples, training) - for samples in transposed - ]) - except TypeError: - # The sequence type may not support `__init__(iterable)` - # (e.g., `range`). - return [ - long_text_data_collate(samples, training) - for samples in transposed - ] - elif isinstance(data_item, Mapping): - return data_item_type({ - key: long_text_data_collate([d[key] for d in data_batch], training) - for key in data_item - }) - else: - concat_data_batch = torch.concat(data_batch, dim=0) - if training: - return concat_data_batch[:len(data_batch)] - else: - return concat_data_batch + Note: + Different from ``default_collate`` in pytorch or in mmengine, + ``ser_collate`` can accept `inputs` tensor and `data_samples` + list with the different shape. + + Returns: + transposed (Dict): A dict have two elements, + the first element `inputs` is a dict + the second element `data_samples` is a list + """ + batch_size = len(data_batch) + # transpose `inputs`, which is a dict. + batch_inputs = [data_item['inputs'] for data_item in data_batch] + batch_inputs_item = batch_inputs[0] + transposed_batch_inputs = {} + for key in batch_inputs_item: + concat_value = torch.concat([d[key] for d in batch_inputs], dim=0) + # TODO: because long text will be truncated, the concat_value + # cannot be sliced directly when training=False. + # How to support batch inference? + transposed_batch_inputs[key] = concat_value[:batch_size] \ + if training else concat_value + # transpose `data_samples`, which is a list. + batch_data_samples = [ + data_item['data_samples'] for data_item in data_batch + ] + flattened = [sub_item for item in batch_data_samples for sub_item in item] + # TODO: because long text will be truncated, the concat_value + # cannot be sliced directly when training=False. + # How to support batch inference? 
+ transposed_batch_data_samples = flattened[:batch_size] \ + if training else flattened + + transposed = { + 'inputs': transposed_batch_inputs, + 'data_samples': transposed_batch_data_samples + } + return transposed From b04e126acdaa96beb8ac3afec7e3bf16fb9d6e15 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 25 May 2023 12:52:22 +0800 Subject: [PATCH 43/50] =?UTF-8?q?=E9=92=88=E5=AF=B9inference=E9=98=B6?= =?UTF-8?q?=E6=AE=B5=E6=B2=A1=E6=9C=89gt=5Flabel=E7=9A=84=E6=83=85?= =?UTF-8?q?=E5=86=B5=E9=92=88=E5=AF=B9=E6=80=A7=E4=BF=AE=E5=A4=8Dser=5Fpos?= =?UTF-8?q?tprocessor=E4=BB=A5=E5=8F=8Aser=5Fvisualizer=E4=B8=AD=E5=AD=98?= =?UTF-8?q?=E5=9C=A8=E7=9A=84bug.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datasets/transforms/formatting.py | 13 ++-- .../LayoutLMv3/models/ser_postprocessor.py | 65 ++++++++++------- .../visualization/ser_visualizer.py | 70 ++++++++++++------- 3 files changed, 92 insertions(+), 56 deletions(-) diff --git a/projects/LayoutLMv3/datasets/transforms/formatting.py b/projects/LayoutLMv3/datasets/transforms/formatting.py index 36b0f3ee0..7dd4911f6 100644 --- a/projects/LayoutLMv3/datasets/transforms/formatting.py +++ b/projects/LayoutLMv3/datasets/transforms/formatting.py @@ -98,8 +98,7 @@ def transform(self, results: dict) -> dict: for key in self.ser_keys: if key not in results: continue - value = to_tensor(results[key]) - inputs[key] = value + inputs[key] = to_tensor(results[key]) packed_results['inputs'] = inputs # pack `data_samples` @@ -107,13 +106,15 @@ def transform(self, results: dict) -> dict: for truncation_idx in range(truncation_number): data_sample = SERDataSample() gt_label = LabelData() - assert 'labels' in results, 'key `labels` not in results.' - value = to_tensor(results['labels'][truncation_idx]) - gt_label.item = value + if results.get('labels', None): + gt_label.item = to_tensor(results['labels'][truncation_idx]) data_sample.gt_label = gt_label meta = {} for key in self.meta_keys: - meta[key] = results[key] + if key == 'truncation_word_ids': + meta[key] = results[key][truncation_idx] + else: + meta[key] = results[key] data_sample.set_metainfo(meta) data_samples.append(data_sample) packed_results['data_samples'] = data_samples diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py index 370173301..a70c2ae82 100644 --- a/projects/LayoutLMv3/models/ser_postprocessor.py +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -16,13 +16,10 @@ class SERPostprocessor(nn.Module): """PostProcessor for SER.""" - def __init__(self, - classes: Union[tuple, list], - ignore_index: int = -100) -> None: + def __init__(self, classes: Union[tuple, list]) -> None: super().__init__() self.other_label_name = find_other_label_name_of_biolabel(classes) self.id2biolabel = self._generate_id2biolabel_map(classes) - self.ignore_index = ignore_index self.softmax = nn.Softmax(dim=-1) def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: @@ -43,42 +40,62 @@ def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: def __call__(self, outputs: torch.Tensor, data_samples: Sequence[SERDataSample] ) -> Sequence[SERDataSample]: + # merge several truncation data_sample to one data_sample + assert all('truncation_word_ids' in d for d in data_samples), \ + 'The key `truncation_word_ids` should be specified' \ + 'in PackSERInputs.' 
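+        # Each long sample may have been split into several truncation
+        # chunks; collect the word_ids of every chunk so that their
+        # token-level outputs can be merged back into a single data_sample.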
+ truncation_word_ids = [] + for data_sample in data_samples: + truncation_word_ids.append(data_sample.pop('truncation_word_ids')) + merged_data_sample = copy.deepcopy(data_samples[0]) + merged_data_sample.set_metainfo( + dict(truncation_word_ids=truncation_word_ids)) + flattened_word_ids = [ + word_id for word_ids in truncation_word_ids for word_id in word_ids + ] + # convert outputs dim from (truncation_num, max_length, label_num) # to (truncation_num * max_length, label_num) outputs = outputs.cpu().detach() - truncation_num = outputs.size(0) outputs = torch.reshape(outputs, (-1, outputs.size(-1))) - # merge gt label ids from data_samples - gt_label_ids = [ - data_samples[truncation_idx].gt_label.item - for truncation_idx in range(truncation_num) - ] - gt_label_ids = torch.cat(gt_label_ids, dim=0).cpu().detach().numpy() # get pred label ids/scores from outputs probs = self.softmax(outputs) max_value, max_idx = torch.max(probs, -1) pred_label_ids = max_idx.numpy() pred_label_scores = max_value.numpy() - # select valid token and convert iid to biolabel - gt_biolabels = [ - self.id2biolabel[g] for (g, p) in zip(gt_label_ids, pred_label_ids) - if g != self.ignore_index - ] + + # determine whether it is an inference process + if 'item' in data_samples[0].gt_label: + # merge gt label ids from data_samples + gt_label_ids = [ + data_sample.gt_label.item for data_sample in data_samples + ] + gt_label_ids = torch.cat( + gt_label_ids, dim=0).cpu().detach().numpy() + gt_biolabels = [ + self.id2biolabel[g] + for (w, g) in zip(flattened_word_ids, gt_label_ids) + if w is not None + ] + # update merged gt_label + merged_data_sample.gt_label.item = gt_biolabels + + # inference process do not have item in gt_label, + # so select valid token with flattened_word_ids + # rather than with gt_label_ids like official code. 
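+        # A word_id of None marks special tokens (sequence start/end tokens
+        # and padding), so only positions with a real word_id are kept below.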
pred_biolabels = [ - self.id2biolabel[p] for (g, p) in zip(gt_label_ids, pred_label_ids) - if g != self.ignore_index + self.id2biolabel[p] + for (w, p) in zip(flattened_word_ids, pred_label_ids) + if w is not None ] pred_biolabel_scores = [ - s for (g, s) in zip(gt_label_ids, pred_label_scores) - if g != self.ignore_index + s for (w, s) in zip(flattened_word_ids, pred_label_scores) + if w is not None ] # record pred_label pred_label = LabelData() pred_label.item = pred_biolabels pred_label.score = pred_biolabel_scores - # merge several truncation data_sample to one data_sample - merged_data_sample = copy.deepcopy(data_samples[0]) merged_data_sample.pred_label = pred_label - # update merged gt_label - merged_data_sample.gt_label.item = gt_biolabels + return [merged_data_sample] diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py index e2e2834df..f0cdc3707 100644 --- a/projects/LayoutLMv3/visualization/ser_visualizer.py +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -91,19 +91,13 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, line_width=self.line_width, alpha=self.alpha) - # draw gt/pred labels - if gt_labels is not None and pred_labels is not None: + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + positions = (bboxes[:, :2] + bboxes[:, 2:]) // 2 + + if gt_labels is not None: gt_tokens_biolabel = gt_labels.item gt_words_label = [] - pred_tokens_biolabel = pred_labels.item - pred_words_label = [] - - if 'score' in pred_labels: - pred_tokens_biolabel_score = pred_labels.score - pred_words_label_score = [] - else: - pred_tokens_biolabel_score = None - pred_words_label_score = None pre_word_id = None for idx, cur_word_id in enumerate(word_ids): @@ -112,29 +106,32 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, gt_words_label_name = gt_tokens_biolabel[idx][2:] \ if gt_tokens_biolabel[idx] != 'O' else 'other' gt_words_label.append(gt_words_label_name) + pre_word_id = cur_word_id + assert len(gt_words_label) == len(bboxes) + if pred_labels is not None: + pred_tokens_biolabel = pred_labels.item + pred_words_label = [] + pred_tokens_biolabel_score = pred_labels.score + pred_words_label_score = [] + + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: pred_words_label_name = pred_tokens_biolabel[idx][2:] \ if pred_tokens_biolabel[idx] != 'O' else 'other' pred_words_label.append(pred_words_label_name) - if pred_tokens_biolabel_score is not None: - pred_words_label_score.append( - pred_tokens_biolabel_score[idx]) + pred_words_label_score.append( + pred_tokens_biolabel_score[idx]) pre_word_id = cur_word_id - assert len(gt_words_label) == len(bboxes) assert len(pred_words_label) == len(bboxes) - areas = (bboxes[:, 3] - bboxes[:, 1]) * ( - bboxes[:, 2] - bboxes[:, 0]) - scales = _get_adaptive_scales(areas) - positions = (bboxes[:, :2] + bboxes[:, 2:]) // 2 - + # draw gt or pred labels + if gt_labels is not None and pred_labels is not None: for i, (pos, gt, pred) in enumerate( zip(positions, gt_words_label, pred_words_label)): - if pred_words_label_score is not None: - score = round(float(pred_words_label_score[i]) * 100, 1) - label_text = f'{gt} | {pred}({score})' - else: - label_text = f'{gt} | {pred}' - + score = round(float(pred_words_label_score[i]) * 100, 1) + label_text = f'{gt} | {pred}({score})' self.draw_texts( label_text, pos, 
@@ -142,6 +139,27 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, font_sizes=int(13 * scales[i]), vertical_alignments='center', horizontal_alignments='center') + elif pred_labels is not None: + for i, (pos, pred) in enumerate(zip(positions, pred_words_label)): + score = round(float(pred_words_label_score[i]) * 100, 1) + label_text = f'Pred: {pred}({score})' + self.draw_texts( + label_text, + pos, + colors=self.label_color, + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') + elif gt_labels is not None: + for i, (pos, gt) in enumerate(zip(positions, gt_words_label)): + label_text = f'GT: {gt}' + self.draw_texts( + label_text, + pos, + colors=self.label_color, + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') return self.get_image() From b6f55f88a667565085994a0b684d3d2ef4b993b3 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 29 May 2023 15:05:48 +0800 Subject: [PATCH 44/50] =?UTF-8?q?=E4=BC=98=E5=8C=96ser=5Fpostprocessor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ser/layoutlmv3_1k_xfund_zh_1xbs8.py | 5 +- .../transforms/layoutlmv3_transforms.py | 4 +- .../LayoutLMv3/models/ser_postprocessor.py | 116 ++++++++++++------ .../visualization/ser_visualizer.py | 51 +++----- 4 files changed, 99 insertions(+), 77 deletions(-) diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py index 76cacde9b..304156810 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py @@ -108,7 +108,10 @@ pretrained_model_name_or_path=hf_pretrained_model, num_labels=len(class_name) * 2 - 1), loss_processor=dict(type='ComputeLossAfterLabelSmooth'), - postprocessor=dict(type='SERPostprocessor', classes=class_name)) + postprocessor=dict( + type='SERPostprocessor', + classes=class_name, + only_label_first_subword=only_label_first_subword)) # ==================================================================== # ========================= Evaluation =============================== val_evaluator = dict(type='SeqevalMetric', prefix=dataset_name) diff --git a/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py index 2bed95708..02684a2b0 100644 --- a/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py +++ b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py @@ -220,10 +220,12 @@ class ConvertBIOLabelForSER(BaseTransform): def __init__(self, classes: Union[tuple, list], - only_label_first_subword: bool = False) -> None: + only_label_first_subword: bool = True) -> None: super().__init__() self.other_label_name = find_other_label_name_of_biolabel(classes) self.biolabel2id = self._generate_biolabel2id_map(classes) + assert only_label_first_subword is True, \ + 'Only support `only_label_first_subword=True` now.' 
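+        # With `only_label_first_subword=True`, only the first sub-word token
+        # of each word is expected to carry the word's BIO label; the SER
+        # postprocessor keeps the first label of every word accordingly.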
self.only_label_first_subword = only_label_first_subword def _generate_biolabel2id_map(self, classes: Union[tuple, list]) -> Dict: diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py index a70c2ae82..8bae6c829 100644 --- a/projects/LayoutLMv3/models/ser_postprocessor.py +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -16,10 +16,15 @@ class SERPostprocessor(nn.Module): """PostProcessor for SER.""" - def __init__(self, classes: Union[tuple, list]) -> None: + def __init__(self, + classes: Union[tuple, list], + only_label_first_subword: bool = True) -> None: super().__init__() self.other_label_name = find_other_label_name_of_biolabel(classes) self.id2biolabel = self._generate_id2biolabel_map(classes) + assert only_label_first_subword is True, \ + 'Only support `only_label_first_subword=True` now.' + self.only_label_first_subword = only_label_first_subword self.softmax = nn.Softmax(dim=-1) def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: @@ -40,62 +45,95 @@ def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: def __call__(self, outputs: torch.Tensor, data_samples: Sequence[SERDataSample] ) -> Sequence[SERDataSample]: - # merge several truncation data_sample to one data_sample assert all('truncation_word_ids' in d for d in data_samples), \ 'The key `truncation_word_ids` should be specified' \ 'in PackSERInputs.' - truncation_word_ids = [] - for data_sample in data_samples: - truncation_word_ids.append(data_sample.pop('truncation_word_ids')) - merged_data_sample = copy.deepcopy(data_samples[0]) - merged_data_sample.set_metainfo( - dict(truncation_word_ids=truncation_word_ids)) - flattened_word_ids = [ - word_id for word_ids in truncation_word_ids for word_id in word_ids + truncation_word_ids = [ + data_sample.pop('truncation_word_ids') + for data_sample in data_samples + ] + word_ids = [ + word_id for word_ids in truncation_word_ids + for word_id in word_ids[1:-1] ] + # merge several truncation data_sample to one data_sample + merged_data_sample = copy.deepcopy(data_samples[0]) + # convert outputs dim from (truncation_num, max_length, label_num) # to (truncation_num * max_length, label_num) outputs = outputs.cpu().detach() - outputs = torch.reshape(outputs, (-1, outputs.size(-1))) + outputs = torch.reshape(outputs[:, 1:-1, :], (-1, outputs.size(-1))) # get pred label ids/scores from outputs probs = self.softmax(outputs) max_value, max_idx = torch.max(probs, -1) - pred_label_ids = max_idx.numpy() - pred_label_scores = max_value.numpy() + pred_label_ids = max_idx.numpy().tolist() + pred_label_scores = max_value.numpy().tolist() + + # inference process do not have item in gt_label, + # so select valid token with word_ids rather than + # with gt_label_ids like official code. 
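+        # Group token-level (label, score) pairs by word: tokens sharing a
+        # word_id belong to the same word, and a word_id of None ends the
+        # valid sequence.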
+ pred_words_biolabels = [] + word_biolabels = [] + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: + if word_biolabels: + pred_words_biolabels.append(word_biolabels) + word_biolabels = [] + word_biolabels.append((self.id2biolabel[pred_label_ids[idx]], + pred_label_scores[idx])) + else: + pred_words_biolabels.append(word_biolabels) + break + pre_word_id = cur_word_id + # record pred_label + if self.only_label_first_subword: + pred_label = LabelData() + pred_label.item = [ + pred_word_biolabels[0][0] + for pred_word_biolabels in pred_words_biolabels + ] + pred_label.score = [ + pred_word_biolabels[0][1] + for pred_word_biolabels in pred_words_biolabels + ] + merged_data_sample.pred_label = pred_label + else: + raise NotImplementedError( + 'The `only_label_first_subword=False` is not support yet.') # determine whether it is an inference process if 'item' in data_samples[0].gt_label: # merge gt label ids from data_samples gt_label_ids = [ - data_sample.gt_label.item for data_sample in data_samples + data_sample.gt_label.item[1:-1] for data_sample in data_samples ] gt_label_ids = torch.cat( - gt_label_ids, dim=0).cpu().detach().numpy() - gt_biolabels = [ - self.id2biolabel[g] - for (w, g) in zip(flattened_word_ids, gt_label_ids) - if w is not None - ] + gt_label_ids, dim=0).cpu().detach().numpy().tolist() + gt_words_biolabels = [] + word_biolabels = [] + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: + if word_biolabels: + gt_words_biolabels.append(word_biolabels) + word_biolabels = [] + word_biolabels.append(self.id2biolabel[gt_label_ids[idx]]) + else: + gt_words_biolabels.append(word_biolabels) + break + pre_word_id = cur_word_id # update merged gt_label - merged_data_sample.gt_label.item = gt_biolabels - - # inference process do not have item in gt_label, - # so select valid token with flattened_word_ids - # rather than with gt_label_ids like official code. - pred_biolabels = [ - self.id2biolabel[p] - for (w, p) in zip(flattened_word_ids, pred_label_ids) - if w is not None - ] - pred_biolabel_scores = [ - s for (w, s) in zip(flattened_word_ids, pred_label_scores) - if w is not None - ] - # record pred_label - pred_label = LabelData() - pred_label.item = pred_biolabels - pred_label.score = pred_biolabel_scores - merged_data_sample.pred_label = pred_label + if self.only_label_first_subword: + merged_data_sample.gt_label.item = [ + gt_word_biolabels[0] + for gt_word_biolabels in gt_words_biolabels + ] + else: + raise NotImplementedError( + 'The `only_label_first_subword=False` is not support yet.') return [merged_data_sample] diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py index f0cdc3707..0df89db0b 100644 --- a/projects/LayoutLMv3/visualization/ser_visualizer.py +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -65,11 +65,11 @@ def __init__(self, self.line_width = line_width self.alpha = alpha - def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, - torch.Tensor], - word_ids: Optional[List[int]], - gt_labels: Optional[LabelData], - pred_labels: Optional[LabelData]) -> np.ndarray: + def _draw_instances(self, + image: np.ndarray, + bboxes: Union[np.ndarray, torch.Tensor], + gt_labels: Optional[LabelData] = None, + pred_labels: Optional[LabelData] = None) -> np.ndarray: """Draw bboxes and polygons on image. 
Args: @@ -97,33 +97,19 @@ def _draw_instances(self, image: np.ndarray, bboxes: Union[np.ndarray, if gt_labels is not None: gt_tokens_biolabel = gt_labels.item - gt_words_label = [] - - pre_word_id = None - for idx, cur_word_id in enumerate(word_ids): - if cur_word_id is not None: - if cur_word_id != pre_word_id: - gt_words_label_name = gt_tokens_biolabel[idx][2:] \ - if gt_tokens_biolabel[idx] != 'O' else 'other' - gt_words_label.append(gt_words_label_name) - pre_word_id = cur_word_id + gt_words_label = [ + token_biolabel[2:] if token_biolabel != 'O' else 'other' + for token_biolabel in gt_tokens_biolabel + ] assert len(gt_words_label) == len(bboxes) + if pred_labels is not None: pred_tokens_biolabel = pred_labels.item - pred_words_label = [] - pred_tokens_biolabel_score = pred_labels.score - pred_words_label_score = [] - - pre_word_id = None - for idx, cur_word_id in enumerate(word_ids): - if cur_word_id is not None: - if cur_word_id != pre_word_id: - pred_words_label_name = pred_tokens_biolabel[idx][2:] \ - if pred_tokens_biolabel[idx] != 'O' else 'other' - pred_words_label.append(pred_words_label_name) - pred_words_label_score.append( - pred_tokens_biolabel_score[idx]) - pre_word_id = cur_word_id + pred_words_label = [ + token_biolabel[2:] if token_biolabel != 'O' else 'other' + for token_biolabel in pred_tokens_biolabel + ] + pred_words_label_score = pred_labels.score assert len(pred_words_label) == len(bboxes) # draw gt or pred labels @@ -205,11 +191,6 @@ def add_datasample(self, cat_images = [] if data_sample is not None: bboxes = np.array(data_sample.instances.get('boxes', None)) - # here need to flatten truncation_word_ids - word_ids = [ - word_id for word_ids in data_sample.truncation_word_ids - for word_id in word_ids[1:-1] - ] gt_label = data_sample.gt_label if \ draw_gt and 'gt_label' in data_sample else None pred_label = data_sample.pred_label if \ @@ -218,7 +199,6 @@ def add_datasample(self, orig_img_with_bboxes = self._draw_instances( image=image.copy(), bboxes=bboxes, - word_ids=None, gt_labels=None, pred_labels=None) cat_images.append(orig_img_with_bboxes) @@ -226,7 +206,6 @@ def add_datasample(self, empty_img_with_label = self._draw_instances( image=empty_img, bboxes=bboxes, - word_ids=word_ids, gt_labels=gt_label, pred_labels=pred_label) cat_images.append(empty_img_with_label) From edf7fe8535698ac13bf386cd7feff817fec709a3 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 12 Jun 2023 12:47:44 +0800 Subject: [PATCH 45/50] =?UTF-8?q?[Fix]=20=E4=BF=AE=E5=A4=8D=E4=B8=80?= =?UTF-8?q?=E4=B8=AA=E5=9B=A0=E4=B8=BA=E5=88=86=E8=AF=8D=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E6=81=B0=E5=A5=BD510*n=E4=B8=AA=EF=BC=8C=E5=89=94=E9=99=A4?= =?UTF-8?q?=E6=94=B6=E5=B0=BENone=E6=A0=87=E8=AF=86=E5=90=8E=E6=B2=A1?= =?UTF-8?q?=E6=9C=89=E7=BB=93=E6=9D=9F=E6=A0=87=E5=BF=97=EF=BC=8C=E5=AF=BC?= =?UTF-8?q?=E8=87=B4=E6=9C=80=E5=90=8E=E4=B8=80=E4=B8=AAlabel=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E5=8A=A0=E5=85=A5=E7=BB=93=E6=9E=9C=E7=9A=84Bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/models/ser_postprocessor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py index 8bae6c829..cff120558 100644 --- a/projects/LayoutLMv3/models/ser_postprocessor.py +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -88,6 +88,8 @@ def __call__(self, outputs: torch.Tensor, pred_words_biolabels.append(word_biolabels) break pre_word_id = cur_word_id + if 
word_biolabels: + pred_words_biolabels.append(word_biolabels) # record pred_label if self.only_label_first_subword: pred_label = LabelData() @@ -126,6 +128,8 @@ def __call__(self, outputs: torch.Tensor, gt_words_biolabels.append(word_biolabels) break pre_word_id = cur_word_id + if word_biolabels: + gt_words_biolabels.append(word_biolabels) # update merged gt_label if self.only_label_first_subword: merged_data_sample.gt_label.item = [ From 8a1e37bf64e8cb26d37631dd3e163829c9137174 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 12 Jun 2023 13:01:59 +0800 Subject: [PATCH 46/50] =?UTF-8?q?[Fix]=20=E9=87=8D=E7=BD=AEword=5Fbiolabel?= =?UTF-8?q?s=E9=98=B2=E6=AD=A2=E9=87=8D=E5=A4=8D=E6=B7=BB=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/models/ser_postprocessor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py index cff120558..d8be05433 100644 --- a/projects/LayoutLMv3/models/ser_postprocessor.py +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -86,6 +86,7 @@ def __call__(self, outputs: torch.Tensor, pred_label_scores[idx])) else: pred_words_biolabels.append(word_biolabels) + word_biolabels = [] break pre_word_id = cur_word_id if word_biolabels: @@ -126,6 +127,7 @@ def __call__(self, outputs: torch.Tensor, word_biolabels.append(self.id2biolabel[gt_label_ids[idx]]) else: gt_words_biolabels.append(word_biolabels) + word_biolabels = [] break pre_word_id = cur_word_id if word_biolabels: From 0f0f8caa0802dc76e88ed9040f09cc849783cfea Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 27 Jun 2023 15:23:06 +0800 Subject: [PATCH 47/50] =?UTF-8?q?=E5=88=A0=E9=99=A4=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=E4=B8=AD=E6=89=80=E6=9C=89=E7=9A=84=E7=BB=9D=E5=AF=B9=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=EF=BC=8C=E8=A1=A5=E5=85=85README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- projects/LayoutLMv3/README.md | 70 ++++++++++--------- .../ser/layoutlmv3_1k_xfund_zh_1xbs8.py | 3 +- projects/LayoutLMv3/scripts/run_ser.sh | 8 --- 3 files changed, 39 insertions(+), 42 deletions(-) delete mode 100644 projects/LayoutLMv3/scripts/run_ser.sh diff --git a/projects/LayoutLMv3/README.md b/projects/LayoutLMv3/README.md index c2a22bfb2..41561997d 100644 --- a/projects/LayoutLMv3/README.md +++ b/projects/LayoutLMv3/README.md @@ -1,84 +1,90 @@ -# Dummy ResNet Wrapper +# LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking -> This is a README template for community `projects/`. - -> All the fields in this README are **mandatory** for others to understand what you have achieved in this implementation. If you still feel unclear about the requirements, please read our [contribution guide](https://mmocr.readthedocs.io/en/dev-1.x/notes/contribution_guide.html), [projects FAQ](../faq.md), or approach us in [Discussions](https://github.com/open-mmlab/mmocr/discussions). + ## Description -> Share any information you would like others to know. For example: -> -> Author: @xxx. -> -> This is an implementation of \[XXX\]. +This is an implementation of [LayoutLMv3](https://github.com/microsoft/unilm/tree/master/layoutlmv3) based on [MMOCR](https://github.com/open-mmlab/mmocr/tree/dev-1.x), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine) and [Transformers](https://github.com/huggingface/transformers). 
-This project implements a dummy ResNet wrapper, which literally does nothing new but prints "hello world" during initialization.
+**LayoutLMv3** Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in image-centric tasks such as document image classification and document layout analysis. The code and models are publicly available at https://aka.ms/layoutlmv3.
+
+ +
## Usage -> For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. + ### Prerequisites - Python 3.7 - PyTorch 1.6 or higher +- [Transformers](https://github.com/huggingface/transformers) 4.31.0.dev0 or higher - [MIM](https://github.com/open-mmlab/mim) - [MMOCR](https://github.com/open-mmlab/mmocr) -All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `example_project/` root directory, run the following line to add the current directory to `PYTHONPATH`: +### Preparing xfund dataset + +In MMOCR's root directory, run the following command to prepare xfund dataset: ```shell -# Linux -export PYTHONPATH=`pwd`:$PYTHONPATH -# Windows PowerShell -$env:PYTHONPATH=Get-Location +sh projects/LayoutLMv3/scripts/prepare_dataset.sh ``` +### Downloading Pre-training LayoutLMv3 model + +Download the [LayoutLMv3 Chinese pre-trained model](https://huggingface.co/microsoft/layoutlmv3-base-chinese) from huggingface. + ### Training commands +Modify the path of the parameter `hf_pretrained_model` in the config file(`projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py`) + In MMOCR's root directory, run the following command to train the model: ```bash -mim train mmocr configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py --work-dir work_dirs/dummy_mae/ +export TOKENIZERS_PARALLELISM=false +export OMP_NUM_THREADS=1 +mim train mmocr projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py --work-dir work_dirs/ ``` -To train on multiple GPUs, e.g. 8 GPUs, run the following command: + ### Testing commands In MMOCR's root directory, run the following command to test the model: ```bash -mim test mmocr configs/dbnet_dummy-resnet_fpnc_1200e_icdar2015.py --work-dir work_dirs/dummy_mae/ --checkpoint ${CHECKPOINT_PATH} +mim test mmocr projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py --work-dir work_dirs/ --checkpoint ${CHECKPOINT_PATH} ``` ## Results -> List the results as usually done in other model's README. [Example](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet/README.md#results-and-models) + ## Citation -> You may remove this section if not applicable. +If you find LayoutLMv3 useful in your research or applications, please cite LayoutLMv3 with the following BibTeX entry. ```bibtex -@software{MMOCR_Contributors_OpenMMLab_Text_Detection_2020, -author = {{MMOCR Contributors}}, -license = {Apache-2.0}, -month = {8}, -title = {{OpenMMLab Text Detection, Recognition and Understanding Toolbox}}, -url = {https://github.com/open-mmlab/mmocr}, -version = {0.3.0}, -year = {2020} +@inproceedings{huang2022layoutlmv3, + title={Layoutlmv3: Pre-training for document ai with unified text and image masking}, + author={Huang, Yupan and Lv, Tengchao and Cui, Lei and Lu, Yutong and Wei, Furu}, + booktitle={Proceedings of the 30th ACM International Conference on Multimedia}, + pages={4083--4091}, + year={2022} } ``` @@ -96,7 +102,7 @@ Here is a checklist illustrating a usual development workflow of a successful pr - [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. - - [ ] Finish the code + - [x] Finish the code > The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmocr.registry.MODELS` and configurable via a config file. 
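The checklist's final line above states the convention that every model component is registered into `mmocr.registry.MODELS` and wired up through a config file; `ser_collate` follows the same pattern via `COLLATE_FUNCTIONS`. The sketch below only illustrates that register-then-build mechanism; `MyTinyHead` and the config snippet are hypothetical names, not part of this patch.

```python
# Minimal sketch of the register-then-build convention described above.
# `MyTinyHead` is a hypothetical component used purely for illustration.
from mmengine.model import BaseModule

from mmocr.registry import MODELS


@MODELS.register_module()
class MyTinyHead(BaseModule):
    """A toy component; registering it makes it addressable by name."""

    def __init__(self, num_classes: int = 7):
        super().__init__()
        self.num_classes = num_classes


# In a config file the registered name becomes the `type` field, e.g.
#   model = dict(type='MyTinyHead', num_classes=7)
# and the runner builds the component from that dict via the registry:
head = MODELS.build(dict(type='MyTinyHead', num_classes=7))
```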
diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py index 304156810..a4f9fcf0a 100644 --- a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py @@ -4,8 +4,7 @@ ] # ================== Frequently modified parameters ================== -hf_pretrained_model = '/Users/wangnu/Documents/GitHub' \ - '/mmocr/data/layoutlmv3-base-chinese' +hf_pretrained_model = 'data/layoutlmv3-base-chinese' dataset_name = 'xfund_zh' class_name = ('answer', 'header', 'question', 'other') max_iters = 1000 diff --git a/projects/LayoutLMv3/scripts/run_ser.sh b/projects/LayoutLMv3/scripts/run_ser.sh deleted file mode 100644 index 12b0f8271..000000000 --- a/projects/LayoutLMv3/scripts/run_ser.sh +++ /dev/null @@ -1,8 +0,0 @@ -config='/Users/wangnu/Documents/GitHub/mmocr/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py' - -export TOKENIZERS_PARALLELISM=false -export OMP_NUM_THREADS=1 -export PYTHONPATH='/Users/wangnu/Documents/GitHub/mmocr' - -python tools/train.py \ -${config} \ From ae8c4269abfc0aa1132226570ba3c403c9e6b89a Mon Sep 17 00:00:00 2001 From: gaotongxiao Date: Wed, 18 Oct 2023 12:05:19 +0800 Subject: [PATCH 48/50] fix lint --- .../datasets/transforms/layoutlmv3_transforms.py | 8 ++++---- projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py | 4 ++-- projects/LayoutLMv3/models/loss_processor.py | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py index 02684a2b0..ed545d5f6 100644 --- a/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py +++ b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py @@ -2,10 +2,6 @@ from typing import Dict, List, Optional, Union from mmcv.transforms.base import BaseTransform - -from mmocr.registry import TRANSFORMS -from projects.LayoutLMv3.utils.bio_label_utils import \ - find_other_label_name_of_biolabel from transformers import LayoutLMv3ImageProcessor, LayoutXLMTokenizerFast from transformers.file_utils import PaddingStrategy from transformers.image_processing_utils import BatchFeature @@ -13,6 +9,10 @@ from transformers.tokenization_utils_base import (BatchEncoding, TruncationStrategy) +from mmocr.registry import TRANSFORMS +from projects.LayoutLMv3.utils.bio_label_utils import \ + find_other_label_name_of_biolabel + @TRANSFORMS.register_module() class LoadProcessorFromPretrainedModel(BaseTransform): diff --git a/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py index 9c2d71e84..dcb786602 100644 --- a/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py +++ b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py @@ -3,12 +3,12 @@ import torch from mmengine.model import BaseModel +from transformers import LayoutLMv3ForTokenClassification +from transformers.modeling_outputs import TokenClassifierOutput from mmocr.registry import MODELS from projects.LayoutLMv3.utils.typing_utils import (OptSERSampleList, SERSampleList) -from transformers import LayoutLMv3ForTokenClassification -from transformers.modeling_outputs import TokenClassifierOutput ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, Tuple[torch.Tensor], torch.Tensor] diff --git a/projects/LayoutLMv3/models/loss_processor.py b/projects/LayoutLMv3/models/loss_processor.py index a9ac2b563..54154ffa2 100644 
--- a/projects/LayoutLMv3/models/loss_processor.py +++ b/projects/LayoutLMv3/models/loss_processor.py @@ -1,6 +1,7 @@ -from mmocr.registry import MODELS from transformers.trainer_pt_utils import LabelSmoother +from mmocr.registry import MODELS + @MODELS.register_module() class ComputeLossAfterLabelSmooth(LabelSmoother): From db5673ffe6b13ed76c3e4edbf28a9ed78946020a Mon Sep 17 00:00:00 2001 From: gaotongxiao Date: Fri, 20 Oct 2023 16:56:22 +0800 Subject: [PATCH 49/50] fix ci --- mmocr/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmocr/__init__.py b/mmocr/__init__.py index faf1ae81e..4524c4c3c 100644 --- a/mmocr/__init__.py +++ b/mmocr/__init__.py @@ -43,7 +43,7 @@ f'<{mmengine_maximum_version}.' mmdet_minimum_version = '3.0.0rc5' -mmdet_maximum_version = '3.2.0' +mmdet_maximum_version = '3.4.0' mmdet_version = digit_version(mmdet.__version__) assert (mmdet_version >= digit_version(mmdet_minimum_version) From c7a38950bf72815e0f47a9b826b52b01e1be0062 Mon Sep 17 00:00:00 2001 From: gaotongxiao Date: Sat, 28 Oct 2023 12:01:15 +0800 Subject: [PATCH 50/50] ci --- .circleci/docker/Dockerfile | 1 + .circleci/test.yml | 20 +++++++++++++------- .github/workflows/merge_stage_test.yml | 5 ++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile index d9cf8cc77..b5efe06a4 100644 --- a/.circleci/docker/Dockerfile +++ b/.circleci/docker/Dockerfile @@ -1,6 +1,7 @@ ARG PYTORCH="1.8.1" ARG CUDA="10.2" ARG CUDNN="7" +ARG DEBIAN_FRONTEND=noninteractive FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel diff --git a/.circleci/test.yml b/.circleci/test.yml index c24bebcb5..51d9770ad 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -80,16 +80,22 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1", "11.7"] + enum: ["10.1", "10.2", "11.1", "11.7", "11.8"] cudnn: type: integer default: 7 machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: linux-cuda-11:default # docker_layer_caching: true - resource_class: gpu.nvidia.small + resource_class: gpu.nvidia.small.multi steps: - checkout + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker - run: # Cloning repos in VM since Docker doesn't have access to the private key name: Clone Repos @@ -152,8 +158,8 @@ workflows: - lint - build_cpu: name: maximum_version_cpu - torch: 2.0.0 - torchvision: 0.15.1 + torch: 2.1.0 + torchvision: 0.16.0 python: 3.9.0 requires: - minimum_version_cpu @@ -171,10 +177,10 @@ workflows: - hold - build_cuda: name: mainstream_version_gpu - torch: 2.0.0 + torch: 2.1.0 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "11.7" + cuda: "11.8" cudnn: 8 requires: - hold diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 856ede833..44be34746 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -60,7 +60,7 @@ jobs: strategy: matrix: python-version: [3.7] - torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0] + torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0, 2.0.0, 2.1.0] include: - torch: 1.6.0 torchvision: 0.7.0 @@ -81,6 +81,9 @@ jobs: - torch: 2.0.0 torchvision: 0.15.1 python-version: 3.8 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 steps: - uses: actions/checkout@v3 - name: Set up 
Python ${{ matrix.python-version }}