-
Notifications
You must be signed in to change notification settings - Fork 0
/
LeclercDataLoader.py
78 lines (59 loc) · 2.59 KB
/
LeclercDataLoader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import xml.etree.ElementTree as ET
import re
def read_leclerc_dataset(train_directory_path):
    """Load every product found in the XML files of a directory.

    Each entry of ``train_directory_path`` is parsed as an XML document and
    every direct child of the document root is treated as one product.

    Args:
        train_directory_path: Path of the directory holding the XML files.
            Every entry is handed to the XML parser, so the directory should
            contain nothing but parseable XML files.

    Returns:
        A list with one dict per product. Each dict has the keys
        ``'categories'`` (the list built by ``read_categories``) and
        ``'text'`` (the metadata text and the properties text joined by a
        single space).

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be
            opened.
        xml.etree.ElementTree.ParseError: If an entry is not well-formed XML.
    """
    products = []
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the returned products reproducible from one run/machine to the next.
    for file_name in sorted(os.listdir(train_directory_path)):
        print("reading file : " + file_name)
        root = ET.parse(os.path.join(train_directory_path, file_name)).getroot()
        # Each child of the root represents a product.
        for item in root:
            products.append({
                'categories': read_categories(item),
                'text': str(read_metadata(item)) + " " + str(read_properties(item)),
            })
    return products
def read_metadata(item):
    """Collect the free-text metadata of one product element.

    The text of the ``title``, ``subtitle``, ``description`` and ``suplier``
    children and of ``universe/name`` is gathered in that order. Anything that
    looks like a markup tag (``<...>``) inside a text is replaced by a single
    space. Fields that are missing or have no text are skipped.

    Args:
        item: The XML element describing one product.

    Returns:
        A string that starts with one space, followed by each available field
        prefixed with one more space. It is ``" "`` when no field has text.
    """
    tag_pattern = re.compile('<.*?>')
    # NOTE(review): "suplier" is the tag name exactly as the lookup spells it;
    # presumably it mirrors the data files - confirm before "correcting" it.
    field_paths = (
        "./title",
        "./subtitle",
        "./description",
        "./suplier",
        "./universe/name",
    )
    parts = [" "]
    for path in field_paths:
        node = item.find(path)
        if node is not None and node.text is not None:
            parts.append(" " + tag_pattern.sub(' ', node.text))
    return "".join(parts)
def read_properties(item):
    """Collect the names and values of a product's properties as text.

    For every ``properties/property`` element, its ``name`` attribute and then
    its text are appended, each with markup-like ``<...>`` spans replaced by a
    single space. A missing ``name`` attribute or a missing text is skipped,
    so the literal word "None" never leaks into the result.

    Args:
        item: The XML element describing one product.

    Returns:
        A string that starts with one space, followed by each available name
        and value prefixed with one more space. It is ``" "`` when the
        product has no properties.
    """
    tag_pattern = re.compile('<.*?>')
    parts = [" "]
    for prop in item.findall("./properties/property"):
        # Name first, then value, matching the order of the produced text.
        for raw in (prop.get('name'), prop.text):
            if raw is not None:
                parts.append(" " + tag_pattern.sub(' ', raw))
    return "".join(parts)
def read_categories(item):
    """Return the category labels of one product element.

    Looks up ``category_1``, ``category_2`` and ``category_3`` under the
    ``categories`` child, in that order, and keeps the text of each one that
    exists and has text.

    Args:
        item: The XML element describing one product.

    Returns:
        A list of the category texts that were found; it may be empty.
    """
    category_paths = (
        "./categories/category_1",
        "./categories/category_2",
        "./categories/category_3",
    )
    labels = []
    for path in category_paths:
        node = item.find(path)
        if node is None or node.text is None:
            continue
        labels.append(node.text)
    return labels