Skip to content

Commit

Permalink
Add basic category support
Browse files Browse the repository at this point in the history
* This adds a rudimentary implementation of the category feature. Almost
  nothing has been mapped yet, and it has not really been tested, so there
  might be lots of mistakes.
* Also add a command line argument to support a different output
  directory if desired.
  • Loading branch information
Javex committed Sep 30, 2023
1 parent 7f96a4b commit e5b7225
Show file tree
Hide file tree
Showing 8 changed files with 2,706 additions and 42 deletions.
27 changes: 22 additions & 5 deletions hotprices_au/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,24 @@
from . import output, sites


def get_canoncial_for(store, raw_items, today):
def get_canoncial_for(store, raw_items, category_map, today):
canonical_items = []
store_module = sites.sites[store]
for raw_item in raw_items:
try:
canonical_item = sites.sites[store].get_canonical(raw_item, today)
canonical_item = store_module.get_canonical(raw_item, today)
except Exception:
logger.exception(f"Unable to process store '{store}' item: {raw_item}")
import pprint; pprint.pprint(raw_item)
import pdb; pdb.set_trace()
continue
if canonical_item is None:
continue
canonical_item['store'] = store
try:
canonical_item['category'] = store_module.get_category_from_map(category_map, raw_item)
except KeyError:
canonical_item['category'] = None
canonical_items.append(canonical_item)
return canonical_items

Expand Down Expand Up @@ -88,19 +95,29 @@ def transform_data(day, output_dir, data_dir, store_filter=None):
# Skip if we only transform one store
continue
store_items = []
raw_categories = output.load_data(store, day=day)
raw_categories = output.load_data(store, output_dir, day=day)
# Let's try and figure out categories
store_module = sites.sites[store]
category_map = store_module.get_category_mapping(raw_categories)

for category in raw_categories:
try:
raw_items = category['Products']
except KeyError:
# Don't have items for this category
continue

canonical_items = get_canoncial_for(store, raw_items, day.strftime('%Y-%m-%d'))
canonical_items = get_canoncial_for(store, raw_items, category_map, day.strftime('%Y-%m-%d'))
store_items += canonical_items

store_items = dedup_items(store_items)
logger.info(f"Total number of products for store '{store}': {len(store_items)}")

uncategorised = 0
for item in store_items:
if item['category'] is None:
uncategorised += 1

logger.info(f"Total number of products for store '{store}': {len(store_items)}. Uncategorised: {uncategorised}")
all_items += store_items

latest_canonical_file = pathlib.Path(output_dir / "latest-canonical.json.gz")
Expand Down
32 changes: 32 additions & 0 deletions hotprices_au/categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import json
import pathlib

from hotprices_au.logging import logger

# "data" directory located next to this module; the per-store
# "<store>-categories.json" mapping files are read from and written to it.
PKG_DATA_FOLDER = pathlib.Path(__file__).parent / "data"

def merge_save_save_categories(store, categories):
    """Merge freshly obtained categories with the saved mapping and persist it.

    The mapping for ``store`` lives in ``PKG_DATA_FOLDER`` as
    ``<store>-categories.json``. When that file already exists:

    * every entry of ``categories`` whose ``id`` is present in the file gets
      its ``code`` overwritten with the saved one;
    * entries whose ``id`` is not in the file are logged as new/unmapped;
    * saved entries whose ``id`` no longer appears in ``categories`` are
      logged and appended, so they are not lost on rewrite.

    The merged list is then written back to the file as indented JSON.

    Args:
        store: Store name used to build the mapping file name.
        categories: List of category dicts. Each needs an ``id`` key and, for
            the log message about new categories, a ``description`` key. The
            list and its dicts are modified in place.

    Returns:
        The same ``categories`` list object, after merging.

    Raises:
        KeyError: If an entry lacks ``id``, or a saved entry that matches an
            incoming one lacks ``code``.
    """
    store_file = PKG_DATA_FOLDER / f"{store}-categories.json"
    # Create folder if it doesn't exist
    PKG_DATA_FOLDER.mkdir(parents=True, exist_ok=True)

    if store_file.exists():
        # JSON is UTF-8 by specification; don't depend on the locale default.
        old_mapping = json.loads(store_file.read_text(encoding="utf-8"))
        old_lookup = {c['id']: c for c in old_mapping}

        # Track which ids were seen instead of popping them from the lookup:
        # popping made a second incoming category with the same id look like
        # a brand-new one and left it without its saved code.
        seen_ids = set()

        for category in categories:
            category_id = category['id']
            seen_ids.add(category_id)
            old_category = old_lookup.get(category_id)
            if old_category is None:
                logger.info(f"Found new unmapped category for {store}: {category['id']} - {category['description']}")
            else:
                category['code'] = old_category['code']

        # Keep saved entries that vanished from the latest data so an
        # existing mapping is never dropped when the file is rewritten.
        for old_id, old_cat in old_lookup.items():
            if old_id in seen_ids:
                continue
            logger.info(
                f"Found category absent in latest mapping for {store}: "
                f"{old_cat['id']} - {old_cat['description']}")
            categories.append(old_cat)

    store_file.write_text(json.dumps(categories, indent=2), encoding="utf-8")
    return categories
Loading

0 comments on commit e5b7225

Please sign in to comment.