Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎉 wizard: dataset preview #3765

Merged
merged 19 commits into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
308 changes: 308 additions & 0 deletions apps/wizard/app_pages/dataset_preview/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
"""Show indicators in datasets from database.

The idea is to quickly prototype a better way to show indicators in datasets.

TODO: only works for ETL-based datasets.
"""

from collections import defaultdict

import streamlit as st

from apps.wizard.app_pages.dataset_preview.dependency_graph import load_dag_cached, show_modal_dependency_graph
from apps.wizard.app_pages.dataset_preview.utils import (
IndicatorsInCharts,
IndicatorsInExplorers,
IndicatorSingleDimension,
IndicatorWithDimensions,
get_charts_views,
get_datasets,
get_explorers_views,
get_table_charts,
get_table_explorers,
get_users,
show_table_charts,
show_table_explorers,
)
from apps.wizard.utils.components import Pagination, grapher_chart, st_horizontal, st_tag
from etl.config import OWID_ENV
from etl.grapher.io import load_variables_in_dataset

# Icon shortcodes keyed by dimension name.
# NOTE(review): not referenced anywhere in the visible code; presumably meant
# for the "add icons for recognized dimensions" TODO in `st_show_indicator`.
ICONS_DIMENSIONS = {
    "age": ":material/cake:",
    "sex": ":material/wc:",
}


def parse_indicators(indicators_raw):
    """Build the list of indicator objects to show in the UI.

    Indicators are grouped whenever applicable, which makes them easier to display:

    - An indicator without dimensions (`dimensions is None`) is wrapped on its
      own in an `IndicatorSingleDimension`.
    - Indicators with dimensions are grouped by "<table>#<originalShortName>",
      and each group is wrapped in one `IndicatorWithDimensions`.

    The returned list holds all single indicators first (in input order),
    followed by the grouped ones (in order of first appearance).

    Raises:
        AssertionError: if the `catalogPath` of an indicator is not a string.
    """
    indicators = []

    # Indicators with dimensions, grouped by "<table>#<originalShortName>"
    indicators_with_dim = defaultdict(list)
    for indicator in indicators_raw:
        if indicator.dimensions is not None:
            short_name = indicator.dimensions["originalShortName"]
            assert isinstance(indicator.catalogPath, str), f"`catalogPath` is empty for variable {indicator.id}"
            # The table is whatever precedes '#' in the catalog path
            table = indicator.catalogPath.split("#")[0]
            indicators_with_dim[f"{table}#{short_name}"].append(indicator)
        else:
            assert isinstance(indicator.catalogPath, str), f"`catalogPath` is empty for variable {indicator.id}"
            indicators.append(IndicatorSingleDimension(indicator))

    # One collection object per group; the grouping key itself is not needed anymore
    indicators.extend(IndicatorWithDimensions(group) for group in indicators_with_dim.values())

    return indicators


def filter_sort_indicators(indicators):
    """Sort the given list of indicators in place by their `key` and return it.

    Indicators whose `key` is None are placed at the end.

    TODO: add filters / sorting options in UI.
    """

    def _none_last(item):
        # The leading boolean pushes None keys after all the others and avoids
        # comparing None with an actual key.
        item_key = item.key
        return (item_key is None, item_key)

    indicators.sort(key=_none_last)
    return indicators


def prompt_dataset_options(dataset_options):
    """Ask the user which dataset they want, keeping the choice in sync with the URL.

    The current selection is mirrored in the `datasetId` query parameter, so a
    dataset can be pre-selected with `?datasetId=YOUR_DATASET_ID`.

    Archived datasets are left out of the dropdown, unless the dataset requested
    via the query parameter is itself archived; in that case every dataset in
    `dataset_options` is listed.

    Args:
        dataset_options: IDs of the datasets that can be selected. Each of them
            is looked up in the module-level `DATASETS` mapping.

    Returns:
        The dataset ID chosen in the dropdown, as returned by `st.selectbox`.
    """
    # Mirror the widget state (once the widget exists) into the query params.
    # An empty selection is stored as None: writing str(None) to the URL would
    # make the integer parsing below fail on the next rerun, so the parameter
    # is dropped instead.
    if "dataset_select" in st.session_state:
        selected = st.session_state["dataset_select"]
        if selected is not None:
            st.query_params["datasetId"] = str(selected)
        elif "datasetId" in st.query_params:
            del st.query_params["datasetId"]

    # Collect and validate the requested ID from the query params
    requested_raw = st.query_params.get("datasetId")
    requested_id = None
    if requested_raw is not None:
        try:
            requested_id = int(requested_raw)
        except ValueError:
            # Not a number: reported through the same error as an unknown ID
            requested_id = None
        if requested_id not in dataset_options:
            st.error(f"Dataset with ID {requested_raw} not found. Please review the URL query parameters.")
            requested_id = None

    # Only list archived datasets when one of them was explicitly requested
    show_archived = requested_id is not None and DATASETS[requested_id]["isArchived"] != 0
    if not show_archived:
        dataset_options = [ds_id for ds_id in dataset_options if DATASETS[ds_id]["isArchived"] == 0]

    # Position of the requested dataset in the (possibly filtered) list of options
    dataset_index = None if requested_id is None else dataset_options.index(requested_id)

    # Show dropdown with options
    dataset_id = st.selectbox(
        label="Dataset",
        options=dataset_options,
        format_func=lambda x: DATASETS[x]["display_name"],
        key="dataset_select",
        placeholder="Select dataset",
        index=dataset_index,  # type: ignore
        help="By default, only non-archived datasets are shown. However, if you search for an archived one via QUERY PARAMS, the list will show all datasets, including archived ones. To use QUERY PARAMS, add `?datasetId=YOUR_DATASET_ID` to the URL.",
    )

    return dataset_id


def prompt_display_charts():
    """Render the "Display charts" checkbox and return its value.

    The checkbox state is mirrored in the `displayCharts` query parameter; when
    that parameter is missing, the checkbox starts checked.
    """
    state = st.session_state
    if "display_charts" in state:
        # Widget already exists: push its current value to the URL
        current_value = state["display_charts"]
        st.query_params["displayCharts"] = str(current_value)

    param_value = st.query_params.get("displayCharts", "True")
    checked = st.checkbox(
        label="Display charts",
        value=(param_value == "True"),
        key="display_charts",
        help="Uncheck to show only indicator descriptions. This avoids rendering charts and can improve performance.",
    )
    return checked


@st.fragment
def st_show_indicator(indicator, indicator_charts, display_charts=True):
    """Display one indicator: title, dimension selectors, usage tables and chart.

    Layout: a header (title, catalog path and, if applicable, a "dimensions"
    tag) on top of two columns. The right column holds the dimension pills
    (only for indicators with dimensions) plus the tables of charts and
    explorers using the indicator. The left column holds the Grapher chart or,
    when `display_charts` is False, the short description (if any).

    Args:
        indicator: Object exposing `is_mdim`, `key`, `dimensions`, `indicators`,
            `get_dimensions_conditioned` and `get_dimension`.
        indicator_charts: Passed through to `get_table_charts`.
        display_charts: Whether to render the Grapher chart.

    NOTE: besides its arguments, this function reads the module-level names
    `USERS`, `CHART_VIEWS`, `EXPLORER_VIEWS` and `indicator_explorers`, so it
    can only be called once those have been defined.
    """
    with st.container(border=False):
        # Allocate space for indicator title / URI. It is filled in further
        # down, once the variable to show is known.
        st_header = st.container()
        st_metadata_left, st_metadata_right = st.columns(2)

        with st_metadata_right:
            # Show dimensions as pills -- TODO: add icons for recognized dimensions
            if indicator.is_mdim:
                # Dimensions
                with st.container(border=True):
                    st.markdown("**Dimensions**")
                    dim_values_dix = {}
                    for dim in indicator.dimensions.keys():
                        key_pills = f"dataset_pills_{indicator.key}_{dim}"
                        # The values picked so far are passed along when
                        # computing the options of this dimension.
                        options = indicator.get_dimensions_conditioned(dim, dim_values_dix)
                        st.pills(
                            dim,
                            options,
                            key=key_pills,
                            default=options[0],
                        )

                        dim_value_ = st.session_state.get(key_pills)
                        if dim_value_ is None:
                            # A pill can be deselected, leaving no value. Fall
                            # back to the default option instead of failing.
                            dim_value_ = options[0]
                        dim_values_dix[dim] = dim_value_
                dim_values = tuple(dim_values_dix.values())

                # Sanity check on dimensions
                assert all(
                    value is not None for value in dim_values
                ), f"Missing dimension value for indicator {indicator.key}: {dim_values_dix}"

                # Get indicator-dimensions combination
                var = indicator.get_dimension(dim_values)
            else:
                # st.markdown("No dimensions")
                var = indicator.indicators[0]

            # Charts
            df_charts = get_table_charts(indicator_charts, USERS, CHART_VIEWS, var.id)
            show_table_charts(df_charts)

            # Explorers (`indicator_explorers` is a module-level global, see NOTE above)
            df_explorers = get_table_explorers(indicator_explorers, EXPLORER_VIEWS, var.id)
            show_table_explorers(df_explorers)

        # Show indicator title and URI
        name = var.name
        iid = var.id
        with st_header:
            with st_horizontal():  # (vertical_alignment="center"):
                st.markdown(f"#### [**{name}**]({OWID_ENV.indicator_admin_site(iid)})")
                st.caption(var.catalogPath.replace("grapher/", ""))
                if indicator.is_mdim:
                    st_tag(tag_name="dimensions", color="primary", icon=":material/deployed_code")

        # Show chart (contains description, and other metadata fields)
        with st_metadata_left:
            if not display_charts:
                if var.descriptionShort:
                    st.markdown(var.descriptionShort)
            else:
                grapher_chart(variable_id=iid, tab="map")  # type: ignore


# CONFIG
st.set_page_config(
    # page_title="Wizard: Dataset Explorer",
    layout="wide",
    page_icon="🪄",
    # initial_sidebar_state="collapsed",
)
# Number of indicators per page in the "Indicators" tab (pagination controls
# are only shown when there are more indicators than this)
PAGE_ITEMS_LIMIT = 25

# Session state
st.session_state.setdefault("indicator_selected", {})

# Get datasets from DB / cached
DATASETS = get_datasets()
CHART_VIEWS = get_charts_views()
EXPLORER_VIEWS = get_explorers_views()
USERS = get_users()
DAG = load_dag_cached()

# IDs of all datasets in DATASETS (no archived / non-archived filtering is done here)
dataset_options = list(DATASETS.keys())

# Show dataset search bar
DATASET_ID = prompt_dataset_options(dataset_options)
# DATASET_ID = 6617 # DEBUG
DISPLAY_CHARTS = prompt_display_charts()

# DATASET_ID = 6869, 6813
# Main page body: only rendered once a dataset has been selected
if DATASET_ID is not None:
    dataset = DATASETS[DATASET_ID]

    # 0/ Present Dataset (title links to the dataset admin page)
    title = dataset["name"]
    st.header(f"[{title}]({OWID_ENV.dataset_admin_site(DATASET_ID)})")

    # 1/ Get indicators from dataset
    indicators_raw = load_variables_in_dataset(dataset_id=[int(DATASET_ID)])

    if indicators_raw == []:
        st.warning("No indicators found in this dataset.")
    else:
        ## Chart info
        indicator_charts = IndicatorsInCharts.from_indicators(indicators_raw)

        ## Explorer info
        indicator_explorers = IndicatorsInExplorers.from_indicators(indicators_raw)

        ## Parse indicators
        indicators = parse_indicators(indicators_raw)

        # 2/ Get charts and explorers (for the whole dataset: no indicator ID is passed)
        df_charts = get_table_charts(indicator_charts, USERS, CHART_VIEWS)
        df_explorers = get_table_explorers(indicator_explorers, EXPLORER_VIEWS)

        # 3/ Dataset metadata
        with st_horizontal():
            st.markdown(f":material/schedule: Last modified: {dataset['updatedAt'].strftime('%Y-%m-%d')}")
            st.markdown(f"{len(indicators)} indicators")
            st.markdown(f"{len(df_charts)} charts")

            if dataset["isPrivate"] == 1:
                st_tag("Private", color="blue", icon=":material/lock")
            if dataset["isArchived"] == 1:
                st_tag("Archived", color="red", icon=":material/delete_forever")
            # Any mdim?
            if any(ind.is_mdim for ind in indicators):
                st_tag(tag_name="indicators with dimensions", color="primary", icon=":material/deployed_code")

            # Button that opens the dependency graph of the dataset, wrapped in its own fragment
            @st.fragment
            def show_dependency_btn():
                st.button(
                    "Dependency graph",
                    icon=":material/account_tree:",
                    # `dataset` is bound as a default argument of the callback
                    on_click=lambda dataset=dataset: show_modal_dependency_graph(dataset, DAG),
                )

            show_dependency_btn()

        # 4/ Tabs
        tab_indicators, tab_charts = st.tabs(["Indicators", "Charts"])

        with tab_indicators:
            # Apply filters / sorting
            indicators = filter_sort_indicators(indicators)

            # Use pagination
            pagination = Pagination(
                items=indicators,
                items_per_page=PAGE_ITEMS_LIMIT,
                pagination_key="pagination-dataset-search",
            )

            # Controls are only needed if the indicators do not fit in a single page
            if len(indicators) > PAGE_ITEMS_LIMIT:
                pagination.show_controls(mode="bar")

            # Show items (only current page)
            for item in pagination.get_page_items():
                st_show_indicator(item, indicator_charts, DISPLAY_CHARTS)
                st.divider()

        with tab_charts:
            st.markdown("#### Charts")
            show_table_charts(df_charts)
            st.markdown("#### Explorers")
            show_table_explorers(df_explorers)
            st.markdown("#### Most frequent chart editors")
            # Number of rows in the charts table per value of its "User" column
            user_counts = df_charts["User"].value_counts()
            st.dataframe(user_counts, use_container_width=True)
Loading
Loading