From bccfd008adff50d9ad04ae082563b8574e0d382c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?= Date: Mon, 3 Jun 2024 16:47:40 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20data=20workflow:=20various=20enhanc?= =?UTF-8?q?ements=20(#2706)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * init staging * deprecate meta_expert * update models, cost estimates * integrate oracle into expert * reorder options * make page link wrapper more flexible * make link to expert less verbose * align wizard theme with site * only ignore secrets * remove warning * wip * fix enum * three-state review * minor fix * minor tweak * remove unused var * error if same indicator old=new * fix bug * remove exception catch, so we can debug better * logo placeholder * add confusion matrix tab * confusion matrix if categorical * lint * add reject to owidbot and chart-sync * fix * fix --------- Co-authored-by: Marigold --- .gitignore | 2 +- .streamlit/config.toml | 5 + apps/chart_sync/cli.py | 44 +- apps/owidbot/chart_diff.py | 21 +- apps/wizard/app.py | 3 + apps/wizard/config/config.yml | 10 +- apps/wizard/etl_steps/express.py | 2 +- apps/wizard/home.py | 11 +- apps/wizard/pages/chart_diff/app.py | 68 ++- apps/wizard/pages/chart_diff/chart_diff.py | 48 +- apps/wizard/pages/expert/app.py | 50 +- apps/wizard/pages/expert/prompts.py | 488 ++++++++++++++++++ apps/wizard/pages/indicator_upgrade/app.py | 7 +- .../pages/indicator_upgrade/charts_update.py | 2 +- .../pages/indicator_upgrade/explore_mode.py | 176 +++++-- apps/wizard/pages/meta_expert/__init__.py | 1 - apps/wizard/pages/meta_expert/app.py | 145 ------ .../meta_expert/docs_reduced/dataset.txt | 73 --- .../meta_expert/docs_reduced/indicators.txt | 134 ----- .../pages/meta_expert/docs_reduced/origin.txt | 68 --- .../pages/meta_expert/docs_reduced/tables.txt | 33 -- apps/wizard/pages/meta_expert/prompts.py | 152 ------ apps/wizard/pages/owid_datasette_oracle.py | 223 -------- 
.../pages/owid_datasette_oracle_prompt.py | 486 ----------------- apps/wizard/utils/__init__.py | 21 +- apps/wizard/utils/gpt.py | 40 +- etl/grapher_model.py | 26 +- 27 files changed, 835 insertions(+), 1504 deletions(-) create mode 100644 .streamlit/config.toml delete mode 100644 apps/wizard/pages/meta_expert/__init__.py delete mode 100644 apps/wizard/pages/meta_expert/app.py delete mode 100644 apps/wizard/pages/meta_expert/docs_reduced/dataset.txt delete mode 100644 apps/wizard/pages/meta_expert/docs_reduced/indicators.txt delete mode 100644 apps/wizard/pages/meta_expert/docs_reduced/origin.txt delete mode 100644 apps/wizard/pages/meta_expert/docs_reduced/tables.txt delete mode 100644 apps/wizard/pages/meta_expert/prompts.py delete mode 100644 apps/wizard/pages/owid_datasette_oracle.py delete mode 100644 apps/wizard/pages/owid_datasette_oracle_prompt.py diff --git a/.gitignore b/.gitignore index 3e1ca276418..016c1e6af96 100644 --- a/.gitignore +++ b/.gitignore @@ -47,7 +47,7 @@ site/ .wizard .wizardcfg/* -.streamlit/* +.streamlit/secrets.toml .ipynb_lock .execution_time.json diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 00000000000..f48b0c62459 --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,5 @@ +[theme] +base="light" +primaryColor="#ce261e" +textColor="#333333" +secondaryBackgroundColor="#f0f4fa" diff --git a/apps/chart_sync/cli.py b/apps/chart_sync/cli.py index 416c626694f..ec122be29b5 100644 --- a/apps/chart_sync/cli.py +++ b/apps/chart_sync/cli.py @@ -100,7 +100,7 @@ def cli( - Both published charts and drafts from staging are synced. - Existing charts (with the same slug) are added as chart revisions in target. (Revisions could be pre-approved with `--approve-revisions` flag) - You get a warning if the chart **_has been modified on live_** after staging server was created. 
- - If the chart is unapproved in chart-diff, you'll get a warning and Slack notification + - If the chart is pending in chart-diff, you'll get a warning and Slack notification - Deleted charts are **_not synced_**. **Considerations on chart revisions:** @@ -196,24 +196,23 @@ def cli( ### New chart-diff workflow ### if chartdiff: # Change has been approved, update the chart - if diff.approved: + if diff.is_approved: log.info("chart_sync.chart_update", slug=chart_slug, chart_id=chart_id) charts_synced += 1 if not dry_run: target_api.update_chart(chart_id, diff.source_chart.config) - # TODO: should we add rejected state? # Rejected chart diff - # elif diff.rejected: - # log.info( - # "chart_sync.rejected", - # slug=chart_slug, - # chart_id=chart_id, - # ) - # continue - - # Not approved, notify us about it - elif diff.unapproved: + elif diff.is_rejected: + log.info( + "chart_sync.is_rejected", + slug=chart_slug, + chart_id=chart_id, + ) + continue + + # Pending chart, notify us about it + elif diff.is_pending: log.warning( "chart_sync.pending_chart", slug=chart_slug, @@ -309,7 +308,7 @@ def cli( if chartdiff: # New chart has been approved - if diff.approved: + if diff.is_approved: charts_synced += 1 if not dry_run: resp = target_api.create_chart(diff.source_chart.config) @@ -322,18 +321,17 @@ def cli( slug=chart_slug, new_chart_id=resp["chartId"], ) - # TODO: should we add rejected state? 
# Rejected chart diff - # elif diff.rejected: - # log.info( - # "chart_sync.rejected", - # slug=chart_slug, - # chart_id=chart_id, - # ) - # continue + elif diff.is_rejected: + log.info( + "chart_sync.is_rejected", + slug=chart_slug, + chart_id=chart_id, + ) + continue # Not approved, create the chart, but notify us about it - elif diff.unapproved: + elif diff.is_pending: log.warning( "chart_sync.new_unapproved_chart", slug=chart_slug, diff --git a/apps/owidbot/chart_diff.py b/apps/owidbot/chart_diff.py index f95bbb85145..fad3400a4b4 100644 --- a/apps/owidbot/chart_diff.py +++ b/apps/owidbot/chart_diff.py @@ -22,13 +22,13 @@ def create_check_run(repo_name: str, branch: str, charts_df: pd.DataFrame, dry_r if charts_df.empty: conclusion = "neutral" - title = "No new or modified charts" - elif charts_df.approved.all(): + title = "No charts for review" + elif charts_df.is_reviewed.all(): conclusion = "success" - title = "All charts are approved" + title = "All charts are reviewed" else: conclusion = "failure" - title = "Some charts are not approved" + title = "Some charts are not reviewed" if not dry_run: # Create the check run and complete it in a single command @@ -49,7 +49,7 @@ def run(branch: str, charts_df: pd.DataFrame) -> str: chart_diff = format_chart_diff(charts_df) - if charts_df.empty or charts_df.approved.all(): + if charts_df.empty or charts_df.is_reviewed.all(): status = "✅" else: status = "❌" @@ -83,7 +83,10 @@ def call_chart_diff(branch: str) -> pd.DataFrame: df.append( { "chart_id": diff.chart_id, - "approved": diff.approved, + "is_approved": diff.is_approved, + "is_pending": diff.is_pending, + "is_rejected": diff.is_rejected, + "is_reviewed": diff.is_reviewed, "is_new": diff.is_new, } ) @@ -93,14 +96,14 @@ def call_chart_diff(branch: str) -> pd.DataFrame: def format_chart_diff(df: pd.DataFrame) -> str: if df.empty: - return "No new or modified charts." + return "No charts for review." 
new = df[df.is_new] modified = df[~df.is_new] return f""" """.strip() diff --git a/apps/wizard/app.py b/apps/wizard/app.py index 34c02a6a9e3..e82eaaa5385 100644 --- a/apps/wizard/app.py +++ b/apps/wizard/app.py @@ -10,6 +10,9 @@ from apps.wizard import utils from apps.wizard.config import WIZARD_CONFIG +# Logo +# st.logo("docs/assets/logo.png") + # Get current directory CURRENT_DIR = Path(__file__).parent # Page config diff --git a/apps/wizard/config/config.yml b/apps/wizard/config/config.yml index 46c0c401de9..38e0e4f20e5 100644 --- a/apps/wizard/config/config.yml +++ b/apps/wizard/config/config.yml @@ -32,15 +32,11 @@ main: expert: title: "Expert" description: "Ask the expert ETL questions!" - maintainer: "@lucas" + maintainer: + - "@lucas" + - "@daniel" entrypoint: pages/expert/app.py emoji: "🧙" - oracle: - title: "OWID Datasette Oracle" - description: "Get help writing SQL queries for datasette!" - maintainer: "@daniel" - entrypoint: pages/owid_datasette_oracle.py - emoji: "🔮" # ETL steps etl: diff --git a/apps/wizard/etl_steps/express.py b/apps/wizard/etl_steps/express.py index 2b102d3ddab..764b165cbe3 100644 --- a/apps/wizard/etl_steps/express.py +++ b/apps/wizard/etl_steps/express.py @@ -372,7 +372,7 @@ def remove_notebook(dataset_dir): # TITLE st.title("Create step 🐆 **:gray[Express]**") -st.info("Use this step to create Meadow, Garden and Grapher step for a _single dataset_! **Requires ETL expertise**.") +st.info("Use this step to create Meadow, Garden and Grapher step for a _single dataset_!.") # SIDEBAR with st.sidebar: diff --git a/apps/wizard/home.py b/apps/wizard/home.py index 9c8eb14d019..24b29c72c2a 100644 --- a/apps/wizard/home.py +++ b/apps/wizard/home.py @@ -20,9 +20,14 @@ """ ) -with st.container(border=True): - st.markdown("Questions about the documentation? Ask the expert!") - st_page_link("expert", help="Ask the expert any documentation question!", use_container_width=True) + +st_page_link( + "expert", + label="Questions about ETL or Grapher? 
Ask the expert!", + help="Ask the expert any documentation question!", + use_container_width=True, + border=True, +) # Generic tools ## Default styling for the cards (Wizard apps are presented as cards) diff --git a/apps/wizard/pages/chart_diff/app.py b/apps/wizard/pages/chart_diff/app.py index 51a9c4a1446..ec412309612 100644 --- a/apps/wizard/pages/chart_diff/app.py +++ b/apps/wizard/pages/chart_diff/app.py @@ -8,6 +8,7 @@ from st_pages import add_indentation from structlog import get_logger +import etl.grapher_model as gm from apps.chart_sync.cli import _modified_chart_ids_by_admin from apps.wizard.pages.chart_diff.chart_diff import ChartDiffModified from apps.wizard.pages.chart_diff.config_diff import st_show_diff @@ -36,12 +37,14 @@ initial_sidebar_state="collapsed", ) add_indentation() -st.session_state.hide_approved_charts = st.session_state.get("hide_approved_charts", False) +st.session_state.hide_reviewed_charts = st.session_state.get("hide_reviewed_charts", False) ######################################## # LOAD ENVS ######################################## +warn_msg = [] + SOURCE = OWID_ENV assert OWID_ENV.env_type_id != "production", "Your .env points to production DB, please use a staging environment." @@ -49,14 +52,20 @@ if config.ENV_FILE_PROD: TARGET = OWIDEnv.from_env_file(config.ENV_FILE_PROD) else: - warning_msg = "ENV file doesn't connect to production DB, comparing against staging-site-master" + warning_msg = "ENV file doesn't connect to production DB, comparing against `staging-site-master`." log.warning(warning_msg) - st.warning(warning_msg) + warn_msg.append(warning_msg) TARGET = OWIDEnv.from_staging("master") CHART_PER_PAGE = 10 +######################################## +# WARNING MSG +######################################## +warn_msg += ["This tool is being developed! 
Please report any issues you encounter in `#proj-new-data-workflow`"] +st.warning("- " + "\n\n- ".join(warn_msg)) + ######################################## # FUNCTIONS ######################################## @@ -94,13 +103,16 @@ def get_chart_diffs( def st_show(diff: ChartDiffModified, source_session, target_session=None) -> None: """Show the chart diff in Streamlit.""" # Define label - emoji = "✅" if diff.approved else "⏳" + print("Showing diff, state:", diff.is_approved, diff.is_rejected, diff.is_pending) + emoji = "✅" if diff.is_approved else ("❌" if diff.is_rejected else "⏳") label = f"{emoji} {diff.source_chart.config['slug']}" # Define action for Toggle on change - def tgl_on_change(diff, session) -> None: + def chart_state_change(diff, session) -> None: + # print(st.session_state.chart_diffs[diff.chart_id].approval_status) with st.spinner(): - diff.switch_state(session=session) + status = st.session_state[f"radio-{diff.chart_id}"] + diff.set_status(session=session, status=status) # Define action for Refresh on click def refresh_on_click(source_session=source_session, target_session=None): @@ -133,7 +145,7 @@ def refresh_on_click(source_session=source_session, target_session=None): raise ValueError("chart_diff show have flags `is_modified = not is_new`.") # Actually show stuff - with st.expander(label, not diff.approved): + with st.expander(label, not diff.is_reviewed): col1, col2 = st.columns([1, 3]) # Refresh @@ -145,11 +157,20 @@ def refresh_on_click(source_session=source_session, target_session=None): help="Get the latest version of the chart from the staging server.", ) - options = ["Pending", "Approve"] + # Actions on chart diff: approve, pending, reject options = { - "Approve": "green", - "Pending": "orange", - # "Reject": "red", + gm.ChartStatus.APPROVED.value: { + "label": "Approve", + "color": "green", + }, + gm.ChartStatus.REJECTED.value: { + "label": "Reject", + "color": "red", + }, + gm.ChartStatus.PENDING.value: { + "label": "Pending", + 
"color": "gray", + }, } option_names = list(options.keys()) with col1: @@ -158,12 +179,13 @@ def refresh_on_click(source_session=source_session, target_session=None): key=f"radio-{diff.chart_id}", options=option_names, horizontal=True, - format_func=lambda x: f":{options.get(x)}-background[{x}]", - index=option_names.index("Approve") if diff.approved else option_names.index("Pending"), - on_change=lambda diff=diff, session=source_session: tgl_on_change(diff, session), + format_func=lambda x: f":{options[x]['color']}-background[{options[x]['label']}]", + index=option_names.index(diff.approval_status), # type: ignore + on_change=lambda diff=diff, session=source_session: chart_state_change(diff, session), # label_visibility="collapsed", ) + # Show diff if diff.is_modified: tab1, tab2 = st.tabs(["Charts", "Config diff"]) with tab1: @@ -230,14 +252,13 @@ def get_engines() -> tuple[Engine, Engine]: def show_help_text(): with st.popover("How does this work?"): - staging_name = OWID_ENV.name.upper() st.markdown( f""" - **Chart diff** is a living page that compares all ongoing charts between `PRODUCTION` and your `{staging_name}` environment. + **Chart diff** is a living page that compares all ongoing charts between [`production`](http://owid.cloud) and your [`{OWID_ENV.name}`]({OWID_ENV.admin_site}) environment. - It lists all those charts that have been modified in the `{staging_name}` environment. + It lists all those charts that have been modified in the `{OWID_ENV.name}` environment. - If you want any of the modified charts in `{staging_name}` to be migrated to `PRODUCTION`, you can approve them by clicking on the toggle button. + If you want any of the modified charts in `{OWID_ENV.name}` to be migrated to `production`, you can approve them by clicking on the toggle button. """ ) @@ -254,7 +275,6 @@ def reject_chart_diffs(engine): ######################################## def main(): st.title("Chart ⚡ **:gray[Diff]**") - st.warning("This tool is being developed! 
Please report any issues you encounter in #proj-new-data-workflow") show_help_text() # Get stuff from DB @@ -278,22 +298,22 @@ def main(): on_click=lambda e=source_engine: reject_chart_diffs(e), ) - def hide_approved(): + def hide_reviewed(): set_states( { - "hide_approved_charts": not st.session_state.hide_approved_charts, + "hide_reviewed_charts": not st.session_state.hide_reviewed_charts, } ) # Hide approved (if option selected) - if st.session_state.hide_approved_charts: + if st.session_state.hide_reviewed_charts: st.session_state.chart_diffs_filtered = { - k: v for k, v in st.session_state.chart_diffs.items() if not v.approved + k: v for k, v in st.session_state.chart_diffs.items() if not v.is_reviewed } else: st.session_state.chart_diffs_filtered = st.session_state.chart_diffs - st.toggle("Hide appoved charts", key="hide-approved-charts", on_change=hide_approved) + st.toggle("Hide reviewed charts", key="hide-reviewed-charts", on_change=hide_reviewed) # Get actual charts if st.session_state.chart_diffs == {}: diff --git a/apps/wizard/pages/chart_diff/chart_diff.py b/apps/wizard/pages/chart_diff/chart_diff.py index 8d2d7bf36de..536445fd240 100644 --- a/apps/wizard/pages/chart_diff/chart_diff.py +++ b/apps/wizard/pages/chart_diff/chart_diff.py @@ -1,5 +1,5 @@ import datetime as dt -from typing import Optional, get_args +from typing import Optional import streamlit as st from sqlalchemy.exc import NoResultFound @@ -9,11 +9,16 @@ class ChartDiffModified: + # Chart in source environment source_chart: gm.Chart + # Chart in target environment (if new in source environment, there won't be one) target_chart: Optional[gm.Chart] - approval_status: gm.CHART_DIFF_STATUS + # Three state: 'approved', 'pending', 'rejected' + approval_status: gm.CHART_DIFF_STATUS | str - def __init__(self, source_chart: gm.Chart, target_chart: Optional[gm.Chart], approval_status: gm.CHART_DIFF_STATUS): + def __init__( + self, source_chart: gm.Chart, target_chart: Optional[gm.Chart], 
approval_status: gm.CHART_DIFF_STATUS | str + ): self.source_chart = source_chart self.target_chart = target_chart self.approval_status = approval_status @@ -22,12 +27,20 @@ def __init__(self, source_chart: gm.Chart, target_chart: Optional[gm.Chart], app self.chart_id = source_chart.id @property - def approved(self) -> bool: - return self.approval_status == "approved" + def is_reviewed(self) -> bool: + return self.is_approved or self.is_rejected @property - def unapproved(self) -> bool: - return self.approval_status == "unapproved" + def is_approved(self) -> bool: + return self.approval_status == gm.ChartStatus.APPROVED.value + + @property + def is_rejected(self) -> bool: + return self.approval_status == gm.ChartStatus.REJECTED.value + + @property + def is_pending(self) -> bool: + return self.approval_status == gm.ChartStatus.PENDING.value @property def is_new(self): @@ -80,6 +93,7 @@ def from_chart_id(cls, chart_id, source_session: Session, target_session: Option source_chart.updatedAt, target_chart.updatedAt if target_chart else None, ) + print("called DB for state, got:", approval_status) # Build object chart_diff = cls(source_chart, target_chart, approval_status) @@ -99,19 +113,17 @@ def sync(self, source_session: Session, target_session: Optional[Session] = None def approve(self, session: Session) -> None: """Approve chart diff.""" # Update status variable - self.set_status(session, "approved") + self.set_status(session, gm.ChartStatus.APPROVED) - def unapprove(self, session: Session) -> None: - """Unapprove chart diff.""" + def reject(self, session: Session) -> None: + """Reject chart diff.""" # Update status variable - self.set_status(session, "unapproved") + self.set_status(session, gm.ChartStatus.REJECTED) - def switch_state(self, session: Session) -> None: - """Switch the state of the chart diff. 
This will work only with two states: approved and unapproved.""" + def unreview(self, session: Session) -> None: + """Set chart diff to pending.""" # Update status variable - assert get_args(gm.CHART_DIFF_STATUS) == ("approved", "unapproved") - status = "approved" if self.unapproved else "unapproved" - self.set_status(session, status) + self.set_status(session, gm.ChartStatus.PENDING) def set_status(self, session: Session, status: gm.CHART_DIFF_STATUS) -> None: """Update the state of the chart diff.""" @@ -121,7 +133,6 @@ def set_status(self, session: Session, status: gm.CHART_DIFF_STATUS) -> None: self.approval_status = status # Update approval status (in database) - st.toast(f"Updating state for **chart {self.chart_id}** to `{self.approval_status}`") assert self.chart_id if self.is_modified: assert self.target_chart @@ -129,8 +140,9 @@ def set_status(self, session: Session, status: gm.CHART_DIFF_STATUS) -> None: chartId=self.chart_id, sourceUpdatedAt=self.source_chart.updatedAt, targetUpdatedAt=None if self.is_new else self.target_chart.updatedAt, # type: ignore - status="approved" if self.approved else "unapproved", + status=self.approval_status, ) + st.toast(f"Updating state for **chart {self.chart_id}** to `{self.approval_status}`") session.add(approval) session.commit() diff --git a/apps/wizard/pages/expert/app.py b/apps/wizard/pages/expert/app.py index ae4dac64f43..1b0dceed429 100644 --- a/apps/wizard/pages/expert/app.py +++ b/apps/wizard/pages/expert/app.py @@ -11,6 +11,7 @@ from structlog import get_logger from apps.wizard.pages.expert.prompts import ( + SYSTEM_PROMPT_DATASETTE, SYSTEM_PROMPT_FULL, SYSTEM_PROMPT_GUIDES, SYSTEM_PROMPT_METADATA, @@ -43,10 +44,11 @@ def ask_gpt(query, model): # GPT CONFIG -MODEL_DEFAULT = "gpt-4-turbo-preview" +MODEL_DEFAULT = "gpt-4o" MODELS_AVAILABLE = { - "gpt-3.5-turbo-0125": "GPT-3.5 Turbo (gpt-3.5-turbo-0125)", - "gpt-4-turbo-preview": "GPT-4 Turbo (gpt-4-turbo-preview)", + "gpt-4o": "GPT-4o", # IN: US$5.00 / 1M tokens; 
OUT: US$15.00 / 1M tokens + "gpt-4-turbo": "GPT-4 Turbo", # IN: US$10.00 / 1M tokens; OUT: US$30.00 / 1M tokens (gpt-4-turbo-2024-04-09) + "gpt-3.5-turbo": "GPT 3.5 Turbo", # IN: US$0.50 / 1M tokens; OUT: US$1.50 / 1M tokens (gpt-3.5-turbo-0125) } MODELS_AVAILABLE_LIST = list(MODELS_AVAILABLE.keys()) @@ -56,6 +58,7 @@ def ask_gpt(query, model): class Options: """Chat categories.""" + DATASETTE = "Datasette" METADATA = "Metadata" START = "Setting up your environment" GUIDES = "How to use, tools, APIs, and guides" @@ -69,9 +72,6 @@ def handle_feedback(feedback: Dict[str, Any]) -> None: """Handle feedback.""" print("handle feedback") print(feedback) - # st.write(feedback) - # st.write(st.session_state.prompt) - # st.write(st.session_state.response) WizardDB().add_usage( question=st.session_state.messages[-2]["content"], answer=st.session_state.response, @@ -101,6 +101,9 @@ def get_system_prompt() -> str: case Options.FULL: log.warning("Switching to 'All' system prompt.") system_prompt = SYSTEM_PROMPT_FULL + case Options.DATASETTE: + log.warning("Switching to 'DATASETTE' system prompt.") + system_prompt = SYSTEM_PROMPT_DATASETTE case Options.DEBUG: log.warning("Switching to 'DEBUG' system prompt.") system_prompt = "" @@ -128,10 +131,10 @@ def reset_messages() -> None: options=[ Options.FULL, Options.METADATA, + Options.DATASETTE, Options.START, Options.GUIDES, Options.PRINCIPLES, - # Options.DEBUG, ], index=1, help="Choosing a domain reduces the cost of the query to chatGPT, since only a subset of the documentation will be used in the query (i.e. fewer tokens used).", @@ -139,14 +142,21 @@ def reset_messages() -> None: on_change=reset_messages, ) -## Examples -EXAMPLE_QUERIES = [ - "> In the metadata yaml file, which field should I use to disable the map tap view?", - "> In the metadata yaml file, how can I define a common `description_processing` that affects all indicators in a specific table?" 
- "> What is the difference between `description_key` and `description_from_producer`? Be concise.", - "> Is the following snapshot title correct? 'Cherry Blossom Full Blook Dates in Kyoto, Japan'", - "> What is the difference between an Origin and Dataset?", -] +## EXAMPLE QUERIES +if st.session_state["category_gpt"] == Options.DATASETTE: + EXAMPLE_QUERIES = [ + "> Which are our top 10 articles by pageviews?", + "> How many charts do we have that use only a single indicator?", + "> Do we have datasets whose indicators are not used in any chart?", + ] +else: + EXAMPLE_QUERIES = [ + "> In the metadata yaml file, which field should I use to disable the map tap view?", + "> In the metadata yaml file, how can I define a common `description_processing` that affects all indicators in a specific table?" + "> What is the difference between `description_key` and `description_from_producer`? Be concise.", + "> Is the following snapshot title correct? 'Cherry Blossom Full Blook Dates in Kyoto, Japan'", + "> What is the difference between an Origin and Dataset?", + ] with st.popover("See examples"): for example in EXAMPLE_QUERIES: st.markdown(example) @@ -176,10 +186,10 @@ def reset_messages() -> None: options=MODELS_AVAILABLE_LIST, format_func=lambda x: MODELS_AVAILABLE[x], index=MODELS_AVAILABLE_LIST.index(MODEL_DEFAULT), - help="[Pricing](https://openai.com/pricing) | [Model list](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo)", + help="[Pricing](https://openai.com/api/pricing) | [Model list](https://platform.openai.com/docs/models/)", ) - ## See pricing list: https://openai.com/pricing (USD) - ## See model list: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo + ## See pricing list: https://openai.com/api/pricing (USD) + ## See model list: https://platform.openai.com/docs/models/ use_reduced_context = st.toggle( "Reduced context window", @@ -198,8 +208,8 @@ def reset_messages() -> None: st.number_input( "Max tokens", min_value=32, - 
max_value=2048, - value=512, + max_value=4096, + value=4096, step=32, help="The maximum number of tokens in the response.", ) diff --git a/apps/wizard/pages/expert/prompts.py b/apps/wizard/pages/expert/prompts.py index 3018af48d5a..0401cfcb8c0 100644 --- a/apps/wizard/pages/expert/prompts.py +++ b/apps/wizard/pages/expert/prompts.py @@ -151,3 +151,491 @@ def read_page_md(page_path: str) -> str: {METADATA_REFERENCE} """ + +# DATASETTE ORACLE +SYSTEM_PROMPT_DATASETTE = """ +## Datasette Oracle V2 + +Datasette Oracle is designed to effectively utilize the provided database schema, making intelligent use of foreign key constraints to deduce relationships from natural language inquiries. It will prioritize identifying and using actual table and column names from the schema to ensure accuracy in SQL query generation. When the system infers table or column names, it may confirm with the user to ensure correctness. The SQL dialect used is SQLite. + +The schema is provided in yaml below. The top level array represents the tables, with a "name" field and an optional "description" field. The columns are listed under the "columns" key. If a column has a foreign key constraint onto another table, this is specified with the fields "fkTargetTable" and "fkTargetColumn". + +```yaml +- name: algolia_searches_by_week + columns: + - name: week_start_date + - name: index + - name: query + - name: total_searches + - name: total_hits +- name: analytics_pageviews + description: | + contains information on pageviews which can be very useful to order results by (e.g. to show + posts with the most pageviews first). The `url` of this table contains full urls - to match + it up with the `slug` column on `posts` or `posts_gdocs` or `charts` table you have to turn + those into full urls. 
`posts` and `posts_gdocs` slug just needs to be prefixed with + `https://ourworldindata.org/`, for charts it is `https://ourworldindata.org/grapher/`, + for explorers it is `https://ourworldindata.org/explorers/` + columns: + - name: day + - name: url + - name: views_7d + - name: views_14d + - name: views_365d + - name: url_domain + - name: url_path + - name: url_query + - name: url_fragment +- name: chart_dimensions + description: this table enumerates the variables (aka indicators) that are used in a chart + columns: + - name: id + - name: order + - name: property + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: chart_slug_redirects + description: this table contains alternative slugs pointing to charts + columns: + - name: id + - name: slug + - name: chart_id + fkTargetTable: charts + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: chart_tags + columns: + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: keyChartLevel + - name: createdAt + - name: updatedAt + - name: isApproved +- name: chart_variables + columns: + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id +- name: charts + description: | + contains the configuration for our data visualization. The `config` column contains a json + configuration for the chart. 
Important fields inside this json are hasMapTab, hasChartTab, + title, subtitle, slug and type (one of LineChart ScatterPlot StackedArea DiscreteBar + StackedDiscreteBar SlopeChart StackedBar Marimekko or missing in which case LineChart is the default) + columns: + - name: id + - name: slug + - name: type + - name: config + - name: createdAt + - name: updatedAt + - name: lastEditedAt + - name: publishedAt + - name: lastEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: publishedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: is_indexable + - name: title + - name: subtitle + - name: note + - name: title_plus_variant + - name: configWithDefaults +- name: dataset_tags + columns: + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: datasets + description: a collection of variables + columns: + - name: id + - name: name + - name: description + - name: createdAt + - name: updatedAt + - name: namespace + - name: isPrivate + - name: createdByUserId + fkTargetTable: users + fkTargetColumn: id + - name: metadataEditedAt + - name: metadataEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: dataEditedAt + - name: dataEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: nonRedistributable + - name: isArchived + - name: sourceChecksum + - name: shortName + - name: version + - name: updatePeriodDays +- name: entities + columns: + - name: id + - name: code + - name: name + - name: validated + - name: createdAt + - name: updatedAt + - name: displayName +- name: explorer_charts + columns: + - name: id + - name: explorerSlug + fkTargetTable: explorers + fkTargetColumn: slug + - name: chartId + fkTargetTable: charts + fkTargetColumn: id +- name: explorer_tags + columns: + - name: id + - name: explorerSlug + - name: tagId + fkTargetTable: tags + fkTargetColumn: id +- name: explorer_variables + 
columns: + - name: id + - name: explorerSlug + fkTargetTable: explorers + fkTargetColumn: slug + - name: variableId + fkTargetTable: variables + fkTargetColumn: id +- name: explorers + description: | + contains our explorers, which are more complex data visualisations. They can include charts but can also be configured differently. If they are using charts then the link is established in the `explorer_charts` table. Linking this to `variables` can be done as well but if doing so, alert the user to the fact that there are a lot of connections between these entities that are not tracked in the database. + columns: + - name: slug + - name: isPublished + - name: config + - name: createdAt + - name: updatedAt +- name: images + columns: + - name: id + - name: googleId + - name: filename + - name: defaultAlt + - name: originalWidth + - name: updatedAt + - name: originalHeight +- name: namespaces + columns: + - name: id + - name: name + - name: description + - name: isArchived + - name: createdAt + - name: updatedAt +- name: origins + columns: + - name: id + - name: titleSnapshot + - name: title + - name: descriptionSnapshot + - name: description + - name: producer + - name: citationFull + - name: attribution + - name: attributionShort + - name: versionProducer + - name: urlMain + - name: urlDownload + - name: dateAccessed + - name: datePublished + - name: license +- name: origins_variables + columns: + - name: originId + fkTargetTable: origins + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: displayOrder +- name: post_broken_chart_links + columns: + - name: id + - name: postId + fkTargetTable: posts + fkTargetColumn: id + - name: chartSlug + - name: kind +- name: post_charts + columns: + - name: id + - name: postId + fkTargetTable: posts + fkTargetColumn: id + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: kind + - name: through_redirect +- name: post_links + columns: + - name: id + - name: 
postId + fkTargetTable: posts + fkTargetColumn: id + - name: link + - name: kind +- name: post_tags + columns: + - name: post_id + fkTargetTable: posts + fkTargetColumn: id + - name: tag_id + fkTargetTable: tags + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: posts + description: | + The table for our old posts that were written in wordpress. It contains the html content of the post in the `content` column + and a markdown version of the content in the `markdown` column. + columns: + - name: id + - name: title + - name: slug + - name: type + - name: status + - name: content + - name: archieml + - name: archieml_update_statistics + - name: published_at + - name: updated_at + - name: gdocSuccessorId + - name: authors + - name: excerpt + - name: created_at_in_wordpress + - name: updated_at_in_wordpress + - name: featured_image + - name: formattingOptions + - name: markdown + - name: wpApiSnapshot +- name: posts_gdocs + description: | + The table for our new posts written in Google Docs. It contains content in form of json in the `content` column and a + markdown version of the content in the `markdown` column. 
+ columns: + - name: id + - name: slug + - name: type + - name: content + - name: published + - name: createdAt + - name: publishedAt + - name: updatedAt + - name: publicationContext + - name: revisionId + - name: breadcrumbs + - name: markdown + - name: title +- name: posts_gdocs_links + columns: + - name: id + - name: sourceId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: target + - name: linkType + - name: componentType + - name: text + - name: queryString + - name: hash +- name: posts_gdocs_variables_faqs + columns: + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: fragmentId + - name: displayOrder +- name: posts_gdocs_x_images + columns: + - name: id + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: imageId + fkTargetTable: images + fkTargetColumn: id +- name: posts_gdocs_x_tags + columns: + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id +- name: posts_links + columns: + - name: id + - name: sourceId + fkTargetTable: posts + fkTargetColumn: id + - name: target + - name: linkType + - name: componentType + - name: text + - name: queryString + - name: hash +- name: posts_unified + description: | + this table combines posts and posts_gdocs. To get the content you need to join it with + posts and posts_gdocs but this is the best place to query e.g. all titles. Type is one of: article homepage topic-page linear-topic-page data-insight author about-page. We sometimes call topic-page pages "Modular topic pages". 
+ columns: + - name: id + - name: slug + - name: title + - name: type + - name: publishedAt + - name: updatedAt + - name: authors + - name: createdAt + - name: publicationContext + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: wordpressId + fkTargetTable: posts + fkTargetColumn: id +- name: redirects + columns: + - name: id + - name: source + - name: target + - name: code + - name: createdAt + - name: updatedAt +- name: sources + columns: + - name: id + - name: name + - name: description + - name: createdAt + - name: updatedAt + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: additionalInfo + - name: link + - name: dataPublishedBy +- name: sqlite_sequence + columns: + - name: name + - name: seq +- name: tags + columns: + - name: id + - name: name + - name: createdAt + - name: updatedAt + - name: parentId + fkTargetTable: tags + fkTargetColumn: id + - name: specialType + - name: slug +- name: tags_variables_topic_tags + columns: + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: displayOrder +- name: users + columns: + - name: id + - name: password + - name: lastLogin + - name: isSuperuser + - name: email + - name: createdAt + - name: updatedAt + - name: isActive + - name: fullName + - name: lastSeen +- name: variables + columns: + - name: id + - name: name + - name: unit + - name: description + - name: createdAt + - name: updatedAt + - name: code + - name: coverage + - name: timespan + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: sourceId + fkTargetTable: sources + fkTargetColumn: id + - name: shortUnit + - name: display + - name: columnOrder + - name: originalMetadata + - name: grapherConfigAdmin + - name: shortName + - name: catalogPath + - name: dimensions + - name: schemaVersion + - name: processingLevel + - name: processingLog + - name: titlePublic + - name: titleVariant + - name: 
attributionShort + - name: attribution + - name: descriptionShort + - name: descriptionFromProducer + - name: descriptionKey + - name: descriptionProcessing + - name: licenses + - name: license + - name: grapherConfigETL + - name: type + - name: sort + +``` + +The content of the database is all the information for the Our World In Data website, a publication with writing and data visualization about the world's biggest problems. + +For questions about posts, articles, topic pages and so on, posts_unified is usually the best starting point and you should prefer querying that table over posts or posts_gdocs unless there is a compelling reason. For questions about grapher charts it is charts. For questions about indicators or variables it is variables. + +Your job is to create a SQL query for the user that answers their question given the schema above. You may ask the user for clarification, e.g. if it is unclear if unpublished items should be included (when applicable) or if there is ambiguity in which tables to use to answer a question. + +Upon generating a query, Datasette Oracle will always provide the SQL query both as text and as a clickable Datasette link, formatted for the user's convenience. The datasette URL is http://datasette-private and the database name is owid. An example query to get all rows from the algolia_searches_by_week table is this one that demonstrates the escaping: `http://datasette-private/owid?sql=select+*+from+algolia_searches_by_week` Remember, you cannot actually run the SQL query, you are just to output the query as text and a datasette link that will run that query! 
+""" diff --git a/apps/wizard/pages/indicator_upgrade/app.py b/apps/wizard/pages/indicator_upgrade/app.py index a1043e61393..d4855d25daf 100644 --- a/apps/wizard/pages/indicator_upgrade/app.py +++ b/apps/wizard/pages/indicator_upgrade/app.py @@ -158,9 +158,4 @@ ########################################################################################## if st.session_state.submitted_datasets and st.session_state.submitted_indicators and st.session_state.submitted_charts: if isinstance(charts, list) and len(charts) > 0: - try: - push_new_charts(charts, SCHEMA_CHART_CONFIG) - except Exception: - st.error( - "Something went wrong when trying to update the charts and pushing them to the database. Please try again or report the error #004001" - ) + push_new_charts(charts, SCHEMA_CHART_CONFIG) diff --git a/apps/wizard/pages/indicator_upgrade/charts_update.py b/apps/wizard/pages/indicator_upgrade/charts_update.py index 8b0184ffd00..066031e25f9 100644 --- a/apps/wizard/pages/indicator_upgrade/charts_update.py +++ b/apps/wizard/pages/indicator_upgrade/charts_update.py @@ -125,4 +125,4 @@ def push_new_charts(charts: List[gm.Chart], schema_chart_config: Dict[str, Any]) st.exception(e) else: st.success("The charts were successfully updated! Review the changes with `chart diff`") - st_page_link("chart_diff") + st_page_link("chart-diff") diff --git a/apps/wizard/pages/indicator_upgrade/explore_mode.py b/apps/wizard/pages/indicator_upgrade/explore_mode.py index 9981d25cd47..5df635a1a14 100644 --- a/apps/wizard/pages/indicator_upgrade/explore_mode.py +++ b/apps/wizard/pages/indicator_upgrade/explore_mode.py @@ -2,7 +2,8 @@ This is currently shown in the indicator upgrader, but might be moved to chart-diff in the future. 
""" -from typing import Dict, Tuple, cast +from dataclasses import dataclass, field +from typing import Dict, Optional, Tuple, cast import numpy as np import pandas as pd @@ -27,7 +28,10 @@ def st_explore_indicator_dialog(df, indicator_old, indicator_new, var_id_to_disp More on dialogs: https://docs.streamlit.io/develop/api-reference/execution-flow/st.dialog """ - st_explore_indicator(df, indicator_old, indicator_new, var_id_to_display) + if indicator_old == indicator_new: + st.error("Comparison failed, because old and new inidcators are the same.") + else: + st_explore_indicator(df, indicator_old, indicator_new, var_id_to_display) # @st.cache_data(show_spinner=False) @@ -60,7 +64,10 @@ def st_explore_indicator(df, indicator_old, indicator_new, var_id_to_display) -> # 2/ Get similarity score score = get_similarity_score(df_indicators, indicator_old, indicator_new) - # 3/ Get names + # 3/ Get summary + summary = get_summary_diff(df_indicators, indicator_old, indicator_new, is_numeric) + + # 4/ Get names name_old = var_id_to_display[indicator_old] name_new = var_id_to_display[indicator_new] @@ -68,29 +75,64 @@ def st_explore_indicator(df, indicator_old, indicator_new, var_id_to_display) -> st.markdown(f"New: `{name_new}`") # Check if there is any change - num_changes = (df_indicators[(indicator_old)] != df_indicators[indicator_new]).sum() + num_changes = (df_indicators[indicator_old] != df_indicators[indicator_new]).sum() if num_changes == 0: st.success("No changes in the data points!") else: - tab1, tab2 = st.tabs(["Summary", "Error distribution"]) - - with tab1: - # 4/ Show score - st_show_score(score) - - # 5/ other info (% of rows changed, number of rows changed) - st_show_details(df_indicators, indicator_old, indicator_new, is_numeric) - - # Rename, remove equal datapoints - df_indicators = df_indicators.loc[df_indicators[(indicator_old)] != df_indicators[indicator_new]] - df_indicators = df_indicators.rename(columns=var_id_to_display) - - # 6/ Show dataframe 
with different rows - st.header("Changes in data points") - st_show_dataframe(df_indicators, col_old=name_old, col_new=name_new) - with tab2: - # 7/ Show distribution of change - st_show_plot(df_indicators, col_old=name_old, col_new=name_new, is_numeric=is_numeric) + # No change in datapoints. I.e. only show main tab. + if summary.num_datapoints_changed == 0: + st_show_tab_main( + score, summary, df_indicators, indicator_old, indicator_new, var_id_to_display, name_old, name_new + ) + # Changes in datapoints. Show more tabs + else: + tab_names = ["Summary"] + if is_numeric: + tab_names.append("Error distribution") + if ((summary.num_categories_old < 10) & (summary.num_categories_new < 10)) | (not is_numeric): + tab_names.append("Confusion Matrix") + + tabs = st.tabs(tab_names) + for tab, tab_name in zip(tabs, tab_names): + with tab: + if tab_name == "Summary": + st_show_tab_main( + score, + summary, + df_indicators, + indicator_old, + indicator_new, + var_id_to_display, + name_old, + name_new, + ) + elif tab_name == "Error distribution": + st_show_plot(df_indicators, col_old=name_old, col_new=name_new, is_numeric=is_numeric) + elif tab_name == "Confusion Matrix": + df_ = df_indicators.copy() + df_[indicator_old] = df_[indicator_old].fillna("None") + df_[indicator_new] = df_[indicator_new].fillna("None") + confusion_matrix = pd.crosstab(df_[indicator_old], df_[indicator_new], dropna=False) + st.dataframe(confusion_matrix) + + +def st_show_tab_main( + score, summary, df_indicators, indicator_old, indicator_new, var_id_to_display, name_old, name_new +): + # 5/ Show score + if summary.num_datapoints_changed > 0: + st_show_score(score) + + # 6/ other info (% of rows changed, number of rows changed) + st_show_details(summary) + + # Rename, remove equal datapoints + df_indicators = df_indicators.loc[df_indicators[(indicator_old)] != df_indicators[indicator_new]] + df_indicators = df_indicators.rename(columns=var_id_to_display) + + # 7/ Show dataframe with different rows + 
st.header("Changes in data points") + st_show_dataframe(df_indicators, col_old=name_old, col_new=name_new) def correct_dtype(series: pd.Series) -> pd.Series: @@ -220,7 +262,7 @@ def get_similarity_score( if COLUMN_ABS_RELATIVE_ERROR in df.columns: # Numeric values with pd.option_context("mode.use_inf_as_na", True): - score["rel_error"] = df.loc[:, COLUMN_ABS_RELATIVE_ERROR].dropna().mean().round(N_ROUND_DEC) + score["rel_error"] = round(df.loc[:, COLUMN_ABS_RELATIVE_ERROR].dropna().mean(), N_ROUND_DEC) if (COLUMN_LOG_ERROR not in df.columns) and (COLUMN_RELATIVE_ERROR not in df.columns): # Categorical values assert (column_old is not None) and ( @@ -234,6 +276,55 @@ def get_similarity_score( return score +@dataclass +class SummaryDiff: + num_nan_score: Optional[int] = None + num_rows_change_relative: float = field(default=0.0) + num_rows_start: int = field(default=0) + num_rows_changed: int = field(default=0) + num_datapoints_changed: int = field(default=0) + num_datapoints_new: int = field(default=0) + num_datapoints_lost: int = field(default=0) + num_categories_old: int = field(default=0) + num_categories_new: int = field(default=0) + + +def get_summary_diff(df: pd.DataFrame, indicator_old: str, indicator_new: str, is_numeric: bool) -> SummaryDiff: + """Get summary of changes""" + summary = {} + + # Number of unknown scores + if is_numeric: + num_nan_score = df[COLUMN_RELATIVE_ERROR].isna().sum() + summary["num_nan_score"] = num_nan_score + + # Number of rows changed + nrows_0 = df.shape[0] + nrows_1 = (df[(indicator_old)] != df[indicator_new]).sum() + nrows_change_relative = round(100 * nrows_1 / nrows_0, 1) + summary["num_rows_start"] = nrows_0 + summary["num_rows_changed"] = nrows_1 + summary["num_rows_change_relative"] = nrows_change_relative + + # Number of changes in datapoints + num_changes = _get_num_changes(df, indicator_old, indicator_new) + summary["num_datapoints_changed"] = num_changes + + # Number of NAs is old indicator = new datapoints + 
num_nan_old = df[indicator_old].isna().sum() + summary["num_datapoints_new"] = num_nan_old + + # Number of NAs is new indicator + num_nan_new = df[indicator_new].isna().sum() + summary["num_datapoints_lost"] = num_nan_new + + # Get number of categories in old and new + summary["num_categories_old"] = df[indicator_old].nunique() + summary["num_categories_new"] = df[indicator_new].nunique() + + return SummaryDiff(**summary) + + def st_show_score(score): """Show similarity scores. @@ -277,33 +368,31 @@ def st_show_error_diff(score): st_show_error_diff(score) -def st_show_details(df, indicator_old, indicator_new, is_numeric): +def st_show_details(summary: SummaryDiff): # with col: text = [] # Number of unknown scores - if is_numeric: - num_nan_score = df[COLUMN_RELATIVE_ERROR].isna().sum() - text.append(f"**{num_nan_score}** rows with unknown score") + if summary.num_nan_score is not None: + text.append(f"**{summary.num_nan_score}** rows with unknown score") + # Number of rows changed - nrows_0 = df.shape[0] - nrows_1 = (df[(indicator_old)] != df[indicator_new]).sum() - nrows_change_relative = round(100 * nrows_1 / nrows_0, 1) - text.append(f"**{nrows_change_relative} %** of the rows changed ({nrows_1} out of {nrows_0})") + text.append( + f"**{summary.num_rows_change_relative} %** of the rows changed ({summary.num_rows_changed} out of {summary.num_rows_start})" + ) # Changes in detail text_changes = [] # Number of changes in datapoints - num_changes = len(df[df[(indicator_old)] != df[indicator_new]].dropna(subset=[indicator_old, indicator_new])) - text_changes.append(f"**{num_changes}** changes in datapoint values") + text_changes.append(f"**{summary.num_datapoints_changed}** changes in datapoint values") + # Number of NAs is old indicator = new datapoints - num_nan_old = df[indicator_old].isna().sum() - if num_nan_old > 0: - text_changes.append(f"**{num_nan_old}** new datapoints") + if summary.num_datapoints_new > 0: + 
text_changes.append(f"**{summary.num_datapoints_new}** new datapoints") + # Number of NAs is new indicator - num_nan_new = df[indicator_new].isna().sum() - if num_nan_new > 0: - text_changes.append(f"**{num_nan_new}** NAs in new indicator") + if summary.num_datapoints_lost > 0: + text_changes.append(f"**{summary.num_datapoints_lost}** NAs in new indicator") text_changes = "\n\t- " + "\n\t- ".join(text_changes) @@ -311,6 +400,11 @@ def st_show_details(df, indicator_old, indicator_new, is_numeric): st.info(text) +def _get_num_changes(df: pd.DataFrame, indicator_old: str, indicator_new: str) -> int: + num_changes = len(df[df[(indicator_old)] != df[indicator_new]].dropna(subset=[indicator_old, indicator_new])) + return num_changes + + def st_show_dataframe(df: pd.DataFrame, col_old: str, col_new: str) -> None: """Show dataframe accounting for cell limit and good sorting.""" df_show = df.copy() @@ -360,7 +454,7 @@ def st_show_dataframe(df: pd.DataFrame, col_old: str, col_new: str) -> None: def st_show_plot(df: pd.DataFrame, col_old: str, col_new: str, is_numeric: bool) -> None: - if is_numeric: + if not is_numeric: # TODO: Show as a sankey diagram where the flow from old to new categories is shown. # Reshape df_cat = df.melt(id_vars=COLUMNS_INDEX, value_vars=[col_old, col_new], var_name="indicator", value_name="value") diff --git a/apps/wizard/pages/meta_expert/__init__.py b/apps/wizard/pages/meta_expert/__init__.py deleted file mode 100644 index 9de94a8c8bb..00000000000 --- a/apps/wizard/pages/meta_expert/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""GPT-based chat with ETL knowledge.""" diff --git a/apps/wizard/pages/meta_expert/app.py b/apps/wizard/pages/meta_expert/app.py deleted file mode 100644 index b33d2770f04..00000000000 --- a/apps/wizard/pages/meta_expert/app.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Ask chat GPT questions about our metadata. 
- -references: -- https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps#build-a-chatgpt-like-app -""" -from typing import cast - -import streamlit as st -from st_pages import add_indentation - -from apps.wizard.pages.meta_expert.prompts import SYSTEM_PROMPT_FULL, SYSTEM_PROMPT_REDUCED -from apps.wizard.utils.gpt import OpenAIWrapper, get_cost_and_tokens -from etl.config import load_env - -# CONFIG -st.set_page_config(page_title="Wizard: Ask the Metadata Expert", page_icon="🪄") -add_indentation() -## Title/subtitle -st.title("Metadata 🧙 **:gray[Expert]**") -st.markdown("Ask the Expert any questions about the metadata!") -## Examples -EXAMPLE_QUERIES = [ - "What is the difference between `description_key` and `description_from_producer`? Be concise.", - "Is the following snapshot title correct? 'Cherry Blossom Full Blook Dates in Kyoto, Japan'", - "What is the difference between an Origin and Dataset?", -] -with st.expander("See examples"): - for example in EXAMPLE_QUERIES: - st.caption(example) -## Load variables -load_env() - - -@st.cache_data(show_spinner=True) -def ask_gpt(query, model): - response = api.query_gpt(query, model=model) - return response - - -# GPT CONFIG -MODEL_DEFAULT = "gpt-4-turbo-preview" -MODELS_AVAILABLE = { - "gpt-3.5-turbo-0125": "GPT-3.5 Turbo (gpt-3.5-turbo-0125)", - "gpt-4-turbo-preview": "GPT-4 Turbo (gpt-4-turbo-preview)", -} -MODELS_AVAILABLE_LIST = list(MODELS_AVAILABLE.keys()) - - -# Sidebar with GPT config -with st.sidebar: - st.markdown("## GPT Configuration") - model_name = st.selectbox( - label="Select GPT model", - options=MODELS_AVAILABLE_LIST, - format_func=lambda x: MODELS_AVAILABLE[x], - index=MODELS_AVAILABLE_LIST.index(MODEL_DEFAULT), - help="[Pricing](https://openai.com/pricing) | [Model list](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo)", - ) - ## See pricing list: https://openai.com/pricing (USD) - ## See model list: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo 
- - use_reduced_context = st.toggle( - "Reduced context window", - value=False, - help="If checked, only the last user message will be accounted (i.e less tokens and therefore cheaper).", - ) - use_reduced_docs = st.toggle( - "Reduced docs", - value=False, - help="If checked, a reduced ETL documentation is used in the GPT query (i.e. less tokens and therefore cheaper). Otherwise, the complete documentation is used (slightly more costly)", - ) - temperature = st.slider( - "Temperature", - min_value=0.0, - max_value=2.0, - value=0.15, - step=0.01, - help="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.", - ) - max_tokens = int( - st.number_input( - "Max tokens", - min_value=32, - max_value=2048, - value=512, - step=32, - help="The maximum number of tokens in the response.", - ) - ) - -if use_reduced_docs: - SYSTEM_PROMPT = SYSTEM_PROMPT_REDUCED -else: - SYSTEM_PROMPT = SYSTEM_PROMPT_FULL -api = OpenAIWrapper() - -# ACTUAL APP -# Initialize chat history -if "messages" not in st.session_state: - st.session_state.messages = [{"role": "system", "content": SYSTEM_PROMPT}] - -# Display chat messages from history on app rerun -for message in st.session_state.messages: - if message["role"] != "system": - with st.chat_message(message["role"]): - st.markdown(message["content"]) -# Reduce to only systme prompt -# st.session_state.messages = st.session_state.messages[-2:] - -# React to user input -if prompt := st.chat_input("Ask me!"): - # Display user message in chat message container - st.chat_message("user").markdown(prompt) - # Add user message to chat history - st.session_state.messages.append({"role": "user", "content": prompt}) - - # Build GPT query (only use the system prompt and latest user input) - if use_reduced_context: - messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}] - else: - messages = 
st.session_state.messages - - # Display assistant response in chat message container - with st.chat_message("assistant"): - # Ask GPT (stream) - stream = api.chat.completions.create( - model=cast(str, model_name), - messages=messages, # type: ignore - temperature=temperature, - max_tokens=max_tokens, - stream=True, - ) - response = cast(str, st.write_stream(stream)) - # st.markdown(response.message_content) - # st.info(f"Cost: {response.cost} USD. \nTokens: {response.usage.total_tokens}.") - # Add assistant response to chat history - - # Get cost & tokens - text_in = "\n".join([m["content"] for m in st.session_state.messages]) - cost, num_tokens = get_cost_and_tokens(text_in, response, cast(str, model_name)) - cost_msg = f"**Cost**: ≥{cost} USD.\n\n **Tokens**: ≥{num_tokens}." - st.info(cost_msg) - - # Add new response by the System - st.session_state.messages.append({"role": "assistant", "content": response}) diff --git a/apps/wizard/pages/meta_expert/docs_reduced/dataset.txt b/apps/wizard/pages/meta_expert/docs_reduced/dataset.txt deleted file mode 100644 index cbbf63fc868..00000000000 --- a/apps/wizard/pages/meta_expert/docs_reduced/dataset.txt +++ /dev/null @@ -1,73 +0,0 @@ -# `dataset` - -An ETL dataset comprises tables, described by `DatasetMeta` attributes. - -## `dataset.description` - -*Type*: `string` | Recommended - -Describes the dataset for internal use or data catalog users, typically a short paragraph detailing table contents. - -### Guidelines - -- Start with a capital letter and end with a period. -- Avoid mentioning other metadata fields unless crucial. -- Focus on describing the dataset succinctly. -- Override the automatic description for datasets with multiple tables. - - -## `dataset.licenses` - -*Type*: `array` - -Lists licenses from the dataset's processing history. - -### Guidelines - -- Licenses usually propagate automatically. Manually fill only if this fails. Future updates may remove this field as licenses will reside in `origins`. 
- - -## `dataset.non_redistributable` - -*Type*: `boolean` - -Indicates if the dataset is non-redistributable, restricting data download. - -## `dataset.sources` - -*Type*: `array` - -(DEPRECATED) Lists the dataset's indicator sources. - -## `dataset.title` - -*Type*: `string` | Required - -One-line dataset title for internal use or data catalog users. - -### Guidelines - -- Start with a capital letter without ending in a period. -- Clearly identify the dataset. -- Override the automatic title for datasets with multiple tables. - - -## `dataset.update_period_days` - -*Type*: `integer` | Required - -Defines the expected update frequency in days by OWID (e.g., `30`, `90`, `365`). Use `0` for datasets not planned to be updated. - -### Guidelines - -- Define during the garden step. -- Use integers to specify OWID's update frequency. - -### Examples - -| DO | DON'T | -| --- | ------ | -| `7` | `2023-01-07` | -| `30` | `monthly` | -| `90` | `0.2` | -| `365` | `1/365` | \ No newline at end of file diff --git a/apps/wizard/pages/meta_expert/docs_reduced/indicators.txt b/apps/wizard/pages/meta_expert/docs_reduced/indicators.txt deleted file mode 100644 index 980f06f9346..00000000000 --- a/apps/wizard/pages/meta_expert/docs_reduced/indicators.txt +++ /dev/null @@ -1,134 +0,0 @@ -# Simplified Documentation - -## `variable` - -An indicator, or variable, is a data series with metadata, defined by `VariableMeta`. - -### `variable.description_from_producer` - -- *Type*: `string` | Recommended -- Producer's description. -- **Guidelines**: - - Capitalize the first letter and end with a period. - - Keep producer's text with minor edits allowed. - - Include only if provided by the producer. - -### `variable.description_key` - -- *Type*: `array` | Recommended for curated indicators -- Key information list. -- **Guidelines**: - - Use short paragraphs, capitalized and ending with a period. - - Exclude `description_short`. - - Include essential information, excluding processing details. 
- -### `variable.description_processing` - -- *Type*: `string` | Required if applicable -- Processing details by OWID. -- **Guidelines**: - - Start with a capital letter and end with a period. - - Include only significant editorial decisions. - -### `variable.description_short` - -- *Type*: `string` | Required -- Brief description complementing the title. -- **Guidelines**: - - One short paragraph, capitalized, ending with a period. - - Exclude metadata fields except crucial units. - -### `variable.display` - -Less powerful than grapher config, retains display settings. - -#### Display Attributes - -- `color`: *Type*: `string` | Chart color. -- `conversionFactor`: *Type*: `number` | Avoid using; prefer ETL conversion. -- `description`: *Type*: `string` | Display description. -- `entityAnnotationsMap`: *Type*: `string` | Entity annotations. -- `includeInTable`: *Type*: `boolean` | Indicator in table sheet. -- `isProjection`: *Type*: `boolean` | Forward projection indicator. -- `name`: *Type*: `string` | Required | Chart legend title. - - **Guidelines**: - - Very short, no period. -- `numDecimalPlaces`: *Type*: `integer` | Decimal places in charts. -- `shortUnit`: *Type*: `string` | Short unit in charts. -- `tableDisplay`: Config for table tab. - - `hideAbsoluteChange`: *Type*: `boolean` | Hide absolute change. - - `hideRelativeChange`: *Type*: `boolean` | Hide relative change. -- `tolerance`: *Type*: `integer` | Tolerance in charts. -- `unit`: *Type*: `string` | Unit in charts. -- `yearIsDay`: *Type*: `boolean` | Year column as day. -- `zeroDay`: *Type*: `string` | Starting date if `yearIsDay` is true. - -### `variable.license` - -- *Type*: `string` | Required (future automation) -- Depends on processing level and origin licenses. -- **Guidelines**: - - `CC BY 4.0` for major processing. - - Strictest origin license for minor processing. - -### `variable.origins` - -- *Type*: `array` -- List of indicator origins. -- **Note**: Automatic propagation preferred. 
- -### `variable.presentation` - -Defines public metadata display. - -#### Presentation Attributes - -- `attribution`: *Type*: `string` | Optional | Custom citation. -- `attribution_short`: *Type*: `string` | Recommended for curated indicators | Short citation. -- `faqs`: *Type*: `array` | Recommended for curated indicators | FAQ references. -- `grapher_config`: OWID grapher configuration. -- `title_public`: *Type*: `string` | Optional | Public title. -- `title_variant`: *Type*: `string` | Optional | Title disambiguation. -- `topic_tags`: *Type*: `array` | Recommended for curated indicators | Relevant topics. - -### `variable.presentation_license` - -Overrides `license`. - -- `name`: *Type*: `string` -- `url`: *Type*: `string` - -### `variable.processing_level` - -- *Type*: `string` | Required (future automation) -- Indicates minor or major processing. -- **Guidelines**: - - `minor` for simple operations. - - `major` for complex operations. - -### `variable.short_unit` - -- *Type*: `string` | Required -- Measurement unit abbreviation. -- **Guidelines**: - - Follow SI capitalization, no period, no spaces. - - Use `display.short_unit` for chart simplifications. - -### `variable.sources` - -- *Type*: `array` -- List of sources, replaced by `origins`. - -### `variable.title` - -- *Type*: `string` | Required -- Indicator title. -- **Guidelines**: - - Capitalized, no period, short sentence. - -### `variable.unit` - -- *Type*: `string` | Required -- Measurement unit name. -- **Guidelines**: - - Lowercase, plural, metric units preferred, '%' for percentages. 
\ No newline at end of file diff --git a/apps/wizard/pages/meta_expert/docs_reduced/origin.txt b/apps/wizard/pages/meta_expert/docs_reduced/origin.txt deleted file mode 100644 index 45f03963d5b..00000000000 --- a/apps/wizard/pages/meta_expert/docs_reduced/origin.txt +++ /dev/null @@ -1,68 +0,0 @@ -# Simplified Documentation for `origin` - -## Overview - -The `origin` details the source of an indicator's data and metadata, referring to a specific snapshot from a data product. A snapshot is a selected portion of data from a broader dataset, paper, or database captured on a specific date. The data product's producer is usually an institution or authors. - -Snapshots can be the entire data product or a subset, depending on the data's nature and source. The `Origin` object attributes in ETL describe these details. - -## Fields - -### `origin.attribution` (Optional, String) -Citation for the data product, overriding the default `producer (year)` format when necessary. Follow these rules: -- Start with a capital letter, except for specific names (e.g., `van Haasteren`). -- Do not end with a period but conclude with the publication year in parentheses. -- Avoid semicolons and use only if the default format is insufficient, following the format `{producer} - {title} {version_producer} ({year})`. - -### `origin.attribution_short` (Recommended, String) -A concise version of `attribution` for limited spaces, omitting the year. It should: -- Begin with a capital letter, except for specific names. -- Not end with a period. -- Preferably be an acronym or a brief name of the producer or data product. - -### `origin.citation_full` (Required, String) -The complete citation as specified by the producer, including the publication year. It should: -- Start with a capital letter and end with a period. -- Adhere to the producer's guidelines, allowing minor edits for formatting or typos. 
- -### `origin.date_accessed` (Required, String) -The date the current version of the data was accessed, in `YYYY-MM-DD` format. - -### `origin.date_published` (Required, String) -The publication date of the current data version, in `YYYY-MM-DD` or `YYYY` if the day is unknown. - -### `origin.description` (Recommended, String) -A brief description of the data product, avoiding other metadata fields unless crucial. - -### `origin.description_snapshot` (Recommended, String) -Details the snapshot if it differs from the data product, without repeating the `description` content. - -### `origin.license` -Specifies the data product's license. Required fields include: -- `origin.license.name` (Required, String): The license name, following standard or producer-specific formats. -- `origin.license.url` (Required if existing, String): The URL to the license details on the producer's site. - -### `origin.producer` (Required, String) -The name of the data producer, following specific formatting rules for capitalization, author names, and acronyms. - -### `origin.title` (Required, String) -The data product's title, clearly identifying the content without ending in a period or including unrelated metadata. - -### `origin.title_snapshot` (Required if different, String) -The snapshot's title, distinct from the data product's title, following similar formatting rules. - -### `origin.url_download` (Required if existing, String) -A direct URL or S3 URI for downloading the data as a single file. - -### `origin.url_main` (Required, String) -The main website URL for the data product, providing detailed information. - -### `origin.version_producer` (Recommended, String/Number) -The version of the data product as specified by the producer. - -## Guidelines Summary -- Capitalization and punctuation are crucial across fields, with specific exceptions noted. -- Avoid periods at the end of fields unless specified. -- Use concise, clear formatting for citations and descriptions. 
-- Direct URLs are preferred for downloads and license details. -- Producer names and titles should be accurately represented, adhering to any specific requests or known formats. \ No newline at end of file diff --git a/apps/wizard/pages/meta_expert/docs_reduced/tables.txt b/apps/wizard/pages/meta_expert/docs_reduced/tables.txt deleted file mode 100644 index 7a388c2e653..00000000000 --- a/apps/wizard/pages/meta_expert/docs_reduced/tables.txt +++ /dev/null @@ -1,33 +0,0 @@ -# `table` - -A table groups indicators with a shared index. Its metadata attributes are defined by the `TableMeta` object in ETL. - -## `table.common` - -## `table.description` - -*type*: `string` | recommended (often automatic) - -Briefly describes the table for internal use or data catalog users. It should be a concise one or few paragraphs. - -### Guidelines - -- Begin with a capital letter and end with a period. -- Avoid mentioning other metadata fields (e.g., `producer`, `date_published`), except when essential for describing the data product. -- Use to provide a specific description if the automatic one (usually from the origin) is insufficient, such as for tables with multiple origins. - -## `table.title` - -*type*: `string` | required (often automatic) - -A short title for the table for internal or catalog use. - -### Guidelines - -- Start with a capital letter; do not end with a period. -- Clearly identify the table. -- Override the automatic title (usually from the origin) if necessary, like for tables with multiple origins. - -## `table.variables` - -Indicators, or 'variables', are data point collections (typically time series) with metadata, defined by the `VariableMeta` object in ETL. \ No newline at end of file diff --git a/apps/wizard/pages/meta_expert/prompts.py b/apps/wizard/pages/meta_expert/prompts.py deleted file mode 100644 index 77248ab1dbc..00000000000 --- a/apps/wizard/pages/meta_expert/prompts.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Prompts for GPT-interaction. 
- -Also: Generate reduced version of the documentation using chat GPT. - -We need to provide some documentation context to the GPT model, via the system prompt. However, the original documentation is too long, especially if using a simple model like 3.5. Therefore, we need to reduce the length of the documentation before passing it to the GPT model. This process is done using GPT-4. - -Run this as follows (from python shell): - -from apps.wizard.pages.meta_expert.generate_prompt import generate_documentation -generate_documentation() -""" -from structlog import get_logger - -from apps.wizard.utils import WIZARD_DIR -from apps.wizard.utils.gpt import GPTQuery, OpenAIWrapper, get_number_tokens -from etl.config import load_env -from etl.docs import render_dataset, render_indicator, render_origin, render_table - -# Logger -log = get_logger() - -# ENVIRONMENT CONFIG -load_env() - - -######################################### -# GENERATE REDUCED DOCUMENTATION -######################################### -# Path -DOCS_REDUCED_DIR = WIZARD_DIR / "pages" / "meta_expert" / "docs_reduced" - -# GPT CONFIG -# Model name -MODEL_NAME_REDUCED_DEFAULT = "gpt-4-turbo-preview" # "gpt-4" -# System prompt -SYSTEM_PROMPT = """ -- You are a technical expert. -- You are given the documentation of a certain data API and you are asked to make it shorter, more consise and better structured, while not losing any any information. 
-""" -# User prompt (template) -USER_PROMPT = """ -Reduce the token length of the following documentation, without losing any information: - -{docs_original} -""" - -# LOAD ORIGINAL DOCUMENTATION (markdown) -metadata_original = { - "dataset": render_dataset(), - "tables": render_table(), - "origin": render_origin(), - "indicators": render_indicator(), -} - - -def generate_documentation(model_name: str = MODEL_NAME_REDUCED_DEFAULT) -> None: - """Generate reduced version of the documentation using chat GPT.""" - # Initiate OpenAI - api = OpenAIWrapper() - # Generate docs for each category - for docs_name, docs in metadata_original.items(): - log.info(f"Generating reduced docs for {docs_name} with model {model_name}.") - messages = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": USER_PROMPT.format(docs_original=docs)}, - ] - - # Query ChatGPT - query = GPTQuery( - messages=messages, - temperature=0, - ) - - try: - response = api.query_gpt(query=query, model=model_name) - except Exception: - raise ValueError("Error in GPT query!") - else: - if response: - # Get response text - text = response.message_content - - tokens_before = get_number_tokens(docs, model_name) - tokens_after = get_number_tokens(text, model_name) - path = f"{DOCS_REDUCED_DIR}/{docs_name}.txt" - log.info( - f"Reducing '{docs_name}' documentation from {tokens_before} to {tokens_after} tokens. Cost: {response.cost} USD. Saving it to file {path}" - ) - with open(f"{path}.original", "w") as f: - f.write(docs) - with open(path, "w") as f: - f.write(text) - else: - raise ValueError("Error in GPT query!") - - -######################################### -# OTHER PROMPTS -######################################### -# Prompt using original documentation -SYSTEM_PROMPT_FULL = f""" -As an expert in OWID's metadata structure, you'll respond to inquiries about its structure, comprising four main entities: Origin, Dataset, Table, and Indicator (Variable). 
Datasets group together Tables, which are akin to pandas DataFrames but include extra metadata, and Tables feature Indicators as columns. Indicators may be linked to multiple Origins, identifying the data's sources. Detailed explanations of each entity follow, separated by '------'. - - -# Datasets: -{render_dataset()} - ------- -# Tables: -{render_table()} - ------- -# Indicators: -{render_indicator()} - ------- -# Origins: -{render_origin()} -""" - - -# Prompt using reduced documentation -def render_docs_reduced(entity_name): - with open(DOCS_REDUCED_DIR / f"{entity_name}.txt", "r") as f: - return f.read() - - -SYSTEM_PROMPT_REDUCED = f""" -You are a technical expert at Our World in Data. Your expertise is in the metadata structure of the ETL pipeline. You'll respond to inquiries about ETL's metadata structure. - -You should answer in the context of OWID's metadata structure, which is explained below. - - -There are four main entities: Origin, Dataset, Table, and Indicator (Variable). Datasets group together Tables, which are akin to pandas DataFrames but include extra metadata, and Tables feature Indicators as columns. Indicators may be linked to multiple Origins, identifying the data's sources. Detailed explanations of each entity follow, separated by '------'. - - -# Datasets: -{render_docs_reduced('dataset')} - ------- -# Tables: -{render_docs_reduced('tables')} - ------- -# Indicators: -{render_docs_reduced('indicators')} - ------- -# Origins: -{render_docs_reduced('origin')} - -""" diff --git a/apps/wizard/pages/owid_datasette_oracle.py b/apps/wizard/pages/owid_datasette_oracle.py deleted file mode 100644 index 288da256278..00000000000 --- a/apps/wizard/pages/owid_datasette_oracle.py +++ /dev/null @@ -1,223 +0,0 @@ -"""Ask chat GPT for help writing datasette queries. 
-""" -from typing import Any, Dict, cast - -import streamlit as st -from st_pages import add_indentation -from streamlit_feedback import streamlit_feedback -from structlog import get_logger - -from apps.wizard.pages.owid_datasette_oracle_prompt import OWID_DATASETTE_ORACLE_PROMPT -from apps.wizard.utils import set_states -from apps.wizard.utils.db import DB_IS_SET_UP, WizardDB -from apps.wizard.utils.gpt import OpenAIWrapper, get_cost_and_tokens -from etl.config import load_env - -# LOG -log = get_logger() - -# CONFIG -st.set_page_config(page_title="OWID Datasette Oracle", page_icon="🔮") -add_indentation() -## Title/subtitle -st.title("**OWID Datasette oracle** 🔮") -st.markdown("Get help writing SQL queries for Datasette!") - -## Load variables -load_env() - - -@st.cache_data(show_spinner=True) -def ask_gpt(query, model): - response = api.query_gpt(query, model=model) - return response - - -# GPT CONFIG -MODEL_DEFAULT = "gpt-4-turbo-preview" -MODELS_AVAILABLE = { - "gpt-3.5-turbo-0125": "GPT-3.5 Turbo (gpt-3.5-turbo-0125)", - "gpt-4-turbo-preview": "GPT-4 Turbo (gpt-4-turbo-preview)", -} -MODELS_AVAILABLE_LIST = list(MODELS_AVAILABLE.keys()) - - -# Handle feedback -def handle_feedback(feedback: Dict[str, Any]) -> None: - """Handle feedback.""" - print("handle feedback") - print(feedback) - # st.write(feedback) - # st.write(st.session_state.prompt) - # st.write(st.session_state.response) - WizardDB().add_usage( - question=st.session_state.messages[-2]["content"], - answer=st.session_state.response, - feedback=1 if feedback["score"] == "👍" else 0, - feedback_text=feedback.get("text", None), - cost=st.session_state.cost_last, - ) - - -# Switch category function -def get_system_prompt() -> str: - """Get appropriate system prompt.""" - return OWID_DATASETTE_ORACLE_PROMPT - - -# Reset chat history -def reset_messages() -> None: - """Reset messages to default.""" - set_states( - { - "messages": [{"role": "system", "content": get_system_prompt()}], - "response": None, - 
"prompt": None, - } - ) - - -## Examples -EXAMPLE_QUERIES = [ - "> Which are our top 10 articles by pageviews?", - "> How many charts do we have that use only a single indicator?", - "> Do we have datasets whose indicators are not used in any chart?", -] -with st.popover("See examples"): - for example in EXAMPLE_QUERIES: - st.markdown(example) - -# Sidebar with GPT config -st.session_state.analytics = st.session_state.get("analytics", True) -with st.sidebar: - st.button( - label="Clear chat", - on_click=reset_messages, - ) - st.divider() - st.toggle( - label="Collect data for analytics", - value=True, - on_change=lambda: set_states( - { - "analytics": not st.session_state.analytics, - } - ), - help="If enabled, we will collect usage data to improve the app. \n\nThis **is really helpful to improve** how we query chat GPT: E.g. which system prompt to use, optimise costs, and much more 😊. \n\nData collected: questions, responses and feedback submitted. \n\nYou can see how this data is collected [here](https://github.com/owid/etl/blob/master/apps/wizard/utils/db.py). \n\nRecords are anonymous.", - ) - st.divider() - st.markdown("## GPT Configuration") - model_name = st.selectbox( - label="Select GPT model", - options=MODELS_AVAILABLE_LIST, - format_func=lambda x: MODELS_AVAILABLE[x], - index=MODELS_AVAILABLE_LIST.index(MODEL_DEFAULT), - help="[Pricing](https://openai.com/pricing) | [Model list](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo)", - ) - ## See pricing list: https://openai.com/pricing (USD) - ## See model list: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo - - use_reduced_context = st.toggle( - "Reduced context window", - value=False, - help="If checked, only the last user message will be accounted (i.e less tokens and therefore cheaper).", - ) - temperature = st.slider( - "Temperature", - min_value=0.0, - max_value=2.0, - value=0.15, - step=0.01, - help="What sampling temperature to use, between 0 and 2. 
Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.", - ) - max_tokens = int( - st.number_input( - "Max tokens", - min_value=32, - max_value=4096, - value=4096, - step=32, - help="The maximum number of tokens in the response.", - ) - ) - -# API with OPENAI -api = OpenAIWrapper() - -# ACTUAL APP -# Initialize chat history -if "messages" not in st.session_state: - reset_messages() - -# DEGUG -# st.write([m for m in st.session_state.messages if m["role"] != "system"]) - -# Display chat messages from history on app rerun -for message in st.session_state.messages: - if message["role"] != "system": - with st.chat_message(message["role"]): - st.markdown(message["content"]) - -# Initialise session state -st.session_state.response = st.session_state.get("response", None) -st.session_state.prompt = st.session_state.get("prompt", None) -st.session_state.feedback_key = st.session_state.get("feedback_key", 0) -st.session_state.cost_last = st.session_state.get("cost_last", 0) - -# React to user input -if prompt := st.chat_input("Ask me!"): - st.session_state.feedback_key += 1 - print("asking GPT...") - # Display user message in chat message container - with st.chat_message("user"): - st.markdown(prompt) - - # Add user message to chat history - st.session_state.messages.append({"role": "user", "content": prompt}) - - # Build GPT query (only use the system prompt and latest user input) - if use_reduced_context: - messages = [{"role": "system", "content": get_system_prompt()}, {"role": "user", "content": prompt}] - else: - messages = st.session_state.messages - - # Display assistant response in chat message container - with st.chat_message("assistant"): - # Ask GPT (stream) - stream = api.chat.completions.create( - model=cast(str, model_name), - messages=messages, # type: ignore - temperature=temperature, - max_tokens=max_tokens, - stream=True, - ) - st.session_state.response = cast(str, 
st.write_stream(stream)) - - # Add new response by the System - st.session_state.messages.append({"role": "assistant", "content": st.session_state.response}) - - # Add prompt to session state - st.session_state.prompt = prompt - - print("finished asking GPT...") - -if st.session_state.response: - # Get cost & tokens - text_in = "\n".join([m["content"] for m in st.session_state.messages]) - cost, num_tokens = get_cost_and_tokens(text_in, st.session_state.response, cast(str, model_name)) - cost_msg = f"**Cost**: ≥{cost} USD.\n\n **Tokens**: ≥{num_tokens}." - st.session_state.cost_last = cost - - if DB_IS_SET_UP and st.session_state.analytics: - # Get feedback only if DB is properly setup - feedback = streamlit_feedback( - feedback_type="thumbs", - optional_text_label="[Optional] Please provide an explanation", - key=f"feedback_{st.session_state.feedback_key}", - on_submit=handle_feedback, - ) - # Show cost below feedback - st.info(cost_msg) - -# DEBUG -# st.write([m for m in st.session_state.messages if m["role"] != "system"]) diff --git a/apps/wizard/pages/owid_datasette_oracle_prompt.py b/apps/wizard/pages/owid_datasette_oracle_prompt.py deleted file mode 100644 index 15eeeeeac43..00000000000 --- a/apps/wizard/pages/owid_datasette_oracle_prompt.py +++ /dev/null @@ -1,486 +0,0 @@ -OWID_DATASETTE_ORACLE_PROMPT = """ -## OWID datasette Oracle V2 - -OWID Datasette Oracle is designed to effectively utilize the provided database schema, making intelligent use of foreign key constraints to deduce relationships from natural language inquiries. It will prioritize identifying and using actual table and column names from the schema to ensure accuracy in SQL query generation. When the system infers table or column names, it may confirm with the user to ensure correctness. The SQL dialect used is SQLite. - -The schema is provided in yaml below. The top level array represents the tables, with a "name" field and an optional "description" field. 
The columns are listed under the "columns" key. If a column has a foreign key constraint onto another table, this is specified with the fields "fkTargetTable" and "fkTargetColumn". - -```yaml -- name: algolia_searches_by_week - columns: - - name: week_start_date - - name: index - - name: query - - name: total_searches - - name: total_hits -- name: analytics_pageviews - description: | - contains information on pageviews which can be very useful to order results by (e.g. to show - posts with the most pageviews first). The `url` of this table contains full urls - to match - it up with the `slug` column on `posts` or `posts_gdocs` or `charts` table you have to turn - those into full urls. `posts` and `posts_gdocs` slug just needs to be prefixed with - `https://ourworldindata.org/`, for charts it is `https://ourworldindata.org/grapher/`, - for explorers it is `https://ourworldindata.org/explorers/` - columns: - - name: day - - name: url - - name: views_7d - - name: views_14d - - name: views_365d - - name: url_domain - - name: url_path - - name: url_query - - name: url_fragment -- name: chart_dimensions - description: this table enumerates the variables (aka indicators) that are used in a chart - columns: - - name: id - - name: order - - name: property - - name: chartId - fkTargetTable: charts - fkTargetColumn: id - - name: variableId - fkTargetTable: variables - fkTargetColumn: id - - name: createdAt - - name: updatedAt -- name: chart_slug_redirects - descriptioN: this table contains alternative slugs pointing to charts - columns: - - name: id - - name: slug - - name: chart_id - fkTargetTable: charts - fkTargetColumn: id - - name: createdAt - - name: updatedAt -- name: chart_tags - columns: - - name: chartId - fkTargetTable: charts - fkTargetColumn: id - - name: tagId - fkTargetTable: tags - fkTargetColumn: id - - name: keyChartLevel - - name: createdAt - - name: updatedAt - - name: isApproved -- name: chart_variables - columns: - - name: chartId - fkTargetTable: charts 
- fkTargetColumn: id - - name: variableId - fkTargetTable: variables - fkTargetColumn: id -- name: charts - description: | - contains the configuration for our data visualization. The `config` column contains a json - configuration for the chart. Important fields inside this json are hasMapTab, hasChartTab, - title, subtitle, slug and type (one of LineChart ScatterPlot StackedArea DiscreteBar - StackedDiscreteBar SlopeChart StackedBar Marimekko or missing in which case LineChart is the default) - columns: - - name: id - - name: slug - - name: type - - name: config - - name: createdAt - - name: updatedAt - - name: lastEditedAt - - name: publishedAt - - name: lastEditedByUserId - fkTargetTable: users - fkTargetColumn: id - - name: publishedByUserId - fkTargetTable: users - fkTargetColumn: id - - name: is_indexable - - name: title - - name: subtitle - - name: note - - name: title_plus_variant - - name: configWithDefaults -- name: dataset_tags - columns: - - name: datasetId - fkTargetTable: datasets - fkTargetColumn: id - - name: tagId - fkTargetTable: tags - fkTargetColumn: id - - name: createdAt - - name: updatedAt -- name: datasets - description: a collection of varaibles - columns: - - name: id - - name: name - - name: description - - name: createdAt - - name: updatedAt - - name: namespace - - name: isPrivate - - name: createdByUserId - fkTargetTable: users - fkTargetColumn: id - - name: metadataEditedAt - - name: metadataEditedByUserId - fkTargetTable: users - fkTargetColumn: id - - name: dataEditedAt - - name: dataEditedByUserId - fkTargetTable: users - fkTargetColumn: id - - name: nonRedistributable - - name: isArchived - - name: sourceChecksum - - name: shortName - - name: version - - name: updatePeriodDays -- name: entities - columns: - - name: id - - name: code - - name: name - - name: validated - - name: createdAt - - name: updatedAt - - name: displayName -- name: explorer_charts - columns: - - name: id - - name: explorerSlug - fkTargetTable: explorers - 
fkTargetColumn: slug - - name: chartId - fkTargetTable: charts - fkTargetColumn: id -- name: explorer_tags - columns: - - name: id - - name: explorerSlug - - name: tagId - fkTargetTable: tags - fkTargetColumn: id -- name: explorer_variables - columns: - - name: id - - name: explorerSlug - fkTargetTable: explorers - fkTargetColumn: slug - - name: variableId - fkTargetTable: variables - fkTargetColumn: id -- name: explorers - description: | - contains our explorers, which are more complex data visualisations. They can include charts but can also be configured differently. If they are are using charts then the link is established in the `explorer_charts` table. Linking this to `variables` can be done as well but if doing so, alert the user to the fact that there are a lot of connections between these entities that are not tracked in the database. - columns: - - name: slug - - name: isPublished - - name: config - - name: createdAt - - name: updatedAt -- name: images - columns: - - name: id - - name: googleId - - name: filename - - name: defaultAlt - - name: originalWidth - - name: updatedAt - - name: originalHeight -- name: namespaces - columns: - - name: id - - name: name - - name: description - - name: isArchived - - name: createdAt - - name: updatedAt -- name: origins - columns: - - name: id - - name: titleSnapshot - - name: title - - name: descriptionSnapshot - - name: description - - name: producer - - name: citationFull - - name: attribution - - name: attributionShort - - name: versionProducer - - name: urlMain - - name: urlDownload - - name: dateAccessed - - name: datePublished - - name: license -- name: origins_variables - columns: - - name: originId - fkTargetTable: origins - fkTargetColumn: id - - name: variableId - fkTargetTable: variables - fkTargetColumn: id - - name: displayOrder -- name: post_broken_chart_links - columns: - - name: id - - name: postId - fkTargetTable: posts - fkTargetColumn: id - - name: chartSlug - - name: kind -- name: post_charts - 
columns: - - name: id - - name: postId - fkTargetTable: posts - fkTargetColumn: id - - name: chartId - fkTargetTable: charts - fkTargetColumn: id - - name: kind - - name: through_redirect -- name: post_links - columns: - - name: id - - name: postId - fkTargetTable: posts - fkTargetColumn: id - - name: link - - name: kind -- name: post_tags - columns: - - name: post_id - fkTargetTable: posts - fkTargetColumn: id - - name: tag_id - fkTargetTable: tags - fkTargetColumn: id - - name: createdAt - - name: updatedAt -- name: posts - description: | - The table for our old posts that were written in wordpress. It contains the html content of the post in the `content` column - and a markdown version of the content in the markdown `column`. - columns: - - name: id - - name: title - - name: slug - - name: type - - name: status - - name: content - - name: archieml - - name: archieml_update_statistics - - name: published_at - - name: updated_at - - name: gdocSuccessorId - - name: authors - - name: excerpt - - name: created_at_in_wordpress - - name: updated_at_in_wordpress - - name: featured_image - - name: formattingOptions - - name: markdown - - name: wpApiSnapshot -- name: posts_gdocs - description: | - The table for our new posts written in Google Docs. It contains content in form of json in the `content` column and a - markdown version of the content in the markdown `column`. 
- columns: - - name: id - - name: slug - - name: type - - name: content - - name: published - - name: createdAt - - name: publishedAt - - name: updatedAt - - name: publicationContext - - name: revisionId - - name: breadcrumbs - - name: markdown - - name: title -- name: posts_gdocs_links - columns: - - name: id - - name: sourceId - fkTargetTable: posts_gdocs - fkTargetColumn: id - - name: target - - name: linkType - - name: componentType - - name: text - - name: queryString - - name: hash -- name: posts_gdocs_variables_faqs - columns: - - name: gdocId - fkTargetTable: posts_gdocs - fkTargetColumn: id - - name: variableId - fkTargetTable: variables - fkTargetColumn: id - - name: fragmentId - - name: displayOrder -- name: posts_gdocs_x_images - columns: - - name: id - - name: gdocId - fkTargetTable: posts_gdocs - fkTargetColumn: id - - name: imageId - fkTargetTable: images - fkTargetColumn: id -- name: posts_gdocs_x_tags - columns: - - name: gdocId - fkTargetTable: posts_gdocs - fkTargetColumn: id - - name: tagId - fkTargetTable: tags - fkTargetColumn: id -- name: posts_links - columns: - - name: id - - name: sourceId - fkTargetTable: posts - fkTargetColumn: id - - name: target - - name: linkType - - name: componentType - - name: text - - name: queryString - - name: hash -- name: posts_unified - description: | - this table combines posts and posts_gdocs. To get the content you need to join it with - posts and posts_gdocs but this is the best place to query e.g. all titles. Type is one of: article homepage topic-page linear-topic-page data-insight author about-page. We sometimes call topic-page pages "Modular topic pages". 
- columns: - - name: id - - name: slug - - name: title - - name: type - - name: publishedAt - - name: updatedAt - - name: authors - - name: createdAt - - name: publicationContext - - name: gdocId - fkTargetTable: posts_gdocs - fkTargetColumn: id - - name: wordpressId - fkTargetTable: posts - fkTargetColumn: id -- name: redirects - columns: - - name: id - - name: source - - name: target - - name: code - - name: createdAt - - name: updatedAt -- name: sources - columns: - - name: id - - name: name - - name: description - - name: createdAt - - name: updatedAt - - name: datasetId - fkTargetTable: datasets - fkTargetColumn: id - - name: additionalInfo - - name: link - - name: dataPublishedBy -- name: sqlite_sequence - columns: - - name: name - - name: seq -- name: tags - columns: - - name: id - - name: name - - name: createdAt - - name: updatedAt - - name: parentId - fkTargetTable: tags - fkTargetColumn: id - - name: specialType - - name: slug -- name: tags_variables_topic_tags - columns: - - name: tagId - fkTargetTable: tags - fkTargetColumn: id - - name: variableId - fkTargetTable: variables - fkTargetColumn: id - - name: displayOrder -- name: users - columns: - - name: id - - name: password - - name: lastLogin - - name: isSuperuser - - name: email - - name: createdAt - - name: updatedAt - - name: isActive - - name: fullName - - name: lastSeen -- name: variables - columns: - - name: id - - name: name - - name: unit - - name: description - - name: createdAt - - name: updatedAt - - name: code - - name: coverage - - name: timespan - - name: datasetId - fkTargetTable: datasets - fkTargetColumn: id - - name: sourceId - fkTargetTable: sources - fkTargetColumn: id - - name: shortUnit - - name: display - - name: columnOrder - - name: originalMetadata - - name: grapherConfigAdmin - - name: shortName - - name: catalogPath - - name: dimensions - - name: schemaVersion - - name: processingLevel - - name: processingLog - - name: titlePublic - - name: titleVariant - - name: 
attributionShort - - name: attribution - - name: descriptionShort - - name: descriptionFromProducer - - name: descriptionKey - - name: descriptionProcessing - - name: licenses - - name: license - - name: grapherConfigETL - - name: type - - name: sort - -``` - -The content of the database is all the information for the Our World In Data website, a publication with writing and data visualization about the world's biggest problems. - -For questions about posts, articles, topic pages and so on, posts_unified is usually the best starting point and you should prefer querying that table over posts or posts_gdocs unless there is a compelling reason. For questions about grapher charts it is charts. For question about indicators or variables it is variables. - -Your job is to create a SQL query for the user that answers their question given the schema above. You may ask the user for clarification, e.g. if it is unclear if unpublished items should be included (when applicable) or if there is ambiguity in which tables to use to answer a question. - -Upon generating a query, OWID Datasette Oracle will always provide the SQL query both as text and as a clickable Datasette link, formatted for the user's convenience. The datasette URL is http://datasette-private and the database name is owid. An example query to get all rows from the algolia_searches_by_week table is this one that demonstrates the escaping: `http://datasette-private/owid?sql=select+*+from+algolia_searches_by_week` Remember, you cannot actually run the SQL query, you are just to output the query as text and a datasette link that will run that query! 
-""" diff --git a/apps/wizard/utils/__init__.py b/apps/wizard/utils/__init__.py index f47e6221634..b1d3589d785 100644 --- a/apps/wizard/utils/__init__.py +++ b/apps/wizard/utils/__init__.py @@ -557,21 +557,18 @@ def set_states(states_values: Dict[str, Any], logging: bool = False, only_if_not def st_page_link(alias: str, border: bool = False, **kwargs) -> None: """Link to page.""" + if "page" not in kwargs: + kwargs["page"] = PAGES_BY_ALIAS[alias]["entrypoint"] + if "label" not in kwargs: + kwargs["label"] = PAGES_BY_ALIAS[alias]["title"] + if "icon" not in kwargs: + kwargs["icon"] = PAGES_BY_ALIAS[alias]["emoji"] + if border: with st.container(border=True): - st.page_link( - page=PAGES_BY_ALIAS[alias]["entrypoint"], - label=PAGES_BY_ALIAS[alias]["title"], - icon=PAGES_BY_ALIAS[alias]["emoji"], - **kwargs, - ) + st.page_link(**kwargs) else: - st.page_link( - page=PAGES_BY_ALIAS[alias]["entrypoint"], - label=PAGES_BY_ALIAS[alias]["title"], - icon=PAGES_BY_ALIAS[alias]["emoji"], - **kwargs, - ) + st.page_link(**kwargs) st.cache_data diff --git a/apps/wizard/utils/gpt.py b/apps/wizard/utils/gpt.py index 682b3a8559e..b787c3cf2a1 100644 --- a/apps/wizard/utils/gpt.py +++ b/apps/wizard/utils/gpt.py @@ -2,7 +2,6 @@ Auxiliary classes, functions and variables. 
""" from dataclasses import asdict, dataclass -from datetime import date from typing import Any, Dict, List, Optional, Tuple import structlog @@ -19,34 +18,47 @@ MODEL_DEFAULT = "gpt-3.5-turbo" # PRICING (per 1,000 tokens) -## See pricing list: https://openai.com/pricing (USD) +## See pricing list: https://openai.com/api/pricing/ (USD) ## See model list: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo RATE_DEFAULT_IN = 0.005 MODEL_EQUIVALENCES = { # "gpt-3.5-turbo": "gpt-3.5-turbo-0125", - "gpt-3.5-turbo": "gpt-3.5-turbo-0613" if date.today() >= date(2024, 2, 16) else "gpt-3.5-turbo-0125", - "gpt-4-turbo-preview": "gpt-4-0125-preview", + "gpt-3.5-turbo": "gpt-3.5-turbo-0125", + "gpt-4-turbo-preview": "gpt-4-turbo-2024-04-09", + "gpt-4o": "gpt-4o-2024-05-13", } MODEL_RATES_1000_TOKEN = { + # GPT 3.5 "gpt-3.5-turbo-0613": { - "in": 0.0015, - "out": 0.0020, + "in": 1.5 / 1000, + "out": 2 / 1000, }, "gpt-3.5-turbo-0125": { - "in": 0.0005, - "out": 0.0015, + "in": 0.5 / 1000, + "out": 1.5 / 1000, }, + # GPT 4 "gpt-4-0125-preview": { - "in": 0.01, - "out": 0.03, + "in": 10 / 1000, + "out": 30 / 1000, }, "gpt-4": { - "in": 0.03, - "out": 0.06, + "in": 30 / 1000, + "out": 60 / 1000, }, "gpt-4-32k": { - "in": 0.06, - "out": 0.12, + "in": 60 / 1000, + "out": 120 / 1000, + }, + # GPT 4 Turbo + "gpt-4-turbo-2024-04-09": { + "in": 10 / 1000, + "out": 30 / 1000, + }, + # GPT 4o + "gpt-4o-2024-05-13": { + "in": 5 / 1000, + "out": 15 / 1000, }, } MODEL_RATES_1000_TOKEN = { diff --git a/etl/grapher_model.py b/etl/grapher_model.py index 8fc5c1ddbb7..3db12ca44bb 100644 --- a/etl/grapher_model.py +++ b/etl/grapher_model.py @@ -11,6 +11,7 @@ import json from datetime import date, datetime +from enum import Enum from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union, get_args @@ -43,7 +44,7 @@ VARCHAR, ) from sqlalchemy.exc import NoResultFound -from sqlalchemy.orm import DeclarativeBase, Mapped, MappedAsDataclass, Session, mapped_column 
+from sqlalchemy.orm import DeclarativeBase, Mapped, MappedAsDataclass, Session, mapped_column # type: ignore from sqlalchemy.sql import Select from typing_extensions import Self, TypedDict @@ -1351,8 +1352,17 @@ def upsert(self, session: Session) -> "Origin": return origin -# TODO: should we also add "rejected" status and exclude such charts from chart-sync? -CHART_DIFF_STATUS = Literal["approved", "unapproved"] +class ChartStatus(Enum): + APPROVED = "approved" + PENDING = "pending" + REJECTED = "rejected" + + +CHART_DIFF_STATUS = Literal[ + ChartStatus.APPROVED, + ChartStatus.PENDING, + ChartStatus.REJECTED, +] class ChartDiffApprovals(Base): @@ -1372,10 +1382,8 @@ class ChartDiffApprovals(Base): updatedAt: Mapped[datetime] = mapped_column(DateTime, default=func.utc_timestamp()) @classmethod - def latest_chart_status( - cls, session: Session, chart_id: int, source_updated_at, target_updated_at - ) -> CHART_DIFF_STATUS: - """Load the latest approval of the chart. If there's none, return 'unapproved'.""" + def latest_chart_status(cls, session: Session, chart_id: int, source_updated_at, target_updated_at) -> str: + """Load the latest approval of the chart. If there's none, return ChartStatus.PENDING.""" result = session.scalars( select(cls) .where( @@ -1387,9 +1395,9 @@ def latest_chart_status( .limit(1) ).first() if result: - return result.status + return result.status # type: ignore else: - return "unapproved" + return ChartStatus.PENDING.value def _json_is(json_field: Any, key: str, val: Any) -> Any: