diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml b/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml
index 5064cc95848..80c64e139c1 100644
--- a/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml
+++ b/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml
@@ -13,12 +13,28 @@ definitions:
       <%- elif birth_order == '5p' %>
       fifth (or greater)
       <% endif %>
+    bo_1_m1: |-
+      <% if birth_order == '2' %>
+      second
+      <%- elif birth_order == '3' %>
+      third
+      <%- elif birth_order == '4' %>
+      fourth
+      <%- elif birth_order == '5p' %>
+      fifth (or greater)
+      <% endif %>
     title: |-
       <% if birth_order == 'total' %>
       << title >> - Total
       <%- else %>
       << title >> - Birth order: << birth_order >>
       <%- endif %>
+    title_age: |-
+      <% if birth_order == 'total' %>
+      << title >> - Mother age: << age >> - Birth order: Total
+      <%- else %>
+      << title >> - Mother age: << age >> - Birth order: << birth_order >>
+      <%- endif %>
 common:
   presentation:
     topic_tags:
@@ -284,3 +300,49 @@ tables:
         description_key:
           - Reflects variability in the timing of births up to age 40 within a cohort.
           - Helps to understand how concentrated or spread out early childbearing is within the cohort.
+
+      ppr:
+        title: |-
+          Cohort parity progression ratios - << birth_order | int >> to << (birth_order | int) + 1 >> birth
+        description_short: |-
+          <% if birth_order == '1' %>
+          Probability of giving birth to a first child.
+          <%- elif birth_order == '2' %>
+          Probability of giving birth to a second child, conditioned on having had a first child.
+          <%- elif birth_order == '3' %>
+          Probability of giving birth to a third child, conditioned on having had a second child.
+          <%- elif birth_order == '4' %>
+          Probability of giving birth to a fourth child, conditioned on having had a third child.
+          <% endif %>
+        unit: ""
+        description_key:
+          - Measures the likelihood that a woman with a given number of children will go on to have another child.
+          - It is useful for understanding family-building dynamics and changes in reproductive behavior over time.
+
+  period_ages:
+    variables:
+      asfr:
+        title: |-
+          <% set title = "Period fertility rates" %>
+          {definitions.others.title_age}
+        description_short: |-
+          Age-specific fertility rates for each calendar year, measured in completed years of age.
+        unit: "births per woman"
+        description_key:
+          - Represents fertility rates for each age group in a specific year.
+          - Useful for detailed analysis of fertility patterns by both age and year.
+          - Presented in the form of Lexis squares, which provide a snapshot of fertility behavior over time.
+
+  cohort_ages:
+    variables:
+      asfr:
+        title: |-
+          <% set title = "Cohort fertility rates" %>
+          {definitions.others.title_age}
+        unit: "births per woman"
+        description_short: |-
+          Age-specific fertility rates for women in a specific birth cohort, measured by their age in completed years.
+        description_key:
+          - Represents fertility rates for a specific cohort as they age.
+          - Useful for understanding how fertility behavior changes across different cohorts over time.
+          - Presented in the form of horizontal parallelograms, allowing for the tracking of cohort-specific fertility patterns.
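Note on the templated definitions above: `title_age` is expanded per dimension value using the Jinja-style syntax of these meta files (`<% ... %>` blocks, `<< ... >>` variables). A minimal sketch of how such a definition renders, using plain jinja2 configured with those delimiters — the ETL's own rendering setup, including how `{definitions.others.title_age}` is spliced into each variable's title, may differ:

```python
from jinja2 import Environment

# Jinja environment using the delimiters seen in the meta file; anything beyond the
# delimiters is an assumption, not the ETL's actual configuration.
env = Environment(
    block_start_string="<%",
    block_end_string="%>",
    variable_start_string="<<",
    variable_end_string=">>",
)

TITLE_AGE = (
    "<% if birth_order == 'total' %>"
    "<< title >> - Mother age: << age >> - Birth order: Total"
    "<%- else %>"
    "<< title >> - Mother age: << age >> - Birth order: << birth_order >>"
    "<%- endif %>"
)

for bo in ["total", "2"]:
    print(env.from_string(TITLE_AGE).render(title="Period fertility rates", age="30", birth_order=bo))
# Period fertility rates - Mother age: 30 - Birth order: Total
# Period fertility rates - Mother age: 30 - Birth order: 2
```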
diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.py b/etl/steps/data/garden/hmd/2024-11-19/hfd.py
index 71372a78d88..9cea04c68c9 100644
--- a/etl/steps/data/garden/hmd/2024-11-19/hfd.py
+++ b/etl/steps/data/garden/hmd/2024-11-19/hfd.py
@@ -410,21 +410,234 @@
     },
     # "pmabc",
 }
+REGEX_PERIOD_BO = {}
+
 # Tables to process for COHORT country-cohort
 TABLES_COHORT = [
     "mabvh",
-    # "pprvhbo",
+    "pprvhbo",
     "sdmabvh",
     "tfrvh",
 ]
 TABLES_COHORT_W_PARITY = {
-    # "pprvhbo": {
-    #     "indicators": ["patfr"],
-    # },
+    "pprvhbo": {
+        "indicators": ["ppr"],
+    },
+}
+REGEX_COHORT_BO = {
+    "pprvhbo": {
+        "ppr": r"^ppr\d+_\d+$",
+    },
 }
+# Tables to process for PERIOD country-year-age
+TABLES_PERIOD_AGE = [
+    "asfrrr",
+]
+TABLES_PERIOD_AGE_W_PARITY = {}
+REGEX_PERIOD_AGE_BO = {}
+# Tables to process for COHORT country-year-age
+TABLES_COHORT_AGE = [
+    "asfrvh",
+]
+TABLES_COHORT_AGE_W_PARITY = {}
+REGEX_COHORT_AGE_BO = {}
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("hfd")
+
+    # 1/ Read period tables + consolidate in one table tb_period
+    ## Initial definitions
+    cols_index = ["country", "year"]
+    col_bo = "birth_order"
+    ## Read tables
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_PERIOD,
+        tables_w_parity=TABLES_PERIOD_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_PERIOD_BO,
+    )
+    ## Merge
+    tb_period = consolidate_table_from_list(tbs, cols_index + [col_bo], "period")
+
+    # 2/ Read cohort tables + consolidate in one table tb_cohort
+    ## Initial definitions
+    cols_index = ["country", "cohort"]
+    col_bo = "birth_order"
+    ## Read tables
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_COHORT,
+        tables_w_parity=TABLES_COHORT_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_COHORT_BO,
+    )
+    # Quick fix: change birth_order label for PPR
+    tbs = _fix_ppr(tbs)
+    ## Merge
+    tb_cohort = consolidate_table_from_list(tbs, cols_index + [col_bo], "cohort")
+
+    # 3/ Period tables (by age)
+    cols_index = ["country", "year", "age"]
+    col_bo = "birth_order"
+    ## Read tables
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_PERIOD_AGE,
+        tables_w_parity=TABLES_PERIOD_AGE_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_PERIOD_AGE_BO,
+    )
+    ## Consolidate
+    tb_period_ages = consolidate_table_from_list(
+        tbs=tbs,
+        cols_index_out=cols_index + [col_bo],
+        short_name="period_ages",
+        fcn=keep_relevant_ages,
+    )
+
+    # 4/ Cohort tables (by age)
+    cols_index = ["country", "cohort", "age"]
+    col_bo = "birth_order"
+    ## Read tables
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_COHORT_AGE,
+        tables_w_parity=TABLES_COHORT_AGE_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_COHORT_AGE_BO,
+        check_integration=False,
+        check_integration_limit=143,
+    )
+    ## Consolidate
+    tb_cohort_ages = consolidate_table_from_list(
+        tbs=tbs,
+        cols_index_out=cols_index + [col_bo],
+        short_name="cohort_ages",
+        fcn=keep_relevant_ages,
+    )
+
+    #
+    # Process data.
+    #
+    tables = [
+        tb_period,
+        tb_cohort,
+        tb_period_ages,
+        tb_cohort_ages,
+    ]
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=tables,
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def make_table_list(
+    ds_meadow,
+    table_names,
+    tables_w_parity,
+    cols_index,
+    col_bo,
+    regex_bo=None,
+    check_integration=True,
+    check_integration_limit=None,
+):
+    """Reads relevant tables, and formats them accordingly.
+
+    Tables come in wide format, sometimes as two tables (main and birth order). This function consolidates them into single tables per topic.
+
+    For instance, we have one table with total fertility rates (columns `tfr`). And then another one with fertilities broken down by birth order (columns `tfr`, `tfr1`, etc.)
+    Instead, we want a table in long format, which has one column `tfr` and adds the birth order as a dimension of the table.
+    """
+    if regex_bo is None:
+        regex_bo = {}
+
+    tbs = []
+    for tname in table_names:
+        # Get custom regex for this table
+        regex = regex_bo.get(tname)
+
+        # Read main table
+        tb = read_table(ds_meadow, tname)
+
+        # Check if there is a birth order table for this indicator(s). If so, process it and integrate it into the main table
+        tname_bo = tname + "bo"
+        if tname_bo in ds_meadow.table_names:
+            # Read BO table
+            tb_bo = read_table(ds_meadow, tname_bo, tname)
+            # Get list of core indicators: These are the names of the columns that are actual indicators (and not dimensional indicators, e.g. `tfr1`, `tfr2`, etc.)
+            core_indicators = [col for col in tb.columns.intersection(tb_bo.columns) if col not in cols_index]
+            # Add BO to main table
+            tb = integrate_bo(
+                tb=tb,
+                tb_bo=tb_bo,
+                cols_index=cols_index,
+                core_indicators=core_indicators,
+                check=check_integration,
+                check_limit_wrong=check_integration_limit,
+            )
+            # Consolidate table: Use long format, and add birth_order as a dimension of the main table.
+            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex)
+        # Sometimes, the main table already contains indicators broken down by birth order. In such cases, we also need to reshape the table.
+        elif tname in tables_w_parity:
+            core_indicators = tables_w_parity[tname]["indicators"]
+            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex)
+
+        # Add formatted table to the list of tables.
+        tbs.append(tb)
+
+    return tbs
+
+
+def read_table(ds_meadow, tname, tname_base=None):
+    """Read table from dataset and minor cleaning:
+
+    - Rename columns if applicable
+    - Harmonize country names
+    """
+    # Read table
+    tb = ds_meadow.read(tname)
+
+    # Rename columns
+    if tname_base is None:
+        tname_base = tname
+    if tname_base in COLUMNS_RENAME:
+        tb = tb.rename(columns=COLUMNS_RENAME[tname_base])
+    # Harmonize country names
+    tb = geo.harmonize_countries(
+        df=tb,
+        countries_file=paths.country_mapping_path,
+        country_col="code",
+        warn_on_unused_countries=False,
+    )
-def integrate_bo(tb, tb_bo, col_index, core_indicators):
+    # Rename country column
+    tb = tb.rename(columns={"code": "country"})
+
+    return tb
+
+
+def integrate_bo(tb, tb_bo, cols_index, core_indicators, check=True, check_limit_wrong=None):
     """Merge main table with its BO counterpart.
 
     Some tables have a secondary table which provides the same core indicators but by birth order.
@@ -432,7 +645,7 @@ def integrate_bo(tb, tb_bo, col_index, core_indicators):
     # Outer join
     tb = tb.merge(
         tb_bo,
-        on=col_index,
+        on=cols_index,
         suffixes=["", "__bo"],
         how="outer",
     )
@@ -442,10 +655,15 @@
         TOLERANCE = 5e-3
-        assert (
-            (((tb[col] - tb[f"{col}__bo"]) / tb[col]).dropna().abs() < TOLERANCE).all()
-        ).all(), f"Integration failed for {col}. Core indicator is not equivalent between main and `bo` tables."
-
+        if check:
+            assert (
+                (((tb[col] - tb[f"{col}__bo"]) / tb[col]).dropna().abs() < TOLERANCE).all()
+            ).all(), f"Integration failed for {col}. Core indicator is not equivalent between main and `bo` tables."
+        elif check_limit_wrong is not None:
+            num = (~(((tb[col] - tb[f"{col}__bo"]) / tb[col]).dropna().abs() < TOLERANCE)).sum()
+            assert (
+                num == check_limit_wrong
+            ), f"Integration failed for {col}. Number of mismatches ({num}) does not match the expected limit ({check_limit_wrong})!"
         # Actual integration
         tb[col] = tb[col].fillna(tb[f"{col}__bo"])
         tb = tb.drop(columns=[f"{col}__bo"])
@@ -453,19 +671,22 @@ def make_table_with_birth_order(tb, col_index, core_indicators, col_bo="birth_order"):
-def make_table_with_birth_order(tb, col_index, core_indicators, col_bo="birth_order"):
+def make_table_with_birth_order(tb, cols_index, core_indicators, col_bo="birth_order", regex_bo=None):
     """Change the format of a table from wide to long, to incorporate the birth order as a dimension."""
+    if regex_bo is None:
+        regex_bo = {}
+
     def _generate_regex(name):
-        if re.search(r"\d$", name):  # Check if the name ends with a number
+        if re.search(r"\d$", string=name):  # Check if the name ends with a number
             return rf"^{name}_?(\d+|(\d+p)?)$"
         else:
             return rf"^{name}(\d+|(\d+p)?)$"
 
-    regex_patterns = {name: _generate_regex(name) for name in core_indicators}
+    regex_patterns = {name: regex_bo.get(name, _generate_regex(name)) for name in core_indicators}
 
     tb = tb.melt(
-        col_index,
+        cols_index,
         var_name=COLUMN_RAW,
        value_name="value",
     )
@@ -491,7 +712,7 @@ def _generate_regex(name):
     # Final reshape
     tb = tb.drop(columns=[COLUMN_RAW])
-    tb = tb.pivot(index=col_index + [col_bo], columns=COLUMN_IND, values="value").reset_index()
+    tb = tb.pivot(index=cols_index + [col_bo], columns=COLUMN_IND, values="value").reset_index()
     tb = tb.rename_axis(None, axis=1)
 
     # Drop NaNs
@@ -500,123 +721,42 @@ def _generate_regex(name):
     return tb
 
 
-def read_table(ds_meadow, tname, tname_base=None):
-    """Read table from dataset and minor cleaning:
-
-    - Rename columns if applicable
-    - Harmonize country names
-    """
-    # Read table
-    tb = ds_meadow.read(tname)
-
-    # Rename columns
-    if tname_base is None:
-        tname_base = tname
-    if tname_base in COLUMNS_RENAME:
-        tb = tb.rename(columns=COLUMNS_RENAME[tname_base])
-
+def consolidate_table_from_list(tbs, cols_index_out, short_name, fcn=None) -> geo.Table:
+    ## Sanity check: no column is named the same
+    _sanity_check_colnames(tbs, cols_index_out)
-    # Harmonize country names
-    tb = geo.harmonize_countries(
-        df=tb,
-        countries_file=paths.country_mapping_path,
-        country_col="code",
-        warn_on_unused_countries=False,
-    )
+    # Merge
+    tb = pr.multi_merge(tbs, on=cols_index_out, how="outer")
-    # Rename country column
-    tb = tb.rename(columns={"code": "country"})
+    # Optional function
+    if fcn is not None:
+        tb = fcn(tb)
+    # Format
+    tb = tb.format(cols_index_out, short_name=short_name)
     return tb
 
 
-def make_table_list(ds_meadow, table_names, tables_w_parity, cols_index, col_bo):
-    """Reads relevant tables, and formats them accordingly.
-
-    Tables come in wide format, sometimes as two-tables (main and birth order). This function consolidates them into single tables per topic.
-
-    For instance, we have one table with total fertility rates (columns `tfr`). And then another one with fertilities broken down by birth order (columns `tfr`, `tfr1`, etc.)
-    Instead, we want a table in long format, which has one column `tfr` and adds the birth order as a dimension of the table.
-    """
-    tbs = []
-    for tname in table_names:
-        # Read main table
-        tb = read_table(ds_meadow, tname)
-
-        # Check if there is a birth order table for this indicator(s). If so, process it and integrate it to the main table
-        tname_bo = tname + "bo"
-        if tname_bo in ds_meadow.table_names:
-            # Read BO table
-            tb_bo = read_table(ds_meadow, tname_bo, tname)
-            # Get list of core indicators: These are the names of the columns that are actual indicators (and not dimensional indicators, e.g. `tfr1`, `tfr2`, etc.)
-            core_indicators = [col for col in tb.columns.intersection(tb_bo.columns) if col not in cols_index]
-            # Add BO to main table
-            tb = integrate_bo(tb, tb_bo, cols_index, core_indicators)
-            # Consolidate table: Use long format, and add birth_order as a dimension of the main table.
-            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo)
-        # Sometimes, the main table contains already indicators broken down by birth order. In such cases, we also need to reshape the table.
-        elif tname in tables_w_parity:
-            core_indicators = cols_index + tables_w_parity[tname]["indicators"]
-            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo)
-
-        # Add formatted table to the list of tables.
-        tbs.append(tb)
-
+def _fix_ppr(tbs):
+    for tb in tbs:
+        if tb.m.short_name == "pprvhbo":
+            tb["birth_order"] = tb["birth_order"].str.split("_").str[-1]
     return tbs
 
 
-def run(dest_dir: str) -> None:
-    #
-    # Load inputs.
-    #
-    # Load meadow dataset.
-    ds_meadow = paths.load_dataset("hfd")
-
-    # 1/ Read period tables + consolidate in one table tb_period
-    ## Initial definitions
-    cols_index = ["country", "year"]
-    col_bo = "birth_order"
-    cols_index_out = cols_index + [col_bo]
-    ## Read tables
-    tbs = make_table_list(ds_meadow, TABLES_PERIOD, TABLES_PERIOD_W_PARITY, cols_index, col_bo)
-    ## Sanity check: no column is named the same
+def _sanity_check_colnames(tbs, cols_index_out):
     colnames = [col for t in tbs for col in t.columns if col not in cols_index_out]
     assert len(colnames) == len(set(colnames)), "Some columns are named the same!"
-    ## Merge
-    tb_period = pr.multi_merge(tbs, on=cols_index_out, how="outer")
-    tb_period = tb_period.format(cols_index_out, short_name="period")
-    # 2/Read cohort tables + consolidate in one table tb_cohort
-    ## Initial definitions
-    cols_index = ["country", "cohort"]
-    col_bo = "birth_order"
-    cols_index_out = cols_index + [col_bo]
-    ## Read tables
-    tbs = make_table_list(ds_meadow, TABLES_COHORT, TABLES_COHORT_W_PARITY, cols_index, col_bo)
-    ## Sanity check: no column is named the same
-    colnames = [col for t in tbs for col in t.columns if col not in cols_index_out]
-    assert len(colnames) == len(set(colnames)), "Some columns are named the same!"
-    ## Merge
-    tb_cohort = pr.multi_merge(tbs, on=cols_index_out, how="outer")
-    tb_cohort = tb_cohort.format(cols_index_out, short_name="cohort")
-
-    #
-    # Process data.
-    #
-    tables = [
-        tb_period,
-        tb_cohort,
+def keep_relevant_ages(tb):
+    AGES_RELEVANT = [
+        "12-",
+        "20",
+        "30",
+        "40",
+        "50",
+        "55+",
+    ]
-
-    #
-    # Save outputs.
-    #
-    # Create a new garden dataset with the same metadata as the meadow dataset.
-    ds_garden = create_dataset(
-        dest_dir,
-        tables=tables,
-        check_variables_metadata=True,
-        default_metadata=ds_meadow.metadata,
-    )
-
-    # Save changes in the new garden dataset.
-    ds_garden.save()
+    tb = tb.loc[tb["age"].isin(AGES_RELEVANT)]
+    return tb
diff --git a/etl/steps/data/grapher/hmd/2024-11-19/hfd.py b/etl/steps/data/grapher/hmd/2024-11-19/hfd.py
index df9aef583ce..553a3a49bbc 100644
--- a/etl/steps/data/grapher/hmd/2024-11-19/hfd.py
+++ b/etl/steps/data/grapher/hmd/2024-11-19/hfd.py
@@ -16,7 +16,17 @@ def run(dest_dir: str) -> None:
     # Read table from garden dataset.
     tables = [
         ds_garden.read("period", reset_index=False),
-        ds_garden.read("cohort", reset_index=False).rename_index_names({"cohort": "year"}),
+        ds_garden.read("cohort", reset_index=False).rename_index_names(
+            {
+                "cohort": "year",
+            }
+        ),
+        ds_garden.read("period_ages", reset_index=False),
+        ds_garden.read("cohort_ages", reset_index=False).rename_index_names(
+            {
+                "cohort": "year",
+            }
+        ),
     ]
     #
     # Process data.
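For reference, the wide-to-long reshape at the heart of `make_table_with_birth_order` — which the new `regex_bo` overrides such as `r"^ppr\d+_\d+$"` plug into — amounts to melting the indicator columns and extracting the birth-order suffix with a regex. A rough pandas sketch on hypothetical data; the real step works on owid-catalog Tables, builds one pattern per core indicator, and carries metadata through:

```python
import pandas as pd

# Hypothetical wide table: the core indicator ("tfr") plus birth-order columns ("tfr1", "tfr5p", ...).
df = pd.DataFrame(
    {
        "country": ["Sweden", "Sweden"],
        "year": [2000, 2001],
        "tfr": [1.50, 1.54],
        "tfr1": [0.70, 0.72],
        "tfr5p": [0.08, 0.07],
    }
)

# Wide -> long: every indicator column becomes a row.
long = df.melt(["country", "year"], var_name="column", value_name="value")

# Extract the birth-order suffix; an empty suffix is treated here as "total"
# (how the real step labels the unsuffixed column may differ).
long["birth_order"] = long["column"].str.extract(r"^tfr(\d+p?)?$")[0].fillna("total")

# Long -> tidy: one "tfr" column again, with birth_order as an extra index dimension.
out = (
    long.assign(indicator="tfr")
    .pivot(index=["country", "year", "birth_order"], columns="indicator", values="value")
    .reset_index()
)
print(out)
```

The same mechanics apply to the new `period_ages`/`cohort_ages` tables, just with `age` added to the index before the pivot.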