diff --git a/Dockerfile.light b/Dockerfile.light
index 671953f..19e8ef0 100644
--- a/Dockerfile.light
+++ b/Dockerfile.light
@@ -29,7 +29,8 @@ RUN pip install --no-cache-dir git+https://github.com/ciemss/pyciemss.git@adeb6b
 # Install MIRA from GitHub
 RUN git clone https://github.com/indralab/mira.git /home/jupyter/mira
 WORKDIR /home/jupyter/mira
-RUN git reset --hard 3043c9a66e46218645c5d9200c1ca7f028da5b5a
+RUN git reset --hard 420832f132b1cebb30341a3fee1610a6542becef
+
 RUN pip install --no-cache-dir /home/jupyter/mira/"[ode,tests,dkg-client,dkg-construct,sbml,docs]" && \
     rm -r /home/jupyter/mira
diff --git a/pyproject.toml b/pyproject.toml
index bca956c..df13fab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,8 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "beaker-kernel>=1.5.3",
+    "beaker-kernel==1.6.7",
+    "archytas==1.1.8",
     "pandas==1.3.3",
     "matplotlib~=3.7.1",
     "xarray==0.19.0",
@@ -39,7 +40,7 @@ dependencies = [
     "tiktoken~=0.5.2",
     "chirho[extras]~=0.2.0",
     "pyro-ppl~=1.8.6",
-    "pyro-api~=0.1.2", 
+    "pyro-api~=0.1.2",
     "torchdiffeq",
     "h5netcdf==1.3.0",
     "netcdf4==1.6.5",
diff --git a/src/askem_beaker/contexts/dataset/agent.py b/src/askem_beaker/contexts/dataset/agent.py
index d7576ab..6d43af6 100644
--- a/src/askem_beaker/contexts/dataset/agent.py
+++ b/src/askem_beaker/contexts/dataset/agent.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import re
+import os
 
 from archytas.react import Undefined
 from archytas.tool_utils import AgentRef, LoopControllerRef, tool
@@ -12,6 +13,11 @@ logging.disable(logging.WARNING)  # Disable warnings
 logger = logging.Logger(__name__)
 
+# Specify the full path to the markdown file
+file_path = os.path.join(os.path.dirname(__file__), 'incidence_to_prevalence.md')
+
+with open(file_path, 'r') as file:
+    incidence_to_prevalence = file.read()
 
 class DatasetAgent(BaseAgent):
     """
@@ -55,6 +61,10 @@ async def generate_code(
 
 You also have access to the libraries {agent.context.metadata.get("libraries", "that are common for these tasks")}.
 
+You may be asked to assist in converting incidence data to prevalence data. In that case, please follow these instructions:
+
+{incidence_to_prevalence}
+
 Please generate the code as if you were programming inside a Jupyter Notebook and the code is to be executed inside a cell.
 You MUST wrap the code with a line containing three backticks (```) before and after the generated code.
 No additional text is needed in the response, just the code block.
diff --git a/src/askem_beaker/contexts/dataset/incidence_to_prevalence.md b/src/askem_beaker/contexts/dataset/incidence_to_prevalence.md
new file mode 100644
index 0000000..573d16c
--- /dev/null
+++ b/src/askem_beaker/contexts/dataset/incidence_to_prevalence.md
@@ -0,0 +1,290 @@
## Task Overview

You are given one or more epidemiological datasets containing **incidence data** (new cases/events per time period). The user wants you to:

1. **Convert the incidence data to prevalence** (ongoing cases at each time point).
2. **Map the prevalence data to specific compartments** in a compartmental model, such as Susceptible (S), Infected (I), Recovered (R), Hospitalized (H), and Deceased (D).
3. **Handle different naming conventions** and **adjust time windows** as specified by the user.

---

## Steps
### 1. Data Loading and Interpretation

- Assume the user has provided one or more datasets as DataFrames, each containing columns for date, location, and an incidence measure (e.g., new cases, hospitalizations, deaths).
- The naming conventions of the datasets or columns might vary, so the task requires you to identify the meaning of each dataset from the provided description.

### 2. Key Definitions

- **Incidence** refers to the count of new events (e.g., new cases, hospitalizations, or deaths) at each time point.
- **Prevalence** refers to the total number of ongoing cases at each time point (e.g., currently infected or hospitalized individuals).
- **Compartmental Models** involve variables like:
  - **S** (Susceptible): Those who have not been infected.
  - **I** (Infected): Those currently infected (active cases).
  - **R** (Recovered): Those who have recovered from the infection.
  - **H** (Hospitalized): Those currently hospitalized.
  - **D** (Deceased): Those who have died (cumulative deaths).

### 3. Steps for Converting Incidence to Prevalence

#### 3.1. Identifying Data
- Identify which dataset corresponds to each compartment. The user might provide specific instructions like:
  - **Incident Cases**: Map to new infections.
  - **Incident Hospitalizations**: Map to new hospitalizations.
  - **Cumulative Deaths**: Map to the total number of deaths.

#### 3.2. Handling Dates and Locations
- You may be asked to filter by location. In that case, you must identify and utilize the geographic feature(s) available to you in the dataset. Some datasets will not have geographic features. All _should_ have dates; ensure the dataset is sorted by date.

#### 3.3. User-Specified Windows
- Allow the user to specify recovery windows (e.g., 14 days for infections, 10 days for hospitalizations).
- Default values: 14 days for infections, 10 days for hospitalizations, 3 days for death-related hospitalization.

#### 3.4. Calculate Prevalence
- For incidence datasets (e.g., new infections or hospitalizations), use a rolling sum over the specified window to calculate prevalence. For example:
  - **Infected Prevalence (I)** = Sum of new infections over the last 14 days.
  - **Hospitalized Prevalence (H)** = Sum of new hospitalizations over the last 10 days.

- For cumulative datasets (e.g., deaths), directly use the cumulative sum as the prevalence:
  - **Deaths (D)** = Cumulative deaths.

#### 3.5. Calculate Recovered Individuals
- For recovered individuals, assume recovery occurs after a specified window (e.g., 14 days for infections). The formula is:
  - **Recovered (R)** = Cumulative sum of incident cases up to (current date - recovery window) - current deaths.

#### 3.6. Handle Variable Data Formats
- Assume the column names might differ (e.g., `"new_cases"`, `"incident_cases"`, or `"hospitalizations"`). Ask the user for clarification on naming conventions if needed.

---

### 4. Mapping to Compartmental Model

#### 4.1. Create a Shared DataFrame
- Combine the time series data into one DataFrame. Ensure that all variables (I, R, H, D) are aligned by their common date index.

#### 4.2. Calculate Susceptible Population (S)
- Define a total population (e.g., 150 million) if not provided by the user. Be explicit and obvious so the user can adjust this as needed.
- Calculate the susceptible population as:
  - **S** = Total population - I - R - H - D (and/or other compartments as needed)

#### 4.3. Adjust Recovered Population
- Ensure that recovered individuals exclude deaths:
  - Adjust R as: **R = R - D** (see the sketch below).
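A minimal sketch of steps 4.2 and 4.3 together, assuming a `compartments_df` DataFrame that already holds aligned I, R, H, and D columns (Example 3 below shows how to build it):

```python
# Sketch of Steps 4.2-4.3; compartments_df is assumed to hold aligned I, R, H, D columns.
total_population = 150e6  # explicit default -- the user should override with the real population

# 4.3: recovered individuals should exclude deaths (do this before computing S)
compartments_df["R"] = compartments_df["R"] - compartments_df["D"]

# 4.2: the susceptible population is whatever remains
compartments_df["S"] = total_population - compartments_df[["I", "R", "H", "D"]].sum(axis=1)
```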
---

### 5. Return the Final Data
- Return the final DataFrame with columns for each compartment (e.g., S, I, R, H, D).
- Ensure the output is flexible and can handle various compartmental models or additional epidemiological categories.

---

## Examples and Pseudo Code

### Example 1: Incidence Cases to Prevalence

```python
# User provides incidence data
inc_cases = user_input_inc_cases  # e.g., "incident_cases" column
window = user_specified_window or 14  # Recovery time, default is 14 days

# Convert incident cases to prevalence
prevalence_I = inc_cases.rolling(window).sum().dropna()

# Estimate recovered individuals: cases past the recovery window, minus deaths
# (cumulative_deaths is assumed to be available as a separate series)
prevalence_R = inc_cases.cumsum().shift(window) - cumulative_deaths
```

### Example 2: Hospitalizations to Prevalence
```python
# User provides hospitalization data
inc_hospitalizations = user_input_hosp_data  # e.g., "new_hospitalizations" column
window = user_specified_window or 10  # Hospitalization recovery time, default is 10 days

# Convert incident hospitalizations to prevalence
prevalence_H = inc_hospitalizations.rolling(window).sum().dropna()
```

### Example 3: Combining Data into Compartmental Model
```python
# Initialize total population
total_population = 150e6  # User-specified or default

# Create DataFrame with compartments
compartments_df = pd.DataFrame({
    "I": prevalence_I,
    "R": prevalence_R,  # deaths were already subtracted when computing prevalence_R (Example 1)
    "H": prevalence_H,
    "D": cumulative_deaths,
})

# Calculate Susceptible population
compartments_df["S"] = total_population - compartments_df["I"] - compartments_df["R"] - compartments_df["H"] - compartments_df["D"]
```
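A small optional sanity check can catch mapping mistakes early. This is a sketch that assumes the `compartments_df` and `total_population` from Example 3:

```python
# Optional sanity checks (a sketch, assuming compartments_df from Example 3).
# Prevalence counts should not be negative; violations usually indicate
# reporting corrections in the raw incidence data or a wrong window.
if (compartments_df[["I", "R", "H", "D"]] < 0).any().any():
    print("Warning: negative prevalence values found; check the incidence data and windows.")

# By construction of S, complete rows should sum to the total population.
checked = compartments_df.dropna()
residual = checked[["S", "I", "R", "H", "D"]].sum(axis=1) - total_population
assert residual.abs().max() < 1e-6
```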
### Example 4: Different Column Headers, Date Formats, and Groupings

#### Dataset Example:
This dataset might come from a public health organization with different column names and some additional grouping variables, such as age group and gender.

| country       | region    | date_reported | new_infections | new_hospitalizations | total_deaths | age_group | gender |
|---------------|-----------|---------------|----------------|----------------------|--------------|-----------|--------|
| United States | Northeast | 2022-01-01    | 500            | 20                   | 15           | 18-30     | M      |
| United States | Northeast | 2022-01-01    | 600            | 25                   | 10           | 18-30     | F      |
| United States | Northeast | 2022-01-02    | 450            | 15                   | 12           | 18-30     | M      |
| United States | Southeast | 2022-01-01    | 300            | 10                   | 5            | 30-50     | F      |

#### Key Differences:
1. **Date Column**: This dataset uses `date_reported` rather than `date`.
2. **Incidence Columns**: `new_infections`, `new_hospitalizations`, and `total_deaths` are named differently.
3. **Grouping Variables**: The dataset includes additional groupings by `age_group`, `gender`, and `region`, which might or might not be relevant depending on the user's goals.

#### Approach:
To work with this dataset, we:
1. **Align the column names** by either renaming them or referencing them directly based on the schema.
2. **Aggregate data** if needed by summing over categories like `age_group` and `gender` to get national or regional totals.
3. **Convert incidence to prevalence** using user-specified time windows.

##### Pseudo Code Example:
```python
# Align the column names with the schema used in the earlier examples
df = df.rename(columns={
    "date_reported": "date",
    "new_infections": "incident_cases",
    "new_hospitalizations": "incident_hospitalizations",
    "total_deaths": "cumulative_deaths",
})

# Aggregate out age_group and gender to get one row per region and date
df = df.groupby(["region", "date"], as_index=False).sum(numeric_only=True)

# Ensure datetime format and a stable sort order
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["region", "date"])

window = user_specified_window or 14  # recovery window in days

g = df.groupby("region")

# Infected Prevalence (I): rolling sum of incident cases within each region
df["I"] = g["incident_cases"].transform(lambda s: s.rolling(window).sum())

# Hospitalized Prevalence (H)
df["H"] = g["incident_hospitalizations"].transform(lambda s: s.rolling(window).sum())

# Deaths (D): cumulative deaths are already a prevalence-style count
df["D"] = df["cumulative_deaths"]

# Recovered (R): cases past the recovery window, minus deaths
df["R"] = g["incident_cases"].transform(lambda s: s.cumsum().shift(window)) - df["D"]

# Combine into a final DataFrame
final_df = df[["region", "date", "I", "R", "H", "D"]].dropna().set_index("date")
```

### Example 5: Weekly Aggregated Data with Cumulative Incidence

#### Dataset Example:
Some organizations (e.g., WHO or CDC) might release **weekly aggregated** data with cumulative cases and hospitalizations, which requires different handling.

| location      | week_start | cumulative_cases | cumulative_hospitalizations | cumulative_deaths |
|---------------|------------|------------------|-----------------------------|-------------------|
| United States | 2022-01-03 | 50000            | 500                         | 1000              |
| United States | 2022-01-10 | 55000            | 530                         | 1050              |
| United States | 2022-01-17 | 60000            | 560                         | 1100              |

#### Key Differences:
1. **Weekly Data**: Data is reported by week (`week_start`), not daily.
2. **Cumulative Incidence**: Cases and hospitalizations are reported as cumulative totals, which means we need to calculate the **difference** between weeks to get the **weekly incidence**.
3. **Prevalence** needs to be calculated using custom time windows over these weekly intervals.

#### Approach:
We will:
1. **Compute weekly incidence** from the cumulative totals.
2. **Convert weekly incidence to prevalence** using rolling sums with adjusted windows to reflect weekly reporting.

##### Pseudo Code Example:
```python
# First calculate the weekly incidence
df["weekly_cases"] = df["cumulative_cases"].diff().fillna(0)
df["weekly_hospitalizations"] = df["cumulative_hospitalizations"].diff().fillna(0)
df["weekly_deaths"] = df["cumulative_deaths"].diff().fillna(0)  # kept for reference; not used below

# Now convert to prevalence based on weekly data:
window_weeks = user_specified_window or 2  # 14 days is roughly 2 weeks

# Infected Prevalence (I)
prevalence_I = df["weekly_cases"].rolling(window_weeks).sum().dropna()

# Hospitalized Prevalence (H)
prevalence_H = df["weekly_hospitalizations"].rolling(window_weeks).sum().dropna()

# Deaths (D)
prevalence_D = df["cumulative_deaths"]

# Recovered (R)
prevalence_R = df["cumulative_cases"].shift(window_weeks) - prevalence_D

# Combine into final DataFrame
final_df = pd.DataFrame({
    "I": prevalence_I,
    "R": prevalence_R,
    "H": prevalence_H,
    "D": prevalence_D,
    "time": df["week_start"]
}).dropna().set_index("time")
```
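If the user states the window in days, translate it into weekly reporting periods before applying the rolling sums above. A small sketch of that conversion (`user_window_days` is a hypothetical day-based input, not a column from the dataset):

```python
import math

# Translate a day-based recovery window into weekly reporting periods,
# generalizing the "14 days is roughly 2 weeks" default used above.
user_window_days = 14
window_weeks = math.ceil(user_window_days / 7)  # 14 days -> 2 weekly periods
```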
### Example 6: Global Data with Multiple Countries and Granularity

#### Dataset Example:
In global datasets, organizations like the WHO might report data for multiple countries with varying levels of granularity (e.g., daily vs. weekly).

| Country        | Date       | Confirmed_Cases | New_Hospitalizations | Deaths |
|----------------|------------|-----------------|----------------------|--------|
| United States  | 2022-01-01 | 500             | 20                   | 15     |
| United Kingdom | 2022-01-01 | 600             | 25                   | 10     |
| Germany        | 2022-01-01 | 450             | 15                   | 12     |
| United States  | 2022-01-02 | 520             | 18                   | 17     |

#### Key Differences:
1. **Multiple Countries**: Data is reported for several countries, requiring handling of the `Country` field.
2. **Granularity**: Data might be reported daily or weekly, depending on the country.
3. **Different naming conventions**: Confirmed cases, new hospitalizations, and deaths are labeled differently.

#### Approach:
- **Filter by country** as necessary.
- **Handle varying granularity** by grouping data appropriately for the specified task (daily or weekly prevalence).
- **Map the column headers** to a consistent format.

##### Pseudo Code Example:
```python
# Filter data by specific country if necessary
df = user_dataset[user_dataset["Country"] == "United States"].copy()

# Rename columns for consistency
df = df.rename(columns={
    "Confirmed_Cases": "incident_cases",
    "New_Hospitalizations": "incident_hospitalizations",
    "Deaths": "cumulative_deaths"
})

# Convert to prevalence:
window_days = user_specified_window or 14  # Assuming 14 days for recovery

# Infected Prevalence (I)
prevalence_I = df["incident_cases"].rolling(window_days).sum().dropna()

# Hospitalized Prevalence (H)
prevalence_H = df["incident_hospitalizations"].rolling(window_days).sum().dropna()

# Deaths (D)
prevalence_D = df["cumulative_deaths"]

# Recovered (R)
prevalence_R = df["incident_cases"].cumsum().shift(window_days) - prevalence_D

# Combine into a final DataFrame
final_df = pd.DataFrame({
    "I": prevalence_I,
    "R": prevalence_R,
    "H": prevalence_H,
    "D": prevalence_D,
    "time": df["Date"]
}).dropna().set_index("time")
```
\ No newline at end of file
diff --git a/src/askem_beaker/contexts/mira_model_edit/agent.py b/src/askem_beaker/contexts/mira_model_edit/agent.py
index 4ccc6fe..b9491ff 100644
--- a/src/askem_beaker/contexts/mira_model_edit/agent.py
+++ b/src/askem_beaker/contexts/mira_model_edit/agent.py
@@ -126,6 +126,50 @@ async def add_observable(self, new_id: str, new_name: str, new_expression: str,
                 "content": code.strip(),
             }
         )
+
+    @tool()
+    async def add_observable_pattern(self, new_name: str,
+                                     identifier_keys: list[str],
+                                     identifier_values: list[str],
+                                     context_keys: list[str],
+                                     context_values: list[str],
+                                     agent: AgentRef,
+                                     loop: LoopControllerRef):
+        """
+        This tool is used when a user wants to add an observable via a complex pattern. You should inspect the model BEFORE using this tool
+        so that you can properly map the user's request to the correct identifiers and contexts in the model. Typically the identifier key
+        will be something like "ido" and the identifier value will be something like "0000514". Context keys will be the name of the strata context (e.g. "Age")
+        and the values will be the value for that strata context (e.g. "youth").
+
+        When the user specifies a high-level state (such as Infected), that is specified via the identifiers; when the user specifies
+        a stratum (such as "youth"), that is specified via the context.
+
+        Args:
+            new_name (str): The new name provided for the observable. If this is not provided, something intuitive should be set.
+            identifier_keys (list[str]): The keys for the identifiers that will be used in the observable.
+            identifier_values (list[str]): The values for the identifiers that will be used in the observable.
+            context_keys (list[str]): The keys for the context that will be used in the observable.
+            context_values (list[str]): The values for the context that will be used in the observable.
+        """
+        identifier_dict = dict(zip(identifier_keys, identifier_values))
+        context_dict = dict(zip(context_keys, context_values))
+        code = agent.context.get_code("add_observable_pattern",
+                                      {"new_name": new_name,
+                                       "identifier_keys": identifier_keys,
+                                       "identifier_values": identifier_values,
+                                       "context_keys": context_keys,
+                                       "context_values": context_values,
+                                       "identifier_dict": identifier_dict,
+                                       "context_dict": context_dict}
+                                      )
+        loop.set_state(loop.STOP_SUCCESS)
+        return json.dumps(
+            {
+                "action": "code_cell",
+                "language": "python3",
+                "content": code.strip(),
+            }
+        )
 
     @tool()
     async def remove_observable(self, remove_id: str, agent: AgentRef, loop: LoopControllerRef):
@@ -502,7 +546,11 @@ async def stratify(self,
         structure: Optional[Iterable[Tuple[str, str]]] = None,
         directed: bool = False,
         cartesian_control: bool = False,
-        modify_names: bool = True
+        modify_names: bool = True,
+        concepts_to_stratify: Optional[Collection[str]] = None,
+        concepts_to_preserve: Optional[Collection[str]] = None,
+        params_to_stratify: Optional[Collection[str]] = None,
+        params_to_preserve: Optional[Collection[str]] = None
     ):
         """
         This tool is used when a user wants to stratify a model.
@@ -525,6 +573,8 @@ async def stratify(self,
             An iterable of pairs corresponding to a directed network structure
             where each of the pairs has two strata. If none given, will assume a complete
             network structure. If no structure is necessary, pass an empty list.
+            For example, [["Young", "Old"]] would mean that the population in Young can interact with the population in Old, provided they are within the same state.
+            [["Toronto", "New York"], ["New York", "Toronto"]] would mean that the populations in Toronto and New York can interact with each other, provided they are in the same state. By default this should be an empty list.
         directed (bool):
             If the structure tuples are combinations this should be True. If they are permutations this should be false.
@@ -555,6 +605,22 @@ async def stratify(self,
             (e.g., ``"S"`` becomes ``"S_boston"``). If false, will keep the original names.
             If this cannot be found it should default to True
+        concepts_to_stratify (Optional):
+            This is a list of the state variables in the model that are required to be stratified.
+            For example, given a model with state variables ("S", "E", "I", "R") and a request to only stratify the "S" state variable, the value of this argument should be ["S"].
+            If the request does not specify any state variable to stratify in particular, then the value of this argument should default to None.
+        concepts_to_preserve (Optional):
+            This is a list of the state variables in the model that must not be stratified.
+            For example, given a model with state variables ("S", "E", "I", "R") and a request like "preserve the S state variable" or "do not stratify S", the value of this argument should be ["S"].
+            If the request does not specify any state variable to preserve, then the value of this argument should default to None.
+        params_to_stratify (Optional):
+            This is a list of the parameters in the model that are required to be stratified.
+            For example, given a model with parameters ("beta", "gamma") and a request to only stratify the "beta" parameter, the value of this argument should be ["beta"].
+            If the request does not specify any parameter to stratify in particular, then the value of this argument should default to None.
+        params_to_preserve (Optional):
+            This is a list of the parameters in the model that must not be stratified.
+            For example, given a model with parameters ("beta", "gamma") and a request like "preserve the beta parameter" or "do not stratify beta", the value of this argument should be ["beta"].
+            If the request does not specify any parameter to preserve, then the value of this argument should default to None.
         """
         code = agent.context.get_code("stratify", {
@@ -563,7 +629,11 @@ async def stratify(self,
             "key": key,
             "strata": strata,
             "structure": structure,
             "directed": directed,
             "cartesian_control": cartesian_control,
-            "modify_names": modify_names
+            "modify_names": modify_names,
+            "concepts_to_stratify": concepts_to_stratify,
+            "concepts_to_preserve": concepts_to_preserve,
+            "params_to_stratify": params_to_stratify,
+            "params_to_preserve": params_to_preserve
         })
         loop.set_state(loop.STOP_SUCCESS)
         return json.dumps(
diff --git a/src/askem_beaker/contexts/mira_model_edit/context.py b/src/askem_beaker/contexts/mira_model_edit/context.py
index eeb5e20..dbc719d 100644
--- a/src/askem_beaker/contexts/mira_model_edit/context.py
+++ b/src/askem_beaker/contexts/mira_model_edit/context.py
@@ -547,6 +547,7 @@ async def stratify_request(self, message):
         key = content.get("key")
         strata = content.get("strata")
         concepts_to_stratify = content.get("concepts_to_stratify")
+        concepts_to_preserve = content.get("concepts_to_preserve")
         params_to_stratify = content.get("params_to_stratify")
         params_to_preserve = content.get("params_to_preserve")
         cartesian_control = content.get("cartesian_control")
@@ -556,8 +557,9 @@ async def stratify_request(self, message):
             "key": key,
             "strata": strata,
             "concepts_to_stratify": concepts_to_stratify,
+            "concepts_to_preserve": concepts_to_preserve,
             "params_to_stratify": params_to_stratify,
-            "params_to_preserve": params_to_preserve, 
+            "params_to_preserve": params_to_preserve,
             "cartesian_control": cartesian_control,
             "structure": structure
         })
diff --git a/src/askem_beaker/contexts/mira_model_edit/procedures/python3/add_observable_pattern.py b/src/askem_beaker/contexts/mira_model_edit/procedures/python3/add_observable_pattern.py
new file mode 100644
index 0000000..f57751e
--- /dev/null
+++ b/src/askem_beaker/contexts/mira_model_edit/procedures/python3/add_observable_pattern.py
@@ -0,0 +1,6 @@
+add_observable_pattern(
+    model,
+    "{{ new_name }}",
+    identifiers = {{ identifier_dict }},
+    context = {{ context_dict }}
+    )
\ No newline at end of file
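For reference, a hypothetical rendering of this template for a request like "add an observable for infected youth". The identifier and context values are the illustrative ones from the tool docstring above, not values from a real model:

```python
# Hypothetical rendering of add_observable_pattern.py (illustrative values only)
add_observable_pattern(
    model,
    "inf_youth",
    identifiers = {'ido': '0000514'},  # high-level state (e.g., Infected)
    context = {'Age': 'youth'}         # stratum within that state
    )
```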
diff --git a/src/askem_beaker/contexts/mira_model_edit/procedures/python3/stratify.py b/src/askem_beaker/contexts/mira_model_edit/procedures/python3/stratify.py
index 8e9a00b..95d0749 100644
--- a/src/askem_beaker/contexts/mira_model_edit/procedures/python3/stratify.py
+++ b/src/askem_beaker/contexts/mira_model_edit/procedures/python3/stratify.py
@@ -7,7 +7,8 @@
     cartesian_control={{ cartesian_control|default(False) }},
     modify_names={{ modify_names|default(True) }},
     concepts_to_stratify={{ concepts_to_stratify|default(None) }}, #If none given, will stratify all concepts.
+    concepts_to_preserve={{ concepts_to_preserve|default(None) }}, #If none given, no concepts will be preserved.
     params_to_stratify= {{ params_to_stratify|default(None) }}, #If none given, will stratify all parameters.
-    params_to_preserve= {{ params_to_preserve|default(None) }},
+    params_to_preserve= {{ params_to_preserve|default(None) }}, #If none given, no parameters will be preserved.
     param_renaming_uses_strata_names = True
 )
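For reference, a hypothetical full rendering of this template for a request like "stratify by age into youth and old, but preserve S and gamma". The diff only shows the tail of the template, so the leading arguments (the `model = stratify(model, key=..., strata=..., structure=..., directed=...)` head) are assumptions here, as are the strata names; it also assumes MIRA's `stratify` is in scope in the kernel, as the template implies:

```python
# Hypothetical rendering of stratify.py (head arguments are assumptions)
model = stratify(
    model,
    key="age",
    strata=["youth", "old"],
    structure=[],
    directed=False,
    cartesian_control=True,
    modify_names=True,
    concepts_to_stratify=None, #If none given, will stratify all concepts.
    concepts_to_preserve=['S'], #If none given, no concepts will be preserved.
    params_to_stratify= None, #If none given, will stratify all parameters.
    params_to_preserve= ['gamma'], #If none given, no parameters will be preserved.
    param_renaming_uses_strata_names = True
)
```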