From e59e48b2216c0b546dfdeb872f142e6558cc6181 Mon Sep 17 00:00:00 2001
From: IK-173 <162045008+IK-173@users.noreply.github.com>
Date: Fri, 16 Aug 2024 17:24:11 -0400
Subject: [PATCH 1/2] totalCOPD_heatmap_workflow

---
 process.json              |  12 +-
 totalCOPD_breathright1.py | 126 +++++++++++++++++++
 totalCOPD_breathright2.py | 256 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 388 insertions(+), 6 deletions(-)
 create mode 100644 totalCOPD_breathright1.py
 create mode 100644 totalCOPD_breathright2.py

diff --git a/process.json b/process.json
index a40711a..b341986 100644
--- a/process.json
+++ b/process.json
@@ -1,16 +1,16 @@
 [{
-  "id" : "6zmcpn",
-  "name" : "1. Incidence COPD data prep",
+  "id" : "74c3mf",
+  "name" : "totalCOPD_breathright1",
   "description" : null,
-  "code" : "import pandas as pd\nimport geopandas as gpd\n\n# Read CDC COPD incidence data\nCOPD_incidence_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv')\n\n# Read EPA air quality data for PM2.5 and ozone\npm25_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv', low_memory=False)\nozone_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv', low_memory=False)\n\n# Read county shapefile for spatial analysis\ncounties_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/County data/county_shapefile.shp')\n\n# Convert date column to datetime\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], format='%Y-%m-%d')\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%Y-%m-%d')\n\n# Extract year from date\npm25_df['year'] = pm25_df['Date Local'].dt.year\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Get year from PM2.5\n\n# Convert 'Date Local' to datetime and extract year\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], format='%Y-%m-%d')\npm25_df['year'] = pm25_df['Date Local'].dt.year\n\n# Rename 'Sample Measurement' to 'PM2.5'\npm25_df.rename(columns={'Arithmetic Mean': 'PM2.5'}, inplace=True)\n\n# Verify the DataFrame\nprint(pm25_df.head())\nprint(pm25_df.columns)\n\n# Convert 'Date Local' to datetime and extract year\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%m/%d/%y')\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Rename 'Sample Measurement' to 'Ozone'\nozone_df.rename(columns={'Arithmetic Mean': 'Ozone'}, inplace=True)\n\n# Verify the DataFrame\nprint(ozone_df.head())\nprint(ozone_df.columns)\n\n# Ensure 'State Name' and 'year' exist\nprint(pm25_df[['State Name', 'year']].head())\nprint(ozone_df[['State Name', 'year']].head())\n\n# Group by state and year to calculate annual 
averages\npm25_annual = pm25_df.groupby(['State Name', 'year'])['PM2.5'].mean().reset_index()\nozone_annual = ozone_df.groupby(['State Name', 'year'])['Ozone'].mean().reset_index()\n\n# Get latitude and longitude for each state-year pair\nlat_lon_pm25 = pm25_df.groupby(['State Name', 'year']).agg({\n    'Latitude': 'first',\n    'Longitude': 'first'\n}).reset_index()\n\nlat_lon_ozone = ozone_df.groupby(['State Name', 'year']).agg({\n    'Latitude': 'first',\n    'Longitude': 'first'\n}).reset_index()\n\n# Merge latitude and longitude with PM2.5 and Ozone averages\npm25_annual = pd.merge(pm25_annual, lat_lon_pm25, on=['State Name', 'year'])\nozone_annual = pd.merge(ozone_annual, lat_lon_ozone, on=['State Name', 'year'])\n\n# Verify the columns, all should have 'State Name' and 'year'\nprint(\"PM2.5 DataFrame columns:\")\nprint(pm25_df.columns)\nprint(\"Ozone DataFrame columns:\")\nprint(ozone_df.columns)\nprint(\"COPD incidence DataFrame columns:\")\nprint(COPD_incidence_df.columns)\n\n# Merge lung disease data with PM2.5 and ozone data\nCOPD_incidence_merged_df = pd.merge(COPD_incidence_df, pm25_annual, on=['State Name', 'year'])\nCOPD_incidence_merged_df = pd.merge(COPD_incidence_merged_df, ozone_annual, on=['State Name', 'year'])\n\n# Find rows where Latitude_x and Latitude_y differ\ndiscrepancies = COPD_incidence_merged_df[COPD_incidence_merged_df['Latitude_x'] != COPD_incidence_merged_df['Latitude_y']]\n\n# Display the rows with discrepancies\nprint(discrepancies[['State Name', 'year', 'Latitude_x', 'Latitude_y']])\n\n# Drop the incorrect columns and rename the correct ones\nCOPD_incidence_merged_df = COPD_incidence_merged_df.drop(columns=['Latitude_x', 'Longitude_x'])\nCOPD_incidence_merged_df = COPD_incidence_merged_df.rename(columns={'Latitude_y': 'Latitude', 'Longitude_y': 'Longitude'})\n\nfrom shapely.geometry import Point\n\n# Create the geometry column using the latitude and longitude columns\nCOPD_incidence_merged_df['geometry'] = 
COPD_incidence_merged_df.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)\n\n\n# Convert merged DataFrame to GeoDataFrame for spatial analysis\nCOPD_incidence_gdf = gpd.GeoDataFrame(COPD_incidence_merged_df, geometry='geometry')\n\nCOPD_incidence_gdf.crs = 'EPSG:4326'  # Example CRS, adjust as needed\n\n# Save the GeoDataFrame to a shapefile or other format if needed\nCOPD_incidence_gdf.to_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp')\n\n# Save the updated DataFrame to CSV\nCOPD_incidence_merged_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv', index=False)\n\n\n\n\n# Check for matches in State Name and year across dataframes\nprint(COPD_incidence_df[['State Name', 'year']].drop_duplicates())\nprint(pm25_annual[['State Name', 'year']].drop_duplicates())\nprint(ozone_annual[['State Name', 'year']].drop_duplicates())\n\n# Check for common values between dataframes\ncommon_states_years = pd.merge(COPD_incidence_df[['State Name', 'year']], pm25_annual[['State Name', 'year']], on=['State Name', 'year'])\nprint(common_states_years.head())\n\n# Verify data types before merging\nprint(COPD_incidence_df.dtypes)\nprint(pm25_annual.dtypes)\nprint(ozone_annual.dtypes)\n\n# Ensure no columns have trailing spaces or unexpected characters\nCOPD_incidence_df.columns = COPD_incidence_df.columns.str.strip()\npm25_annual.columns = pm25_annual.columns.str.strip()\nozone_annual.columns = ozone_annual.columns.str.strip()\n\n\n\n\n\n\n# Making Random Forest Prediction Model\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error, r2_score\n\n# Load the merged DataFrame\nCOPD_incidence_merged_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv')\n\nprint(COPD_incidence_merged_df.dtypes)\n\n# Check for any 
non-numeric values or unexpected text\nnon_numeric_values = COPD_incidence_merged_df[~COPD_incidence_merged_df['COPD Incidence'].apply(pd.to_numeric, errors='coerce').notnull()]\nprint(non_numeric_values)\n\nprint(COPD_incidence_merged_df['State Name'].unique())\nprint(COPD_incidence_merged_df['metric_name'].unique())\n\nCOPD_incidence_merged_df['COPD Incidence'] = pd.to_numeric(COPD_incidence_merged_df['COPD Incidence'], errors='coerce')\nCOPD_incidence_merged_df['PM2.5'] = pd.to_numeric(COPD_incidence_merged_df['PM2.5'], errors='coerce')\nCOPD_incidence_merged_df['Ozone'] = pd.to_numeric(COPD_incidence_merged_df['Ozone'], errors='coerce')\n\n# Verify if any columns were unintentionally concatenated\nprint(COPD_incidence_merged_df.head(10))\n\n\n# Identify numeric columns\nnumeric_cols = COPD_incidence_merged_df.select_dtypes(include='number').columns\nnon_numeric_cols = COPD_incidence_merged_df.select_dtypes(exclude='number').columns\n\n\n# Identify non-numeric columns\nnon_numeric_cols = COPD_incidence_merged_df.select_dtypes(exclude=[np.number]).columns.tolist()\n\n# Fill NaN values in numeric columns with the mean\nCOPD_incidence_merged_df[numeric_cols].fillna(COPD_incidence_merged_df[numeric_cols].mean(), inplace=True)\n\n\n# Check for missing values\nprint(COPD_incidence_merged_df.isnull().sum())\n\n# Fill missing values if necessary\n# For simplicity, you can use the mean or median of the columns with missing values\nCOPD_incidence_merged_df.fillna(COPD_incidence_merged_df.mean(), inplace=True)\n\n# Extract features and target variable\n# Assume 'COPD Incidence' is the target variable and the rest are features\nfeatures = COPD_incidence_merged_df[['PM2.5', 'Ozone', 'Latitude', 'Longitude']]\ntarget = COPD_incidence_merged_df['COPD Incidence']\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n\n# Initialize and fit the Random Forest model\nrf_model = 
RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=10, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Make predictions\ny_pred = rf_model.predict(X_test)\n\n# Evaluate the model\nmse = mean_squared_error(y_test, y_pred)\nr2 = r2_score(y_test, y_pred)\n\nprint(f\"Mean Squared Error: {mse}\")\nprint(f\"R^2 Score: {r2}\")\n\n# Get feature importances\nimportances = rf_model.feature_importances_\nfeatures_importance = pd.DataFrame({\n    'Feature': X_train.columns,\n    'Importance': importances\n}).sort_values(by='Importance', ascending=False)\n\nprint(features_importance)\n\n\n\n\n\n\n",
+  "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nNon_Smoke_merged_df = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/Non_Smoke_merged_df.csv\"\ngreat_pm25_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_pm25.csv', low_memory=False)\ngreat_ozone_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_ozone.csv', low_memory=False)\n\n# Read all the csv into pandas dataframe in memory\n#great_ozone_combined_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nNon_Smoke_merged_df = pd.read_csv(Non_Smoke_merged_df)\n#great_pm25_combined_df = pd.read_csv(great_pm25_combined_df)\n\nprint(Non_Smoke_merged_df.columns)\nprint(Non_Smoke_merged_df['COPD_prevalence_percentage_total'].head())\n\n\n# Convert 'Date Local' to datetime format\ngreat_ozone_combined_df['Date Local'] = pd.to_datetime(great_ozone_combined_df['Date Local'], errors='coerce')\n\nprint(Non_Smoke_merged_df.columns)\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. 
This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n    #var_name='year',\n    #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\nimport pandas as pd\n\ndef split_COPD_prevalence_percentage_total_column_into_three(COPD_prevalence_percentage_total):\n    if pd.isna(COPD_prevalence_percentage_total):\n        return pd.Series([None, None, None], index=['Average', 'Min', 'Max'])\n    \n    COPD_prevalence_percentage_total = str(COPD_prevalence_percentage_total)\n    \n    # Check for range format\n    match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_prevalence_percentage_total)\n    if match:\n        avg, min_val, max_val = match.groups()\n        return pd.Series([float(avg), float(min_val), float(max_val)], index=['Average', 'Min', 'Max'])\n    \n    # Handle single value format\n    try:\n        avg = float(COPD_prevalence_percentage_total)\n        return pd.Series([avg, avg, avg], index=['Average', 'Min', 'Max'])\n    except ValueError:\n        return pd.Series([None, None, None], index=['Average', 'Min', 'Max'])\n# Apply the function to the DataFrame\nNon_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)\n\n\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nNon_Smoke_merged_df['COPD_prevalence_percentage_total'] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].astype(str)\n\n# Apply the function to the DataFrame\nNon_Smoke_merged_df[['Average', 'Min', 'Max']] = 
Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_prevalence_percentage_total DataFrame columns:\", Non_Smoke_merged_df.columns)\n\n# Convert the daily ozone into yearly data\ngreat_ozone_combined_df['year'] = great_ozone_combined_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = great_ozone_combined_df.groupby(['State Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\n#stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\n#stats_df['county'] = stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nNon_Smoke_merged_df['year'] = Non_Smoke_merged_df['year'].astype(int)\n\n# Check unique values in each DataFrame for the merge columns\nprint(\"Unique State Names in Non_Smoke_merged_df:\")\nprint(Non_Smoke_merged_df['State Name'].unique())\n\nprint(\"Unique State Names in stats_df:\")\nprint(stats_df['State Name'].unique())\n\nprint(\"Unique Years in Non_Smoke_merged_df:\")\nprint(Non_Smoke_merged_df['year'].unique())\n\nprint(\"Unique Years in stats_df:\")\nprint(stats_df['year'].unique())\n\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", Non_Smoke_merged_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\ntotalCOPD_ozone_merged_df = pd.merge(Non_Smoke_merged_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", 
totalCOPD_ozone_merged_df.head())\n\n# Save to a CSV file\ntotalCOPD_ozone_merged_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv', index=False)\n\n\nprint(totalCOPD_ozone_merged_df)\nprint(totalCOPD_ozone_merged_df.columns)\n\n\n\n",
   "lang" : "python",
   "owner" : "111111",
   "confidential" : "FALSE"
 },{
-  "id" : "hjbur5",
-  "name" : "2. Incidence COPD Data Analysis",
+  "id" : "3snnwc",
+  "name" : "totalCOPD_breathright2",
   "description" : null,
-  "code" : "import pandas as pd\nimport geopandas as gpd\nimport matplotlib.pyplot as plt\nfrom esda.moran import Moran, Moran_Local\nfrom libpysal.weights import Queen\nfrom splot.esda import lisa_cluster\n\n# Load the merged dataset\nCOPD_incidence_merged_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv')\n\n# Load the shapefile into a GeoDataFrame\nCOPD_incidence_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp')\n\n# Rename the column 'val' to 'COPD Incidence'\nCOPD_incidence_gdf = COPD_incidence_gdf.rename(columns={'val': 'COPD Incidence'})\n\n\n# Verify the loaded GeoDataFrame\nprint(COPD_incidence_gdf.head())\nprint(COPD_incidence_gdf.columns)\nprint(COPD_incidence_gdf.crs)  # Check the Coordinate Reference System\n\n# Descriptive Statistics\nsummary_stats = COPD_incidence_gdf.describe()\nprint(summary_stats)\n\n# Plot distribution of PM2.5 levels and Ozone levels\nplt.figure(figsize=(12, 6))\n\nplt.subplot(1, 2, 1)\nCOPD_incidence_gdf['PM2.5'].hist(bins=20)\nplt.title('Distribution of PM2.5 Levels')\nplt.xlabel('PM2.5')\nplt.ylabel('Frequency')\n\nplt.subplot(1, 2, 2)\nCOPD_incidence_gdf['Ozone'].hist(bins=20)\nplt.title('Distribution of Ozone Levels')\nplt.xlabel('Ozone')\nplt.ylabel('Frequency')\n\nplt.tight_layout()\nplt.show()\n\n\n# Scatter plot of PM2.5 vs. Mortality\nplt.figure(figsize=(12, 6))\n\nplt.subplot(1, 2, 1)\nplt.scatter(COPD_incidence_gdf['PM2.5'], COPD_incidence_gdf['COPD Incidence'], alpha=0.5, edgecolors='w', s=80)\nplt.title('PM2.5 vs. COPD Incidence')\nplt.xlabel('PM2.5')\nplt.ylabel('COPD Incidence')\n\n\n# Scatter plot of Ozone vs. Mortality\nplt.subplot(1, 2, 2)\nplt.scatter(COPD_incidence_gdf['Ozone'], COPD_incidence_gdf['COPD Incidence'], alpha=0.5, edgecolors='w', s=80)\nplt.title('Ozone vs. 
COPD Incidence')\nplt.xlabel('Ozone')\nplt.ylabel('COPD Incidence')\n\nplt.tight_layout()\nplt.show()\n\n\n# Add an ID column to the GeoDataFrame to use it on GeoDa\nCOPD_incidence_gdf['ID'] = range(1, len(COPD_incidence_gdf) + 1)\n\nprint(COPD_incidence_gdf.columns)\n\n# Define the path to the shapefile\nshapefile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp'\n\n# Save the updated GeoDataFrame to the shapefile\nCOPD_incidence_gdf.to_file(shapefile_path)\n\n# Verify the file is saved\nprint(f\"Updated shapefile saved to {shapefile_path}\")\n\n\n\n\n\n\n\n\n",
+  "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\n#file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'\n#totalCOPD_ozone_merged_df = pd.read_csv(file_path)\n\n\n# Verify the data types after conversion\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n\n# Display the first few rows of the DataFrame\n#print(totalCOPD_ozone_merged_df.head())\n#print(totalCOPD_ozone_merged_df.columns)\n\n# Assuming 'Min_x' and 'Min_y' are already in the DataFrame\n#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']\n\n# Display the first few rows to check the new column\n#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics for the difference\n#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()\n\n# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0\n#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})\n\n# Recalculate the difference\n#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']\n\n# Display the updated DataFrame\n#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics\n#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()\n\n#print(f\"Mean of the difference: {diff_mean}\")\n#print(f\"Median of the difference: {diff_median}\")\n#print(f\"Standard 
deviation of the difference: {diff_std}\")\n\n# Drop the Min_x column\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])\n\n# Rename columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})\n\n\n#def do_breatheright_correlation_analysis():\n    # Read in the merged CSV file with ozone and lung disease data\n    #totalCOPD_ozone_merged_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv\")\n    #print(totalCOPD_ozone_merged_df.head())\n    #print(totalCOPD_ozone_merged_df.columns)\n\n# Drop the unnecessary columns\n# ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(nosmokeCOPD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to lowercase\n#totalCOPD_ozone_merged_df['State Name'] = totalCOPD_ozone_merged_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\n#totalCOPD_ozone_merged_df['state'] = totalCOPD_ozone_merged_df['state'].str.lower()\n\n# Get 
unique values in 'State Name' and 'state'\n#state_name_values = set(totalCOPD_ozone_merged_df['State Name'].dropna().unique())\n#state_values = set(totalCOPD_ozone_merged_df['state'].dropna().unique())\n\n# Find differences\n#diff_state_name_not_in_state = state_name_values - state_values\n#diff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\n#print(\"Values in 'State Name' but not in 'state':\")\n#print(diff_state_name_not_in_state)\n\n#print(\"\\nValues in 'state' but not in 'State Name':\")\n#print(diff_state_not_in_state_name)\n\n# Check for null values\n#print(\"\\nNull values in 'State Name':\")\n#print(totalCOPD_ozone_merged_df['State Name'].isnull().sum())\n\n#print(\"\\nNull values in 'state':\")\n#print(totalCOPD_ozone_merged_df['state'].isnull().sum())\n\n# Drop the 'state' column\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])\n\n# Renaming columns for clarity\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={\n    #'Max': 'ozone_max',\n    #'Min': 'ozone_min',\n    #'Mean': 'ozone_mean',\n    #'Median': 'ozone_median',\n    #'Std': 'ozone_std',\n#})\n\n# Calculate correlations\n#correlation_matrix = totalCOPD_ozone_merged_df[[\n    #'Average', 'Min', 'Max',\n    #'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n#]].corr()\n\n# Convert columns to numeric where applicable\n#numeric_columns = ['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']\n#totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')\n\n# Verify the data types after conversion\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n# Save correlation matrix to CSV\n#correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\n#plt.figure(figsize=(12, 
13))\n#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\n#plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')\n#plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')\n# plt.show()\n\n\n# Check for missing values in relevant columns\n#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())\n\n\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n\n#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))\n\n\n\n\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'\ntotalCOPD_ozone_merged_df = pd.read_csv(file_path)\n\n# Verify the data types after conversion\nprint(totalCOPD_ozone_merged_df.dtypes)\n\n# Display the first few rows of the DataFrame\nprint(totalCOPD_ozone_merged_df.head())\nprint(totalCOPD_ozone_merged_df.columns)\n\n# Assuming 'Min_x' and 'Min_y' are already in the DataFrame\n#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']\n\n# Display the first few rows to check the new column\n#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics for the difference\n#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()\n\n# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0\n#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})\n\n# Recalculate the 
difference\n#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']\n\n# Display the updated DataFrame\n#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics\n#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()\n\n#print(f\"Mean of the difference: {diff_mean}\")\n#print(f\"Median of the difference: {diff_median}\")\n#print(f\"Standard deviation of the difference: {diff_std}\")\n\n# Drop the Min_x and Max_x columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])\n\n# Rename columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})\n\n# Ensure the renaming has taken place\nprint(totalCOPD_ozone_merged_df.columns)\n\n# Drop the 'state' column\ntotalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])\n\n# Renaming columns for clarity\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={\n    #'Max': 'ozone_max',\n    #'Min': 'ozone_min',\n    #'Mean': 'ozone_mean',\n    #'Median': 'ozone_median',\n    #'Std': 'ozone_std',\n#})\n\n# Convert columns to numeric where applicable\nnumeric_columns = ['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']\ntotalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')\n\n# Verify the data types after conversion\nprint(totalCOPD_ozone_merged_df.dtypes)\n\n# Calculate correlations\ncorrelation_matrix = totalCOPD_ozone_merged_df[[\n    'Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n]].corr()\n\n# Save correlation matrix to 
CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')\n# plt.show()\n\n# Check for missing values in relevant columns\nprint(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())\n\nprint(totalCOPD_ozone_merged_df.dtypes)\n\nprint(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))\n\n\n\n",
   "lang" : "python",
   "owner" : "111111",
   "confidential" : "FALSE"
diff --git a/totalCOPD_breathright1.py b/totalCOPD_breathright1.py
new file mode 100644
index 0000000..8e1d251
--- /dev/null
+++ b/totalCOPD_breathright1.py
@@ -0,0 +1,126 @@
+import os
+import pandas as pd
+import re
+
+# COPD CSV path (loaded further down) plus the daily PM2.5 and ozone tables, which are loaded here
+Non_Smoke_merged_df = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/Non_Smoke_merged_df.csv"
+great_pm25_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_pm25.csv', low_memory=False)
+great_ozone_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_ozone.csv', low_memory=False)
+
+# Load the COPD CSV into memory; Non_Smoke_merged_df changes from the path string to the DataFrame
+#great_ozone_combined_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
+Non_Smoke_merged_df = pd.read_csv(Non_Smoke_merged_df)
+#great_pm25_combined_df = pd.read_csv(great_pm25_combined_df)
+
+print(Non_Smoke_merged_df.columns)
+print(Non_Smoke_merged_df['COPD_prevalence_percentage_total'].head())
+
+
+# Convert 'Date Local' to datetime format
+great_ozone_combined_df['Date Local'] = pd.to_datetime(great_ozone_combined_df['Date Local'], errors='coerce')
+
+print(Non_Smoke_merged_df.columns)
+
+# Rename columns to be consistent
+#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)
+
+# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
+#COPD_incidence_long = COPD_incidence_df.melt(
+    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
+    #var_name='year',
+    #value_name='Mortality'
+#)
+
+# Extract the year from the 'year' column using string operations and convert it to an integer.
+#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int)
+
+# Function to split the COPD_prevalence_percentage_total column into Average, Min and Max
+import pandas as pd
+
+def split_COPD_prevalence_percentage_total_column_into_three(COPD_prevalence_percentage_total):
+    """Split 'avg (min, max)' text, or a lone number, into an Average/Min/Max Series.
+
+    Missing or unparseable input yields a Series of three None values. Whole
+    numbers ('12 (10, 14)') and uneven spacing are accepted, which the earlier
+    decimal-only pattern silently turned into missing values.
+    """
+    columns = ['Average', 'Min', 'Max']
+    if pd.isna(COPD_prevalence_percentage_total):
+        return pd.Series([None, None, None], index=columns)
+    text = str(COPD_prevalence_percentage_total).strip()
+    number = r'(\d+(?:\.\d+)?)'  # digits with an optional fractional part
+    match = re.match(rf'{number}\s*\(\s*{number}\s*,\s*{number}\s*\)', text)
+    try:
+        parts = match.groups() if match else (text, text, text)
+        return pd.Series([float(part) for part in parts], index=columns)
+    except ValueError:  # neither the range format nor a plain number
+        return pd.Series([None, None, None], index=columns)
+# Apply the function to the DataFrame
+Non_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)
+
+
+
+# Cast 'COPD_prevalence_percentage_total' to string (NaN becomes the text 'nan')
+Non_Smoke_merged_df['COPD_prevalence_percentage_total'] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].astype(str)
+
+# Re-apply the split now that the column holds strings (repeats the earlier apply)
+Non_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)
+
+# Disabled: drop of the original source column (refers to a DataFrame this script never defines)
+#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])
+
+print("COPD_prevalence_percentage_total DataFrame columns:", Non_Smoke_merged_df.columns)
+
+# Convert the daily ozone into yearly data
+great_ozone_combined_df['year'] = great_ozone_combined_df['Date Local'].dt.year
+
+# Group the daily ozone 'Arithmetic Mean' readings by state and year
+grouped = great_ozone_combined_df.groupby(['State Name', 'year'])['Arithmetic Mean']
+
+# Compute statistics
+stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
+
+# Rename columns for clarity (disabled: columns keep the agg names max/min/mean/median/std)
+#stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']
+
+# Add a lowercase 'state' column and make 'year' an integer in both DataFrames
+#stats_df['county'] = stats_df['County Name'].str.lower()
+stats_df['state'] = stats_df['State Name'].str.lower()
+stats_df['year'] = stats_df['year'].astype(int)
+
+#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()
+Non_Smoke_merged_df['year'] = Non_Smoke_merged_df['year'].astype(int)
+
+# Check unique values in each DataFrame for the merge columns
+print("Unique State Names in Non_Smoke_merged_df:")
+print(Non_Smoke_merged_df['State Name'].unique())
+
+print("Unique State Names in stats_df:")
+print(stats_df['State Name'].unique())
+
+print("Unique Years in Non_Smoke_merged_df:")
+print(Non_Smoke_merged_df['year'].unique())
+
+print("Unique Years in stats_df:")
+print(stats_df['year'].unique())
+
+
+# Print the results
+print("Ozone aggregated yearly data:", stats_df)
+print("COPD Incidence data header:", Non_Smoke_merged_df.head())
+
+# Inner-merge the yearly ozone statistics into Non_Smoke_merged_df on 'State Name' and 'year'
+totalCOPD_ozone_merged_df = pd.merge(Non_Smoke_merged_df, stats_df, on=['State Name', 'year'], how='inner')
+
+print("Merged DataFrame:", totalCOPD_ozone_merged_df.head())
+
+# Save to a CSV file
+totalCOPD_ozone_merged_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv', index=False)
+
+
+print(totalCOPD_ozone_merged_df)
+print(totalCOPD_ozone_merged_df.columns)
+
+
+
+
diff --git a/totalCOPD_breathright2.py b/totalCOPD_breathright2.py
new file mode 100644
index 0000000..d00091a
--- /dev/null
+++ b/totalCOPD_breathright2.py
@@ -0,0 +1,256 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the CSV file into a DataFrame
+#file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'
+#totalCOPD_ozone_merged_df = pd.read_csv(file_path)
+
+
+# Verify the data types after conversion
+#print(totalCOPD_ozone_merged_df.dtypes)
+
+
+# Display the first few rows of the DataFrame
+#print(totalCOPD_ozone_merged_df.head())
+#print(totalCOPD_ozone_merged_df.columns)
+
+# Assuming 'Min_x' and 'Min_y' are already in the DataFrame
+#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']
+
+# Display the first few rows to check the new column
+#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())
+
+# Calculate basic statistics for the difference
+#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()
+#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()
+#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()
+
+# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0
+#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})
+
+# Recalculate the difference
+#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']
+
+# Display the updated DataFrame
+#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())
+
+# Calculate basic statistics
+#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()
+#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()
+#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()
+
+#print(f"Mean of the difference: {diff_mean}")
+#print(f"Median of the difference: {diff_median}")
+#print(f"Standard deviation of the difference: {diff_std}")
+
+# Drop the Min_x column
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])
+
+# Rename columns
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})
+
+
+#def do_breatheright_correlation_analysis():
+    # Read in the merged CSV file with ozone and lung disease data
+    #totalCOPD_ozone_merged_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv")
+    #print(totalCOPD_ozone_merged_df.head())
+    #print(totalCOPD_ozone_merged_df.columns)
+
+# Drop the unnecessary columns
+# ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y'])
+
+# Figuring out which columns to drop
+# Values in 'county_x' but not in 'county_y'
+#county_x_not_in_county_y = set(nosmokeCOPD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))
+#print("Values in 'county_x' but not in 'county_y':")
+#print(county_x_not_in_county_y)
+
+# Values in 'county_y' but not in 'county_x'
+#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))
+#print("\nValues in 'county_y' but not in 'county_x':")
+#print(county_y_not_in_county_x)
+
+# Check for null values
+#print("\nNull values in 'county_x':")
+#print(ILD_merged_pm25_df['county_x'].isnull().sum())
+
+#print("\nNull values in 'county_y':")
+#print(ILD_merged_pm25_df['county_y'].isnull().sum())
+
+# Drop the 'county_x' column
+#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])
+
+# Convert state names in 'State Name' to lowercase
+#totalCOPD_ozone_merged_df['State Name'] = totalCOPD_ozone_merged_df['State Name'].str.lower()
+
+# Convert state names in 'state' to lowercase
+#totalCOPD_ozone_merged_df['state'] = totalCOPD_ozone_merged_df['state'].str.lower()
+
+# Get unique values in 'State Name' and 'state'
+#state_name_values = set(totalCOPD_ozone_merged_df['State Name'].dropna().unique())
+#state_values = set(totalCOPD_ozone_merged_df['state'].dropna().unique())
+
+# Find differences
+#diff_state_name_not_in_state = state_name_values - state_values
+#diff_state_not_in_state_name = state_values - state_name_values
+
+# Print the differences
+#print("Values in 'State Name' but not in 'state':")
+#print(diff_state_name_not_in_state)
+
+#print("\nValues in 'state' but not in 'State Name':")
+#print(diff_state_not_in_state_name)
+
+# Check for null values
+#print("\nNull values in 'State Name':")
+#print(totalCOPD_ozone_merged_df['State Name'].isnull().sum())
+
+#print("\nNull values in 'state':")
+#print(totalCOPD_ozone_merged_df['state'].isnull().sum())
+
+# Drop the 'state' column
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])
+
+# Renaming columns for clarity
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={
+    #'Max': 'ozone_max',
+    #'Min': 'ozone_min',
+    #'Mean': 'ozone_mean',
+    #'Median': 'ozone_median',
+    #'Std': 'ozone_std',
+#})
+
+# Calculate correlations
+#correlation_matrix = totalCOPD_ozone_merged_df[[
+    #'Average', 'Min', 'Max',
+    #'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'
+#]].corr()
+
+# Convert columns to numeric where applicable
+#numeric_columns = ['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']
+#totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')
+
+# Verify the data types after conversion
+#print(totalCOPD_ozone_merged_df.dtypes)
+
+# Save correlation matrix to CSV
+#correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+#plt.figure(figsize=(12, 13))
+#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+#plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')
+#plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')
+# plt.show()
+
+
+# Check for missing values in relevant columns
+#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())
+
+
+#print(totalCOPD_ozone_merged_df.dtypes)
+
+
+#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))
+
+
+
+
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the CSV file into a DataFrame
+file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'
+totalCOPD_ozone_merged_df = pd.read_csv(file_path)
+
+# Inspect the data types as loaded from the CSV
+print(totalCOPD_ozone_merged_df.dtypes)
+
+# Display the first few rows of the DataFrame
+print(totalCOPD_ozone_merged_df.head())
+print(totalCOPD_ozone_merged_df.columns)
+
+# Assuming 'Min_x' and 'Min_y' are already in the DataFrame
+#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']
+
+# Display the first few rows to check the new column
+#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())
+
+# Calculate basic statistics for the difference
+#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()
+#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()
+#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()
+
+# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0
+#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})
+
+# Recalculate the difference
+#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']
+
+# Display the updated DataFrame
+#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())
+
+# Calculate basic statistics
+#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()
+#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()
+#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()
+
+#print(f"Mean of the difference: {diff_mean}")
+#print(f"Median of the difference: {diff_median}")
+#print(f"Standard deviation of the difference: {diff_std}")
+
+# Drop the Min_x and Max_x columns
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])
+
+# Rename columns
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})
+
+# Inspect the column names (the rename above is currently commented out)
+print(totalCOPD_ozone_merged_df.columns)
+
+# Drop the 'state' column
+totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])
+
+# Renaming columns for clarity
+#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={
+    #'Max': 'ozone_max',
+    #'Min': 'ozone_min',
+    #'Mean': 'ozone_mean',
+    #'Median': 'ozone_median',
+    #'Std': 'ozone_std',
+#})
+
+# Convert columns to numeric where applicable
+numeric_columns = ['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']
+totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')
+
+# Verify the data types after conversion
+print(totalCOPD_ozone_merged_df.dtypes)
+
+# Calculate correlations
+correlation_matrix = totalCOPD_ozone_merged_df[[
+    'Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'
+]].corr()
+
+# Save correlation matrix to CSV
+correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+plt.figure(figsize=(12, 13))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')
+plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')
+# plt.show()
+
+# Check for missing values in relevant columns
+print(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())
+
+print(totalCOPD_ozone_merged_df.dtypes)
+
+print(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))
+
+
+
+

From 932e1f30cd5e02893d4fd1aa7eacd69bfa7095c6 Mon Sep 17 00:00:00 2001
From: IK-173 <162045008+IK-173@users.noreply.github.com>
Date: Fri, 16 Aug 2024 17:26:16 -0400
Subject: [PATCH 2/2] COPD_Incid_pm25_ozone_heatmap_workflows

---
 COPD_Incid_ozone_breathright_correlation.py | 102 ++++++++++++++++++++
 COPD_Incid_ozone_breathright_data_prep.py   |  91 +++++++++++++++++
 COPD_Incid_pm25_breathright_correlation.py  | 102 ++++++++++++++++++++
 COPD_Incid_pm25_breathright_data_prep.py    |  91 +++++++++++++++++
 process.json                                |  28 ++++--
 5 files changed, 408 insertions(+), 6 deletions(-)
 create mode 100644 COPD_Incid_ozone_breathright_correlation.py
 create mode 100644 COPD_Incid_ozone_breathright_data_prep.py
 create mode 100644 COPD_Incid_pm25_breathright_correlation.py
 create mode 100644 COPD_Incid_pm25_breathright_data_prep.py

diff --git a/COPD_Incid_ozone_breathright_correlation.py b/COPD_Incid_ozone_breathright_correlation.py
new file mode 100644
index 0000000..292a532
--- /dev/null
+++ b/COPD_Incid_ozone_breathright_correlation.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the CSV file into a DataFrame
+file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv'
+COPD_Incid_merged_ozone_df = pd.read_csv(file_path)
+
+COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])
+
+
+
+# Display the first few rows of the DataFrame
+print(COPD_Incid_merged_ozone_df.head())
+print(COPD_Incid_merged_ozone_df.columns)
+
+def do_breatheright_correlation_analysis():
+    # Read in the merged CSV file with ozone and lung disease data
+    COPD_Incid_merged_ozone_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv")
+    print(COPD_Incid_merged_ozone_df.head())
+    print(COPD_Incid_merged_ozone_df.columns)
+
+# Drop the unnecessary columns
+# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y'])
+
+# Figuring out which columns to drop
+# Values in 'county_x' but not in 'county_y'
+#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))
+#print("Values in 'county_x' but not in 'county_y':")
+#print(county_x_not_in_county_y)
+
+# Values in 'county_y' but not in 'county_x'
+#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))
+#print("\nValues in 'county_y' but not in 'county_x':")
+#print(county_y_not_in_county_x)
+
+# Check for null values
+#print("\nNull values in 'county_x':")
+#print(ILD_merged_pm25_df['county_x'].isnull().sum())
+
+#print("\nNull values in 'county_y':")
+#print(ILD_merged_pm25_df['county_y'].isnull().sum())
+
+# Drop the 'county_x' column
+#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])
+
+# Convert state names in 'State Name' to lowercase
+COPD_Incid_merged_ozone_df['State Name'] = COPD_Incid_merged_ozone_df['State Name'].str.lower()
+
+# Convert state names in 'state' to lowercase
+COPD_Incid_merged_ozone_df['state'] = COPD_Incid_merged_ozone_df['state'].str.lower()
+
+# Get unique values in 'State Name' and 'state'
+state_name_values = set(COPD_Incid_merged_ozone_df['State Name'].dropna().unique())
+state_values = set(COPD_Incid_merged_ozone_df['state'].dropna().unique())
+
+# Find differences
+diff_state_name_not_in_state = state_name_values - state_values
+diff_state_not_in_state_name = state_values - state_name_values
+
+# Print the differences
+print("Values in 'State Name' but not in 'state':")
+print(diff_state_name_not_in_state)
+
+print("\nValues in 'state' but not in 'State Name':")
+print(diff_state_not_in_state_name)
+
+# Check for null values
+print("\nNull values in 'State Name':")
+print(COPD_Incid_merged_ozone_df['State Name'].isnull().sum())
+
+print("\nNull values in 'state':")
+print(COPD_Incid_merged_ozone_df['state'].isnull().sum())
+
+# Drop the 'state' column
+COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['state'])
+
+# Renaming columns for clarity
+COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.rename(columns={
+    'Max': 'ozone_max',
+    'Min': 'ozone_min',
+    'Mean': 'ozone_mean',
+    'Median': 'ozone_median',
+    'Std': 'ozone_std',
+})
+
+# Calculate correlations
+correlation_matrix = COPD_Incid_merged_ozone_df[[
+    'COPD_average', 'lower', 'upper',
+    'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'
+]].corr()
+
+# Save correlation matrix to CSV
+correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+plt.figure(figsize=(12, 13))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+plt.title('COPD Incidence vs. Ozone Correlation Heatmap')
+plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_ozone.png')
+# plt.show()
+
diff --git a/COPD_Incid_ozone_breathright_data_prep.py b/COPD_Incid_ozone_breathright_data_prep.py
new file mode 100644
index 0000000..7aa5124
--- /dev/null
+++ b/COPD_Incid_ozone_breathright_data_prep.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+import re
+
+# Paths to your data
+ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
+COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
+pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"
+
+# Read all the csv into pandas dataframe in memory
+ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
+COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)
+pm25_df = pd.read_csv(pm25_data_path)
+
+print(COPD_incidence_df.columns)
+
+# Convert 'Date Local' to datetime format
+ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce')
+
+# Rename columns to be consistent
+#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)
+
+# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
+#COPD_incidence_long = COPD_incidence_df.melt(
+    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
+    #var_name='year',
+    #value_name='Mortality'
+#)
+
+# Extract the year from the 'year' column using string operations and convert it to an integer.
+#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int)
+
+# Split a "value (min, max)" string into COPD_average, COPD_min and COPD_max floats
+def split_COPD_Incidence_column_into_three(COPD_Incidence):
+    match = re.match(r'(\d+\.\d+) \((\d+\.\d+), (\d+\.\d+)\)', COPD_Incidence)
+    if match:
+        avg, min_val, max_val = match.groups()
+        return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])
+    else:
+        return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])
+
+# Cast 'COPD Incidence' to str so re.match accepts every row (NaN becomes 'nan')
+COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)
+
+
+# Apply the function to split the 'COPD Incidence' column
+COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)
+
+# Drop the original 'COPD Incidence' column if no longer needed
+#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])
+
+print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)
+
+# Convert the daily ozone into yearly data
+ozone_df['year'] = ozone_df['Date Local'].dt.year
+
+# Group the ozone readings by state, county and year
+grouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']
+
+# Compute statistics
+stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
+
+# Rename columns for clarity
+stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']
+
+# Add lowercase 'county'/'state' columns and make 'year' an integer in both DataFrames
+stats_df['county'] = stats_df['County Name'].str.lower()
+stats_df['state'] = stats_df['State Name'].str.lower()
+stats_df['year'] = stats_df['year'].astype(int)
+
+#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()
+COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)
+
+# Print the results
+print("Ozone aggregated yearly data:", stats_df)
+print("COPD Incidence data header:", COPD_incidence_df.head())
+
+# Merge the yearly ozone statistics with the COPD incidence DataFrame on state and year
+COPD_Incid_merged_ozone_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')
+
+print("Merged DataFrame:", COPD_Incid_merged_ozone_df.head())
+
+# Save to a CSV file
+COPD_Incid_merged_ozone_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv', index=False)
+
+
+
+
+
+
+
diff --git a/COPD_Incid_pm25_breathright_correlation.py b/COPD_Incid_pm25_breathright_correlation.py
new file mode 100644
index 0000000..a318ccf
--- /dev/null
+++ b/COPD_Incid_pm25_breathright_correlation.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the CSV file into a DataFrame
+file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv'
+COPD_Incid_merged_pm25_df = pd.read_csv(file_path)
+
+COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])
+
+
+
+# Display the first few rows of the DataFrame
+print(COPD_Incid_merged_pm25_df.head())
+print(COPD_Incid_merged_pm25_df.columns)
+
+def do_breatheright_correlation_analysis():
+    # Read in the merged CSV file with ozone and lung disease data
+    COPD_Incid_merged_pm25_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv")
+    print(COPD_Incid_merged_pm25_df.head())
+    print(COPD_Incid_merged_pm25_df.columns)
+
+# Drop the unnecessary columns
+# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y'])
+
+# Figuring out which columns to drop
+# Values in 'county_x' but not in 'county_y'
+#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))
+#print("Values in 'county_x' but not in 'county_y':")
+#print(county_x_not_in_county_y)
+
+# Values in 'county_y' but not in 'county_x'
+#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))
+#print("\nValues in 'county_y' but not in 'county_x':")
+#print(county_y_not_in_county_x)
+
+# Check for null values
+#print("\nNull values in 'county_x':")
+#print(ILD_merged_pm25_df['county_x'].isnull().sum())
+
+#print("\nNull values in 'county_y':")
+#print(ILD_merged_pm25_df['county_y'].isnull().sum())
+
+# Drop the 'county_x' column
+#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])
+
+# Convert state names in 'State Name' to lowercase
+COPD_Incid_merged_pm25_df['State Name'] = COPD_Incid_merged_pm25_df['State Name'].str.lower()
+
+# Convert state names in 'state' to lowercase
+COPD_Incid_merged_pm25_df['state'] = COPD_Incid_merged_pm25_df['state'].str.lower()
+
+# Get unique values in 'State Name' and 'state'
+state_name_values = set(COPD_Incid_merged_pm25_df['State Name'].dropna().unique())
+state_values = set(COPD_Incid_merged_pm25_df['state'].dropna().unique())
+
+# Find differences
+diff_state_name_not_in_state = state_name_values - state_values
+diff_state_not_in_state_name = state_values - state_name_values
+
+# Print the differences
+print("Values in 'State Name' but not in 'state':")
+print(diff_state_name_not_in_state)
+
+print("\nValues in 'state' but not in 'State Name':")
+print(diff_state_not_in_state_name)
+
+# Check for null values
+print("\nNull values in 'State Name':")
+print(COPD_Incid_merged_pm25_df['State Name'].isnull().sum())
+
+print("\nNull values in 'state':")
+print(COPD_Incid_merged_pm25_df['state'].isnull().sum())
+
+# Drop the 'state' column
+COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['state'])
+
+# Renaming columns for clarity
+COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.rename(columns={
+    'Max': 'pm25_max',
+    'Min': 'pm25_min',
+    'Mean': 'pm25_mean',
+    'Median': 'pm25_median',
+    'Std': 'pm25_std',
+})
+
+# Calculate correlations
+correlation_matrix = COPD_Incid_merged_pm25_df[[
+    'COPD_average', 'lower', 'upper',
+    'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std'
+]].corr()
+
+# Save correlation matrix to CSV
+correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+plt.figure(figsize=(12, 13))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+plt.title('COPD Incidence vs. PM2.5 Correlation Heatmap')
+plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_pm25.png')
+# plt.show()
+
diff --git a/COPD_Incid_pm25_breathright_data_prep.py b/COPD_Incid_pm25_breathright_data_prep.py
new file mode 100644
index 0000000..23b87a8
--- /dev/null
+++ b/COPD_Incid_pm25_breathright_data_prep.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+import re
+
+# Paths to your data
+ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
+COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
+pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"
+
+# Read all the csv into pandas dataframe in memory
+ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
+COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)
+pm25_df = pd.read_csv(pm25_data_path)
+
+print(COPD_incidence_df.columns)
+
+# Convert 'Date Local' to datetime format
+pm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce')
+
+# Rename columns to be consistent
+#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)
+
+# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
+#COPD_incidence_long = COPD_incidence_df.melt(
+    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
+    #var_name='year',
+    #value_name='Mortality'
+#)
+
+# Extract the year from the 'year' column using string operations and convert it to an integer.
+#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int)
+
+# Split a "value (min, max)" string into COPD_average, COPD_min and COPD_max floats
+def split_COPD_Incidence_column_into_three(COPD_Incidence):
+    match = re.match(r'(\d+\.\d+) \((\d+\.\d+), (\d+\.\d+)\)', COPD_Incidence)
+    if match:
+        avg, min_val, max_val = match.groups()
+        return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])
+    else:
+        return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])
+
+# Cast 'COPD Incidence' to str so re.match accepts every row (NaN becomes 'nan')
+COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)
+
+
+# Apply the function to split the 'COPD Incidence' column
+COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)
+
+# Drop the original 'COPD Incidence' column if no longer needed
+#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])
+
+print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)
+
+# Convert the daily PM2.5 data into yearly data
+pm25_df['year'] = pm25_df['Date Local'].dt.year
+
+# Group the PM2.5 readings by state, county and year
+grouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']
+
+# Compute statistics
+stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
+
+# Rename columns for clarity
+stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']
+
+# Add lowercase 'county'/'state' columns and make 'year' an integer in both DataFrames
+stats_df['county'] = stats_df['County Name'].str.lower()
+stats_df['state'] = stats_df['State Name'].str.lower()
+stats_df['year'] = stats_df['year'].astype(int)
+
+#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()
+COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)
+
+# Print the results
+print("Ozone aggregated yearly data:", stats_df)
+print("COPD Incidence data header:", COPD_incidence_df.head())
+
+# Merge the yearly PM2.5 statistics with the COPD incidence DataFrame on state and year
+COPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')
+
+print("Merged DataFrame:", COPD_Incid_merged_pm25_df.head())
+
+# Save to a CSV file
+COPD_Incid_merged_pm25_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False)
+
+
+
+
+
+
+
diff --git a/process.json b/process.json
index b341986..13f0766 100644
--- a/process.json
+++ b/process.json
@@ -1,16 +1,32 @@
 [{
-  "id" : "74c3mf",
-  "name" : "totalCOPD_breathright1",
+  "id" : "fzdydc",
+  "name" : "COPD_Incid_pm25_breathright_data_prep",
   "description" : null,
-  "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nNon_Smoke_merged_df = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/Non_Smoke_merged_df.csv\"\ngreat_pm25_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_pm25.csv', low_memory=False)\ngreat_ozone_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_ozone.csv', low_memory=False)\n\n# Read all the csv into pandas dataframe in memory\n#great_ozone_combined_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nNon_Smoke_merged_df = pd.read_csv(Non_Smoke_merged_df)\n#great_pm25_combined_df = pd.read_csv(great_pm25_combined_df)\n\nprint(Non_Smoke_merged_df.columns)\nprint(Non_Smoke_merged_df['COPD_prevalence_percentage_total'].head())\n\n\n# Convert 'Date Local' to datetime format\ngreat_ozone_combined_df['Date Local'] = pd.to_datetime(great_ozone_combined_df['Date Local'], errors='coerce')\n\nprint(Non_Smoke_merged_df.columns)\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. 
This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n    #var_name='year',\n    #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\nimport pandas as pd\n\ndef split_COPD_prevalence_percentage_total_column_into_three(COPD_prevalence_percentage_total):\n    if pd.isna(COPD_prevalence_percentage_total):\n        return pd.Series([None, None, None], index=['Average', 'Min', 'Max'])\n    \n    COPD_prevalence_percentage_total = str(COPD_prevalence_percentage_total)\n    \n    # Check for range format\n    match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_prevalence_percentage_total)\n    if match:\n        avg, min_val, max_val = match.groups()\n        return pd.Series([float(avg), float(min_val), float(max_val)], index=['Average', 'Min', 'Max'])\n    \n    # Handle single value format\n    try:\n        avg = float(COPD_prevalence_percentage_total)\n        return pd.Series([avg, avg, avg], index=['Average', 'Min', 'Max'])\n    except ValueError:\n        return pd.Series([None, None, None], index=['Average', 'Min', 'Max'])\n# Apply the function to the DataFrame\nNon_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)\n\n\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nNon_Smoke_merged_df['COPD_prevalence_percentage_total'] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].astype(str)\n\n# Apply the function to the DataFrame\nNon_Smoke_merged_df[['Average', 'Min', 'Max']] = 
Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_prevalence_percentage_total DataFrame columns:\", Non_Smoke_merged_df.columns)\n\n# Convert the daily ozone into yearly data\ngreat_ozone_combined_df['year'] = great_ozone_combined_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = great_ozone_combined_df.groupby(['State Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\n#stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\n#stats_df['county'] = stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nNon_Smoke_merged_df['year'] = Non_Smoke_merged_df['year'].astype(int)\n\n# Check unique values in each DataFrame for the merge columns\nprint(\"Unique State Names in Non_Smoke_merged_df:\")\nprint(Non_Smoke_merged_df['State Name'].unique())\n\nprint(\"Unique State Names in stats_df:\")\nprint(stats_df['State Name'].unique())\n\nprint(\"Unique Years in Non_Smoke_merged_df:\")\nprint(Non_Smoke_merged_df['year'].unique())\n\nprint(\"Unique Years in stats_df:\")\nprint(stats_df['year'].unique())\n\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", Non_Smoke_merged_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\ntotalCOPD_ozone_merged_df = pd.merge(Non_Smoke_merged_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", 
totalCOPD_ozone_merged_df.head())\n\n# Save to a CSV file\ntotalCOPD_ozone_merged_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv', index=False)\n\n\nprint(totalCOPD_ozone_merged_df)\nprint(totalCOPD_ozone_merged_df.columns)\n\n\n\n",
+  "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nozone_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv\"\nCOPD_incidence_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv\"\npm25_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv\"\n\n# Read all the csv into pandas dataframe in memory\nozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nCOPD_incidence_df = pd.read_csv(COPD_incidence_data_path)\npm25_df = pd.read_csv(pm25_data_path)\n\nprint(COPD_incidence_df.columns)\n\n# Convert 'Date Local' to datetime format\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce')\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. 
This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n    #var_name='year',\n    #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\ndef split_COPD_Incidence_column_into_three(COPD_Incidence):\n    match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_Incidence)\n    if match:\n        avg, min_val, max_val = match.groups()\n        return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])\n    else:\n        return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nCOPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)\n\n\n# Apply the function to split the 'Mortality' column\nCOPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_Incidence DataFrame columns:\", COPD_incidence_df.columns)\n\n# Convert the daily ozone into yearly data\npm25_df['year'] = pm25_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\nstats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\nstats_df['county'] = 
stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nCOPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", COPD_incidence_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\nCOPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", COPD_Incid_merged_pm25_df.head())\n\n# Save to a CSV file\nCOPD_Incid_merged_pm25_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False)\n\n\n\n\n\n\n",
   "lang" : "python",
   "owner" : "111111",
   "confidential" : "FALSE"
 },{
-  "id" : "3snnwc",
-  "name" : "totalCOPD_breathright2",
+  "id" : "f7jhqc",
+  "name" : "COPD_Incid_pm25_breathright_correlation",
   "description" : null,
-  "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\n#file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'\n#totalCOPD_ozone_merged_df = pd.read_csv(file_path)\n\n\n# Verify the data types after conversion\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n\n# Display the first few rows of the DataFrame\n#print(totalCOPD_ozone_merged_df.head())\n#print(totalCOPD_ozone_merged_df.columns)\n\n# Assuming 'Min_x' and 'Min_y' are already in the DataFrame\n#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']\n\n# Display the first few rows to check the new column\n#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics for the difference\n#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()\n\n# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0\n#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})\n\n# Recalculate the difference\n#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']\n\n# Display the updated DataFrame\n#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics\n#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()\n\n#print(f\"Mean of the difference: {diff_mean}\")\n#print(f\"Median of the difference: {diff_median}\")\n#print(f\"Standard 
deviation of the difference: {diff_std}\")\n\n# Drop the Min_x column\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])\n\n# Rename columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})\n\n\n#def do_breatheright_correlation_analysis():\n    # Read in the merged CSV file with ozone and lung disease data\n    #totalCOPD_ozone_merged_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv\")\n    #print(totalCOPD_ozone_merged_df.head())\n    #print(totalCOPD_ozone_merged_df.columns)\n\n# Drop the unnecessary columns\n# ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(nosmokeCOPD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to lowercase\n#totalCOPD_ozone_merged_df['State Name'] = totalCOPD_ozone_merged_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\n#totalCOPD_ozone_merged_df['state'] = totalCOPD_ozone_merged_df['state'].str.lower()\n\n# Get 
unique values in 'State Name' and 'state'\n#state_name_values = set(totalCOPD_ozone_merged_df['State Name'].dropna().unique())\n#state_values = set(totalCOPD_ozone_merged_df['state'].dropna().unique())\n\n# Find differences\n#diff_state_name_not_in_state = state_name_values - state_values\n#diff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\n#print(\"Values in 'State Name' but not in 'state':\")\n#print(diff_state_name_not_in_state)\n\n#print(\"\\nValues in 'state' but not in 'State Name':\")\n#print(diff_state_not_in_state_name)\n\n# Check for null values\n#print(\"\\nNull values in 'State Name':\")\n#print(totalCOPD_ozone_merged_df['State Name'].isnull().sum())\n\n#print(\"\\nNull values in 'state':\")\n#print(totalCOPD_ozone_merged_df['state'].isnull().sum())\n\n# Drop the 'state' column\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])\n\n# Renaming columns for clarity\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={\n    #'Max': 'ozone_max',\n    #'Min': 'ozone_min',\n    #'Mean': 'ozone_mean',\n    #'Median': 'ozone_median',\n    #'Std': 'ozone_std',\n#})\n\n# Calculate correlations\n#correlation_matrix = totalCOPD_ozone_merged_df[[\n    #'Average', 'Min', 'Max',\n    #'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n#]].corr()\n\n# Convert columns to numeric where applicable\n#numeric_columns = ['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']\n#totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')\n\n# Verify the data types after conversion\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n# Save correlation matrix to CSV\n#correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\n#plt.figure(figsize=(12, 
13))\n#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\n#plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')\n#plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')\n# plt.show()\n\n\n# Check for missing values in relevant columns\n#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())\n\n\n#print(totalCOPD_ozone_merged_df.dtypes)\n\n\n#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))\n\n\n\n\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv'\ntotalCOPD_ozone_merged_df = pd.read_csv(file_path)\n\n# Verify the data types after conversion\nprint(totalCOPD_ozone_merged_df.dtypes)\n\n# Display the first few rows of the DataFrame\nprint(totalCOPD_ozone_merged_df.head())\nprint(totalCOPD_ozone_merged_df.columns)\n\n# Assuming 'Min_x' and 'Min_y' are already in the DataFrame\n#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y']\n\n# Display the first few rows to check the new column\n#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics for the difference\n#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std()\n\n# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0\n#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0})\n\n# Recalculate the 
difference\n#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y']\n\n# Display the updated DataFrame\n#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head())\n\n# Calculate basic statistics\n#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean()\n#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median()\n#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std()\n\n#print(f\"Mean of the difference: {diff_mean}\")\n#print(f\"Median of the difference: {diff_median}\")\n#print(f\"Standard deviation of the difference: {diff_std}\")\n\n# Drop the Min_x and Max_x columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x'])\n\n# Rename columns\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'})\n\n# Ensure the renaming has taken place\nprint(totalCOPD_ozone_merged_df.columns)\n\n# Drop the 'state' column\ntotalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state'])\n\n# Renaming columns for clarity\n#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={\n    #'Max': 'ozone_max',\n    #'Min': 'ozone_min',\n    #'Mean': 'ozone_mean',\n    #'Median': 'ozone_median',\n    #'Std': 'ozone_std',\n#})\n\n# Convert columns to numeric where applicable\nnumeric_columns = ['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']\ntotalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce')\n\n# Verify the data types after conversion\nprint(totalCOPD_ozone_merged_df.dtypes)\n\n# Calculate correlations\ncorrelation_matrix = totalCOPD_ozone_merged_df[[\n    'Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n]].corr()\n\n# Save correlation matrix to 
CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Total Incidence vs. Ozone Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png')\n# plt.show()\n\n# Check for missing values in relevant columns\nprint(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum())\n\nprint(totalCOPD_ozone_merged_df.dtypes)\n\nprint(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10))\n\n\n\n",
+  "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv'\nCOPD_Incid_merged_pm25_df = pd.read_csv(file_path)\n\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])\n\n\n\n# Display the first few rows of the DataFrame\nprint(COPD_Incid_merged_pm25_df.head())\nprint(COPD_Incid_merged_pm25_df.columns)\n\ndef do_breatheright_correlation_analysis():\n    # Read in the merged CSV file with ozone and lung disease data\n    COPD_Incid_merged_pm25_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv\")\n    print(COPD_Incid_merged_pm25_df.head())\n    print(COPD_Incid_merged_pm25_df.columns)\n\n# Drop the unnecessary columns\n# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to 
lowercase\nCOPD_Incid_merged_pm25_df['State Name'] = COPD_Incid_merged_pm25_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\nCOPD_Incid_merged_pm25_df['state'] = COPD_Incid_merged_pm25_df['state'].str.lower()\n\n# Get unique values in 'State Name' and 'state'\nstate_name_values = set(COPD_Incid_merged_pm25_df['State Name'].dropna().unique())\nstate_values = set(COPD_Incid_merged_pm25_df['state'].dropna().unique())\n\n# Find differences\ndiff_state_name_not_in_state = state_name_values - state_values\ndiff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\nprint(\"Values in 'State Name' but not in 'state':\")\nprint(diff_state_name_not_in_state)\n\nprint(\"\\nValues in 'state' but not in 'State Name':\")\nprint(diff_state_not_in_state_name)\n\n# Check for null values\nprint(\"\\nNull values in 'State Name':\")\nprint(COPD_Incid_merged_pm25_df['State Name'].isnull().sum())\n\nprint(\"\\nNull values in 'state':\")\nprint(COPD_Incid_merged_pm25_df['state'].isnull().sum())\n\n# Drop the 'state' column\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['state'])\n\n# Renaming columns for clarity\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.rename(columns={\n    'Max': 'pm25_max',\n    'Min': 'pm25_min',\n    'Mean': 'pm25_mean',\n    'Median': 'pm25_median',\n    'Std': 'pm25_std',\n})\n\n# Calculate correlations\ncorrelation_matrix = COPD_Incid_merged_pm25_df[[\n    'COPD_average', 'lower', 'upper',\n    'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std'\n]].corr()\n\n# Save correlation matrix to CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Incidence vs. 
PM2.5 Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_pm25.png')\n# plt.show()\n",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+},{
+  "id" : "kw5zqo",
+  "name" : "COPD_Incid_ozone_breathright_data_prep",
+  "description" : null,
+  "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nozone_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv\"\nCOPD_incidence_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv\"\npm25_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv\"\n\n# Read all the csv into pandas dataframe in memory\nozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nCOPD_incidence_df = pd.read_csv(COPD_incidence_data_path)\npm25_df = pd.read_csv(pm25_data_path)\n\nprint(COPD_incidence_df.columns)\n\n# Convert 'Date Local' to datetime format\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce')\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. 
This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n    #var_name='year',\n    #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\ndef split_COPD_Incidence_column_into_three(COPD_Incidence):\n    match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_Incidence)\n    if match:\n        avg, min_val, max_val = match.groups()\n        return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])\n    else:\n        return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nCOPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)\n\n\n# Apply the function to split the 'Mortality' column\nCOPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_Incidence DataFrame columns:\", COPD_incidence_df.columns)\n\n# Convert the daily ozone into yearly data\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\nstats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\nstats_df['county'] = 
stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nCOPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", COPD_incidence_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\nCOPD_Incid_merged_ozone_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", COPD_Incid_merged_ozone_df.head())\n\n# Save to a CSV file\nCOPD_Incid_merged_ozone_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv', index=False)\n\n\n\n\n\n\n",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+},{
+  "id" : "ddcab9",
+  "name" : "COPD_Incid_ozone_breathright_correlation",
+  "description" : null,
+  "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv'\nCOPD_Incid_merged_ozone_df = pd.read_csv(file_path)\n\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])\n\n\n\n# Display the first few rows of the DataFrame\nprint(COPD_Incid_merged_ozone_df.head())\nprint(COPD_Incid_merged_ozone_df.columns)\n\ndef do_breatheright_correlation_analysis():\n    # Read in the merged CSV file with ozone and lung disease data\n    COPD_Incid_merged_ozone_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv\")\n    print(COPD_Incid_merged_ozone_df.head())\n    print(COPD_Incid_merged_ozone_df.columns)\n\n# Drop the unnecessary columns\n# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to 
lowercase\nCOPD_Incid_merged_ozone_df['State Name'] = COPD_Incid_merged_ozone_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\nCOPD_Incid_merged_ozone_df['state'] = COPD_Incid_merged_ozone_df['state'].str.lower()\n\n# Get unique values in 'State Name' and 'state'\nstate_name_values = set(COPD_Incid_merged_ozone_df['State Name'].dropna().unique())\nstate_values = set(COPD_Incid_merged_ozone_df['state'].dropna().unique())\n\n# Find differences\ndiff_state_name_not_in_state = state_name_values - state_values\ndiff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\nprint(\"Values in 'State Name' but not in 'state':\")\nprint(diff_state_name_not_in_state)\n\nprint(\"\\nValues in 'state' but not in 'State Name':\")\nprint(diff_state_not_in_state_name)\n\n# Check for null values\nprint(\"\\nNull values in 'State Name':\")\nprint(COPD_Incid_merged_ozone_df['State Name'].isnull().sum())\n\nprint(\"\\nNull values in 'state':\")\nprint(COPD_Incid_merged_ozone_df['state'].isnull().sum())\n\n# Drop the 'state' column\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['state'])\n\n# Renaming columns for clarity\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.rename(columns={\n    'Max': 'ozone_max',\n    'Min': 'ozone_min',\n    'Mean': 'ozone_mean',\n    'Median': 'ozone_median',\n    'Std': 'ozone_std',\n})\n\n# Calculate correlations\ncorrelation_matrix = COPD_Incid_merged_ozone_df[[\n    'COPD_average', 'lower', 'upper',\n    'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n]].corr()\n\n# Save correlation matrix to CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Incidence vs. 
Ozone Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_ozone.png')\n# plt.show()\n",
   "lang" : "python",
   "owner" : "111111",
   "confidential" : "FALSE"