diff --git a/COPD_Incid_ozone_breathright_correlation.py b/COPD_Incid_ozone_breathright_correlation.py new file mode 100644 index 0000000..292a532 --- /dev/null +++ b/COPD_Incid_ozone_breathright_correlation.py @@ -0,0 +1,102 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# Load the CSV file into a DataFrame +file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv' +COPD_Incid_merged_ozone_df = pd.read_csv(file_path) + +COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name']) + + + +# Display the first few rows of the DataFrame +print(COPD_Incid_merged_ozone_df.head()) +print(COPD_Incid_merged_ozone_df.columns) + +def do_breatheright_correlation_analysis(): + # Read in the merged CSV file with ozone and lung disease data + COPD_Incid_merged_ozone_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv") + print(COPD_Incid_merged_ozone_df.head()) + print(COPD_Incid_merged_ozone_df.columns) + +# Drop the unnecessary columns +# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y']) + +# Figuring out which columns to drop +# Values in 'county_x' but not in 'county_y' +#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna())) +#print("Values in 'county_x' but not in 'county_y':") +#print(county_x_not_in_county_y) + +# Values in 'county_y' but not in 'county_x' +#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna())) +#print("\nValues in 'county_y' but not in 'county_x':") +#print(county_y_not_in_county_x) + +# Check for null values +#print("\nNull values in 'county_x':") +#print(ILD_merged_pm25_df['county_x'].isnull().sum()) + +#print("\nNull values in 'county_y':") +#print(ILD_merged_pm25_df['county_y'].isnull().sum()) + +# Drop the 'county_x' column +#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x']) + +# Convert state names in 'State Name' to lowercase +COPD_Incid_merged_ozone_df['State Name'] = COPD_Incid_merged_ozone_df['State Name'].str.lower() + +# Convert state names in 'state' to lowercase +COPD_Incid_merged_ozone_df['state'] = COPD_Incid_merged_ozone_df['state'].str.lower() + +# Get unique values in 'State Name' and 'state' +state_name_values = set(COPD_Incid_merged_ozone_df['State Name'].dropna().unique()) +state_values = set(COPD_Incid_merged_ozone_df['state'].dropna().unique()) + +# Find differences +diff_state_name_not_in_state = state_name_values - state_values +diff_state_not_in_state_name = state_values - state_name_values + +# Print the differences +print("Values in 'State Name' but not in 'state':") +print(diff_state_name_not_in_state) + +print("\nValues in 'state' but not in 'State Name':") +print(diff_state_not_in_state_name) + +# Check for null values +print("\nNull values in 'State Name':") +print(COPD_Incid_merged_ozone_df['State Name'].isnull().sum()) + +print("\nNull values in 'state':") +print(COPD_Incid_merged_ozone_df['state'].isnull().sum()) + +# Drop the 'state' column +COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['state']) + +# Renaming columns for clarity +COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.rename(columns={ + 'Max': 'ozone_max', + 'Min': 'ozone_min', + 'Mean': 'ozone_mean', + 'Median': 'ozone_median', + 'Std': 'ozone_std', +}) + +# Calculate correlations +correlation_matrix = COPD_Incid_merged_ozone_df[[ + 'COPD_average', 'lower', 'upper', + 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std' +]].corr() + +# Save correlation matrix to CSV +correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_correlation_matrix.csv') + +# Plot Correlation Heatmap +plt.figure(figsize=(12, 13)) +sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0) +plt.title('COPD Incidence vs. Ozone Correlation Heatmap') +plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_ozone.png') +# plt.show() + diff --git a/COPD_Incid_ozone_breathright_data_prep.py b/COPD_Incid_ozone_breathright_data_prep.py new file mode 100644 index 0000000..7aa5124 --- /dev/null +++ b/COPD_Incid_ozone_breathright_data_prep.py @@ -0,0 +1,91 @@ +import os +import pandas as pd +import re + +# Paths to your data +ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv" +COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv" +pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv" + +# Read all the csv into pandas dataframe in memory +ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local']) +COPD_incidence_df = pd.read_csv(COPD_incidence_data_path) +pm25_df = pd.read_csv(pm25_data_path) + +print(COPD_incidence_df.columns) + +# Convert 'Date Local' to datetime format +ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce') + +# Rename columns to be consistent +#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True) + +# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows. +#COPD_incidence_long = COPD_incidence_df.melt( + #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'], + #var_name='year', + #value_name='Mortality' +#) + +# Extract the year from the 'year' column using string operations and convert it to an integer. +#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int) + +# Function to split the Mortality Rate column +def split_COPD_Incidence_column_into_three(COPD_Incidence): + match = re.match(r'(\d+\.\d+) \((\d+\.\d+), (\d+\.\d+)\)', COPD_Incidence) + if match: + avg, min_val, max_val = match.groups() + return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max']) + else: + return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max']) + +# Ensure 'COPD Incidence' is a string and handle NaN values +COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str) + + +# Apply the function to split the 'Mortality' column +COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three) + +# Drop the original 'Mortality' column if no longer needed +#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In']) + +print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns) + +# Convert the daily ozone into yearly data +ozone_df['year'] = ozone_df['Date Local'].dt.year + +# Group by additional columns and 'year' +grouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean'] + +# Compute statistics +stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index() + +# Rename columns for clarity +stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std'] + +# Convert columns to string in both DataFrames +stats_df['county'] = stats_df['County Name'].str.lower() +stats_df['state'] = stats_df['State Name'].str.lower() +stats_df['year'] = stats_df['year'].astype(int) + +#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower() +COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int) + +# Print the results +print("Ozone aggregated yearly data:", stats_df) +print("COPD Incidence data header:", COPD_incidence_df.head()) + +# Merge the statistics ozone DataFrame with the COPD_long DataFrame +COPD_Incid_merged_ozone_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner') + +print("Merged DataFrame:", COPD_Incid_merged_ozone_df.head()) + +# Save to a CSV file +COPD_Incid_merged_ozone_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv', index=False) + + + + + + + diff --git a/COPD_Incid_pm25_breathright_correlation.py b/COPD_Incid_pm25_breathright_correlation.py new file mode 100644 index 0000000..a318ccf --- /dev/null +++ b/COPD_Incid_pm25_breathright_correlation.py @@ -0,0 +1,102 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# Load the CSV file into a DataFrame +file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv' +COPD_Incid_merged_pm25_df = pd.read_csv(file_path) + +COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name']) + + + +# Display the first few rows of the DataFrame +print(COPD_Incid_merged_pm25_df.head()) +print(COPD_Incid_merged_pm25_df.columns) + +def do_breatheright_correlation_analysis(): + # Read in the merged CSV file with ozone and lung disease data + COPD_Incid_merged_pm25_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv") + print(COPD_Incid_merged_pm25_df.head()) + print(COPD_Incid_merged_pm25_df.columns) + +# Drop the unnecessary columns +# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y']) + +# Figuring out which columns to drop +# Values in 'county_x' but not in 'county_y' +#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna())) +#print("Values in 'county_x' but not in 'county_y':") +#print(county_x_not_in_county_y) + +# Values in 'county_y' but not in 'county_x' +#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna())) +#print("\nValues in 'county_y' but not in 'county_x':") +#print(county_y_not_in_county_x) + +# Check for null values +#print("\nNull values in 'county_x':") +#print(ILD_merged_pm25_df['county_x'].isnull().sum()) + +#print("\nNull values in 'county_y':") +#print(ILD_merged_pm25_df['county_y'].isnull().sum()) + +# Drop the 'county_x' column +#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x']) + +# Convert state names in 'State Name' to lowercase +COPD_Incid_merged_pm25_df['State Name'] = COPD_Incid_merged_pm25_df['State Name'].str.lower() + +# Convert state names in 'state' to lowercase +COPD_Incid_merged_pm25_df['state'] = COPD_Incid_merged_pm25_df['state'].str.lower() + +# Get unique values in 'State Name' and 'state' +state_name_values = set(COPD_Incid_merged_pm25_df['State Name'].dropna().unique()) +state_values = set(COPD_Incid_merged_pm25_df['state'].dropna().unique()) + +# Find differences +diff_state_name_not_in_state = state_name_values - state_values +diff_state_not_in_state_name = state_values - state_name_values + +# Print the differences +print("Values in 'State Name' but not in 'state':") +print(diff_state_name_not_in_state) + +print("\nValues in 'state' but not in 'State Name':") +print(diff_state_not_in_state_name) + +# Check for null values +print("\nNull values in 'State Name':") +print(COPD_Incid_merged_pm25_df['State Name'].isnull().sum()) + +print("\nNull values in 'state':") +print(COPD_Incid_merged_pm25_df['state'].isnull().sum()) + +# Drop the 'state' column +COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['state']) + +# Renaming columns for clarity +COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.rename(columns={ + 'Max': 'pm25_max', + 'Min': 'pm25_min', + 'Mean': 'pm25_mean', + 'Median': 'pm25_median', + 'Std': 'pm25_std', +}) + +# Calculate correlations +correlation_matrix = COPD_Incid_merged_pm25_df[[ + 'COPD_average', 'lower', 'upper', + 'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std' +]].corr() + +# Save correlation matrix to CSV +correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_correlation_matrix.csv') + +# Plot Correlation Heatmap +plt.figure(figsize=(12, 13)) +sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0) +plt.title('COPD Incidence vs. PM2.5 Correlation Heatmap') +plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_pm25.png') +# plt.show() + diff --git a/COPD_Incid_pm25_breathright_data_prep.py b/COPD_Incid_pm25_breathright_data_prep.py new file mode 100644 index 0000000..23b87a8 --- /dev/null +++ b/COPD_Incid_pm25_breathright_data_prep.py @@ -0,0 +1,91 @@ +import os +import pandas as pd +import re + +# Paths to your data +ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv" +COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv" +pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv" + +# Read all the csv into pandas dataframe in memory +ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local']) +COPD_incidence_df = pd.read_csv(COPD_incidence_data_path) +pm25_df = pd.read_csv(pm25_data_path) + +print(COPD_incidence_df.columns) + +# Convert 'Date Local' to datetime format +pm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce') + +# Rename columns to be consistent +#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True) + +# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows. +#COPD_incidence_long = COPD_incidence_df.melt( + #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'], + #var_name='year', + #value_name='Mortality' +#) + +# Extract the year from the 'year' column using string operations and convert it to an integer. +#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int) + +# Function to split the Mortality Rate column +def split_COPD_Incidence_column_into_three(COPD_Incidence): + match = re.match(r'(\d+\.\d+) \((\d+\.\d+), (\d+\.\d+)\)', COPD_Incidence) + if match: + avg, min_val, max_val = match.groups() + return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max']) + else: + return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max']) + +# Ensure 'COPD Incidence' is a string and handle NaN values +COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str) + + +# Apply the function to split the 'Mortality' column +COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three) + +# Drop the original 'Mortality' column if no longer needed +#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In']) + +print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns) + +# Convert the daily ozone into yearly data +pm25_df['year'] = pm25_df['Date Local'].dt.year + +# Group by additional columns and 'year' +grouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean'] + +# Compute statistics +stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index() + +# Rename columns for clarity +stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std'] + +# Convert columns to string in both DataFrames +stats_df['county'] = stats_df['County Name'].str.lower() +stats_df['state'] = stats_df['State Name'].str.lower() +stats_df['year'] = stats_df['year'].astype(int) + +#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower() +COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int) + +# Print the results +print("Ozone aggregated yearly data:", stats_df) +print("COPD Incidence data header:", COPD_incidence_df.head()) + +# Merge the statistics ozone DataFrame with the COPD_long DataFrame +COPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner') + +print("Merged DataFrame:", COPD_Incid_merged_pm25_df.head()) + +# Save to a CSV file +COPD_Incid_merged_pm25_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False) + + + + + + + diff --git a/process.json b/process.json index a40711a..13f0766 100644 --- a/process.json +++ b/process.json @@ -1,16 +1,32 @@ [{ - "id" : "6zmcpn", - "name" : "1. Incidence COPD data prep", + "id" : "fzdydc", + "name" : "COPD_Incid_pm25_breathright_data_prep", "description" : null, - "code" : "import pandas as pd\nimport geopandas as gpd\n\n# Read CDC COPD incidence data\nCOPD_incidence_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv')\n\n# Read EPA air quality data for PM2.5 and ozone\npm25_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv', low_memory=False)\nozone_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv', low_memory=False)\n\n# Read county shapefile for spatial analysis\ncounties_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/County data/county_shapefile.shp')\n\n# Convert date column to datetime\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], format='%Y-%m-%d')\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%Y-%m-%d')\n\n# Extract year from date\npm25_df['year'] = pm25_df['Date Local'].dt.year\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Get year from PM2.5\n\n# Convert 'Date Local' to datetime and extract year\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], format='%Y-%m-%d')\npm25_df['year'] = pm25_df['Date Local'].dt.year\n\n# Rename 'Sample Measurement' to 'PM2.5'\npm25_df.rename(columns={'Arithmetic Mean': 'PM2.5'}, inplace=True)\n\n# Verify the DataFrame\nprint(pm25_df.head())\nprint(pm25_df.columns)\n\n# Convert 'Date Local' to datetime and extract year\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%m/%d/%y')\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Rename 'Sample Measurement' to 'Ozone'\nozone_df.rename(columns={'Arithmetic Mean': 'Ozone'}, inplace=True)\n\n# Verify the DataFrame\nprint(ozone_df.head())\nprint(ozone_df.columns)\n\n# Ensure 'State Name' and 'year' exist\nprint(pm25_df[['State Name', 'year']].head())\nprint(ozone_df[['State Name', 'year']].head())\n\n# Group by state and year to calculate annual averages\npm25_annual = pm25_df.groupby(['State Name', 'year'])['PM2.5'].mean().reset_index()\nozone_annual = ozone_df.groupby(['State Name', 'year'])['Ozone'].mean().reset_index()\n\n# Get latitude and longitude for each state-year pair\nlat_lon_pm25 = pm25_df.groupby(['State Name', 'year']).agg({\n 'Latitude': 'first',\n 'Longitude': 'first'\n}).reset_index()\n\nlat_lon_ozone = ozone_df.groupby(['State Name', 'year']).agg({\n 'Latitude': 'first',\n 'Longitude': 'first'\n}).reset_index()\n\n# Merge latitude and longitude with PM2.5 and Ozone averages\npm25_annual = pd.merge(pm25_annual, lat_lon_pm25, on=['State Name', 'year'])\nozone_annual = pd.merge(ozone_annual, lat_lon_ozone, on=['State Name', 'year'])\n\n# Verify the columns, all should have 'State Name' and 'year'\nprint(\"PM2.5 DataFrame columns:\")\nprint(pm25_df.columns)\nprint(\"Ozone DataFrame columns:\")\nprint(ozone_df.columns)\nprint(\"COPD incidence DataFrame columns:\")\nprint(COPD_incidence_df.columns)\n\n# Merge lung disease data with PM2.5 and ozone data\nCOPD_incidence_merged_df = pd.merge(COPD_incidence_df, pm25_annual, on=['State Name', 'year'])\nCOPD_incidence_merged_df = pd.merge(COPD_incidence_merged_df, ozone_annual, on=['State Name', 'year'])\n\n# Find rows where Latitude_x and Latitude_y differ\ndiscrepancies = COPD_incidence_merged_df[COPD_incidence_merged_df['Latitude_x'] != COPD_incidence_merged_df['Latitude_y']]\n\n# Display the rows with discrepancies\nprint(discrepancies[['State Name', 'year', 'Latitude_x', 'Latitude_y']])\n\n# Drop the incorrect columns and rename the correct ones\nCOPD_incidence_merged_df = COPD_incidence_merged_df.drop(columns=['Latitude_x', 'Longitude_x'])\nCOPD_incidence_merged_df = COPD_incidence_merged_df.rename(columns={'Latitude_y': 'Latitude', 'Longitude_y': 'Longitude'})\n\nfrom shapely.geometry import Point\n\n# Create the geometry column using the latitude and longitude columns\nCOPD_incidence_merged_df['geometry'] = COPD_incidence_merged_df.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)\n\n\n# Convert merged DataFrame to GeoDataFrame for spatial analysis\nCOPD_incidence_gdf = gpd.GeoDataFrame(COPD_incidence_merged_df, geometry='geometry')\n\nCOPD_incidence_gdf.crs = 'EPSG:4326' # Example CRS, adjust as needed\n\n# Save the GeoDataFrame to a shapefile or other format if needed\nCOPD_incidence_gdf.to_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp')\n\n# Save the updated DataFrame to CSV\nCOPD_incidence_merged_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv', index=False)\n\n\n\n\n# Check for matches in State Name and year across dataframes\nprint(COPD_incidence_df[['State Name', 'year']].drop_duplicates())\nprint(pm25_annual[['State Name', 'year']].drop_duplicates())\nprint(ozone_annual[['State Name', 'year']].drop_duplicates())\n\n# Check for common values between dataframes\ncommon_states_years = pd.merge(COPD_incidence_df[['State Name', 'year']], pm25_annual[['State Name', 'year']], on=['State Name', 'year'])\nprint(common_states_years.head())\n\n# Verify data types before merging\nprint(COPD_incidence_df.dtypes)\nprint(pm25_annual.dtypes)\nprint(ozone_annual.dtypes)\n\n# Ensure no columns have trailing spaces or unexpected characters\nCOPD_incidence_df.columns = COPD_incidence_df.columns.str.strip()\npm25_annual.columns = pm25_annual.columns.str.strip()\nozone_annual.columns = ozone_annual.columns.str.strip()\n\n\n\n\n\n\n# Making Random Forest Prediction Model\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error, r2_score\n\n# Load the merged DataFrame\nCOPD_incidence_merged_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv')\n\nprint(COPD_incidence_merged_df.dtypes)\n\n# Check for any non-numeric values or unexpected text\nnon_numeric_values = COPD_incidence_merged_df[~COPD_incidence_merged_df['COPD Incidence'].apply(pd.to_numeric, errors='coerce').notnull()]\nprint(non_numeric_values)\n\nprint(COPD_incidence_merged_df['State Name'].unique())\nprint(COPD_incidence_merged_df['metric_name'].unique())\n\nCOPD_incidence_merged_df['COPD Incidence'] = pd.to_numeric(COPD_incidence_merged_df['COPD Incidence'], errors='coerce')\nCOPD_incidence_merged_df['PM2.5'] = pd.to_numeric(COPD_incidence_merged_df['PM2.5'], errors='coerce')\nCOPD_incidence_merged_df['Ozone'] = pd.to_numeric(COPD_incidence_merged_df['Ozone'], errors='coerce')\n\n# Verify if any columns were unintentionally concatenated\nprint(COPD_incidence_merged_df.head(10))\n\n\n# Identify numeric columns\nnumeric_cols = COPD_incidence_merged_df.select_dtypes(include='number').columns\nnon_numeric_cols = COPD_incidence_merged_df.select_dtypes(exclude='number').columns\n\n\n# Identify non-numeric columns\nnon_numeric_cols = COPD_incidence_merged_df.select_dtypes(exclude=[np.number]).columns.tolist()\n\n# Fill NaN values in numeric columns with the mean\nCOPD_incidence_merged_df[numeric_cols].fillna(COPD_incidence_merged_df[numeric_cols].mean(), inplace=True)\n\n\n# Check for missing values\nprint(COPD_incidence_merged_df.isnull().sum())\n\n# Fill missing values if necessary\n# For simplicity, you can use the mean or median of the columns with missing values\nCOPD_incidence_merged_df.fillna(COPD_incidence_merged_df.mean(), inplace=True)\n\n# Extract features and target variable\n# Assume 'COPD Incidence' is the target variable and the rest are features\nfeatures = COPD_incidence_merged_df[['PM2.5', 'Ozone', 'Latitude', 'Longitude']]\ntarget = COPD_incidence_merged_df['COPD Incidence']\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n\n# Initialize and fit the Random Forest model\nrf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=10, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Make predictions\ny_pred = rf_model.predict(X_test)\n\n# Evaluate the model\nmse = mean_squared_error(y_test, y_pred)\nr2 = r2_score(y_test, y_pred)\n\nprint(f\"Mean Squared Error: {mse}\")\nprint(f\"R^2 Score: {r2}\")\n\n# Get feature importances\nimportances = rf_model.feature_importances_\nfeatures_importance = pd.DataFrame({\n 'Feature': X_train.columns,\n 'Importance': importances\n}).sort_values(by='Importance', ascending=False)\n\nprint(features_importance)\n\n\n\n\n\n\n", + "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nozone_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv\"\nCOPD_incidence_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv\"\npm25_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv\"\n\n# Read all the csv into pandas dataframe in memory\nozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nCOPD_incidence_df = pd.read_csv(COPD_incidence_data_path)\npm25_df = pd.read_csv(pm25_data_path)\n\nprint(COPD_incidence_df.columns)\n\n# Convert 'Date Local' to datetime format\npm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce')\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n #var_name='year',\n #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\ndef split_COPD_Incidence_column_into_three(COPD_Incidence):\n match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_Incidence)\n if match:\n avg, min_val, max_val = match.groups()\n return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])\n else:\n return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nCOPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)\n\n\n# Apply the function to split the 'Mortality' column\nCOPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_Incidence DataFrame columns:\", COPD_incidence_df.columns)\n\n# Convert the daily ozone into yearly data\npm25_df['year'] = pm25_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\nstats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\nstats_df['county'] = stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nCOPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", COPD_incidence_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\nCOPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", COPD_Incid_merged_pm25_df.head())\n\n# Save to a CSV file\nCOPD_Incid_merged_pm25_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False)\n\n\n\n\n\n\n", "lang" : "python", "owner" : "111111", "confidential" : "FALSE" },{ - "id" : "hjbur5", - "name" : "2. Incidence COPD Data Analysis", + "id" : "f7jhqc", + "name" : "COPD_Incid_pm25_breathright_correlation", "description" : null, - "code" : "import pandas as pd\nimport geopandas as gpd\nimport matplotlib.pyplot as plt\nfrom esda.moran import Moran, Moran_Local\nfrom libpysal.weights import Queen\nfrom splot.esda import lisa_cluster\n\n# Load the merged dataset\nCOPD_incidence_merged_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_merged_df.csv')\n\n# Load the shapefile into a GeoDataFrame\nCOPD_incidence_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp')\n\n# Rename the column 'val' to 'COPD Incidence'\nCOPD_incidence_gdf = COPD_incidence_gdf.rename(columns={'val': 'COPD Incidence'})\n\n\n# Verify the loaded GeoDataFrame\nprint(COPD_incidence_gdf.head())\nprint(COPD_incidence_gdf.columns)\nprint(COPD_incidence_gdf.crs) # Check the Coordinate Reference System\n\n# Descriptive Statistics\nsummary_stats = COPD_incidence_gdf.describe()\nprint(summary_stats)\n\n# Plot distribution of PM2.5 levels and Ozone levels\nplt.figure(figsize=(12, 6))\n\nplt.subplot(1, 2, 1)\nCOPD_incidence_gdf['PM2.5'].hist(bins=20)\nplt.title('Distribution of PM2.5 Levels')\nplt.xlabel('PM2.5')\nplt.ylabel('Frequency')\n\nplt.subplot(1, 2, 2)\nCOPD_incidence_gdf['Ozone'].hist(bins=20)\nplt.title('Distribution of Ozone Levels')\nplt.xlabel('Ozone')\nplt.ylabel('Frequency')\n\nplt.tight_layout()\nplt.show()\n\n\n# Scatter plot of PM2.5 vs. Mortality\nplt.figure(figsize=(12, 6))\n\nplt.subplot(1, 2, 1)\nplt.scatter(COPD_incidence_gdf['PM2.5'], COPD_incidence_gdf['COPD Incidence'], alpha=0.5, edgecolors='w', s=80)\nplt.title('PM2.5 vs. COPD Incidence')\nplt.xlabel('PM2.5')\nplt.ylabel('COPD Incidence')\n\n\n# Scatter plot of Ozone vs. Mortality\nplt.subplot(1, 2, 2)\nplt.scatter(COPD_incidence_gdf['Ozone'], COPD_incidence_gdf['COPD Incidence'], alpha=0.5, edgecolors='w', s=80)\nplt.title('Ozone vs. COPD Incidence')\nplt.xlabel('Ozone')\nplt.ylabel('COPD Incidence')\n\nplt.tight_layout()\nplt.show()\n\n\n# Add an ID column to the GeoDataFrame to use it on GeoDa\nCOPD_incidence_gdf['ID'] = range(1, len(COPD_incidence_gdf) + 1)\n\nprint(COPD_incidence_gdf.columns)\n\n# Define the path to the shapefile\nshapefile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp'\n\n# Save the updated GeoDataFrame to the shapefile\nCOPD_incidence_gdf.to_file(shapefile_path)\n\n# Verify the file is saved\nprint(f\"Updated shapefile saved to {shapefile_path}\")\n\n\n\n\n\n\n\n\n", + "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv'\nCOPD_Incid_merged_pm25_df = pd.read_csv(file_path)\n\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])\n\n\n\n# Display the first few rows of the DataFrame\nprint(COPD_Incid_merged_pm25_df.head())\nprint(COPD_Incid_merged_pm25_df.columns)\n\ndef do_breatheright_correlation_analysis():\n # Read in the merged CSV file with ozone and lung disease data\n COPD_Incid_merged_pm25_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv\")\n print(COPD_Incid_merged_pm25_df.head())\n print(COPD_Incid_merged_pm25_df.columns)\n\n# Drop the unnecessary columns\n# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to lowercase\nCOPD_Incid_merged_pm25_df['State Name'] = COPD_Incid_merged_pm25_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\nCOPD_Incid_merged_pm25_df['state'] = COPD_Incid_merged_pm25_df['state'].str.lower()\n\n# Get unique values in 'State Name' and 'state'\nstate_name_values = set(COPD_Incid_merged_pm25_df['State Name'].dropna().unique())\nstate_values = set(COPD_Incid_merged_pm25_df['state'].dropna().unique())\n\n# Find differences\ndiff_state_name_not_in_state = state_name_values - state_values\ndiff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\nprint(\"Values in 'State Name' but not in 'state':\")\nprint(diff_state_name_not_in_state)\n\nprint(\"\\nValues in 'state' but not in 'State Name':\")\nprint(diff_state_not_in_state_name)\n\n# Check for null values\nprint(\"\\nNull values in 'State Name':\")\nprint(COPD_Incid_merged_pm25_df['State Name'].isnull().sum())\n\nprint(\"\\nNull values in 'state':\")\nprint(COPD_Incid_merged_pm25_df['state'].isnull().sum())\n\n# Drop the 'state' column\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['state'])\n\n# Renaming columns for clarity\nCOPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.rename(columns={\n 'Max': 'pm25_max',\n 'Min': 'pm25_min',\n 'Mean': 'pm25_mean',\n 'Median': 'pm25_median',\n 'Std': 'pm25_std',\n})\n\n# Calculate correlations\ncorrelation_matrix = COPD_Incid_merged_pm25_df[[\n 'COPD_average', 'lower', 'upper',\n 'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std'\n]].corr()\n\n# Save correlation matrix to CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Incidence vs. PM2.5 Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_pm25.png')\n# plt.show()\n", + "lang" : "python", + "owner" : "111111", + "confidential" : "FALSE" +},{ + "id" : "kw5zqo", + "name" : "COPD_Incid_ozone_breathright_data_prep", + "description" : null, + "code" : "import os\nimport pandas as pd\nimport re\n\n# Paths to your data\nozone_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv\"\nCOPD_incidence_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv\"\npm25_data_path = \"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv\"\n\n# Read all the csv into pandas dataframe in memory\nozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\nCOPD_incidence_df = pd.read_csv(COPD_incidence_data_path)\npm25_df = pd.read_csv(pm25_data_path)\n\nprint(COPD_incidence_df.columns)\n\n# Convert 'Date Local' to datetime format\nozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce')\n\n# Rename columns to be consistent\n#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)\n\n# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.\n#COPD_incidence_long = COPD_incidence_df.melt(\n #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],\n #var_name='year',\n #value_name='Mortality'\n#)\n\n# Extract the year from the 'year' column using string operations and convert it to an integer.\n#ILD_long['year'] = ILD_long['year'].str.extract(r'(\\d{4})').astype(int)\n\n# Function to split the Mortality Rate column\ndef split_COPD_Incidence_column_into_three(COPD_Incidence):\n match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', COPD_Incidence)\n if match:\n avg, min_val, max_val = match.groups()\n return pd.Series([float(avg), float(min_val), float(max_val)], index=['COPD_average', 'COPD_min', 'COPD_max'])\n else:\n return pd.Series([None, None, None], index=['COPD_average', 'COPD_min', 'COPD_max'])\n\n# Ensure 'COPD Incidence' is a string and handle NaN values\nCOPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)\n\n\n# Apply the function to split the 'Mortality' column\nCOPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)\n\n# Drop the original 'Mortality' column if no longer needed\n#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])\n\nprint(\"COPD_Incidence DataFrame columns:\", COPD_incidence_df.columns)\n\n# Convert the daily ozone into yearly data\nozone_df['year'] = ozone_df['Date Local'].dt.year\n\n# Group by additional columns and 'year'\ngrouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']\n\n# Compute statistics\nstats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n# Rename columns for clarity\nstats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n# Convert columns to string in both DataFrames\nstats_df['county'] = stats_df['County Name'].str.lower()\nstats_df['state'] = stats_df['State Name'].str.lower()\nstats_df['year'] = stats_df['year'].astype(int)\n\n#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()\nCOPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)\n\n# Print the results\nprint(\"Ozone aggregated yearly data:\", stats_df)\nprint(\"COPD Incidence data header:\", COPD_incidence_df.head())\n\n# Merge the statistics ozone DataFrame with the COPD_long DataFrame\nCOPD_Incid_merged_ozone_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')\n\nprint(\"Merged DataFrame:\", COPD_Incid_merged_ozone_df.head())\n\n# Save to a CSV file\nCOPD_Incid_merged_ozone_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv', index=False)\n\n\n\n\n\n\n", + "lang" : "python", + "owner" : "111111", + "confidential" : "FALSE" +},{ + "id" : "ddcab9", + "name" : "COPD_Incid_ozone_breathright_correlation", + "description" : null, + "code" : "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Load the CSV file into a DataFrame\nfile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv'\nCOPD_Incid_merged_ozone_df = pd.read_csv(file_path)\n\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['COPD_min', 'COPD_max', 'county', 'County Name'])\n\n\n\n# Display the first few rows of the DataFrame\nprint(COPD_Incid_merged_ozone_df.head())\nprint(COPD_Incid_merged_ozone_df.columns)\n\ndef do_breatheright_correlation_analysis():\n # Read in the merged CSV file with ozone and lung disease data\n COPD_Incid_merged_ozone_df = pd.read_csv(f\"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv\")\n print(COPD_Incid_merged_ozone_df.head())\n print(COPD_Incid_merged_ozone_df.columns)\n\n# Drop the unnecessary columns\n# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=[\"county_x\", 'State Name', 'county_y', 'State Name_y'])\n\n# Figuring out which columns to drop\n# Values in 'county_x' but not in 'county_y'\n#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))\n#print(\"Values in 'county_x' but not in 'county_y':\")\n#print(county_x_not_in_county_y)\n\n# Values in 'county_y' but not in 'county_x'\n#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))\n#print(\"\\nValues in 'county_y' but not in 'county_x':\")\n#print(county_y_not_in_county_x)\n\n# Check for null values\n#print(\"\\nNull values in 'county_x':\")\n#print(ILD_merged_pm25_df['county_x'].isnull().sum())\n\n#print(\"\\nNull values in 'county_y':\")\n#print(ILD_merged_pm25_df['county_y'].isnull().sum())\n\n# Drop the 'county_x' column\n#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])\n\n# Convert state names in 'State Name' to lowercase\nCOPD_Incid_merged_ozone_df['State Name'] = COPD_Incid_merged_ozone_df['State Name'].str.lower()\n\n# Convert state names in 'state' to lowercase\nCOPD_Incid_merged_ozone_df['state'] = COPD_Incid_merged_ozone_df['state'].str.lower()\n\n# Get unique values in 'State Name' and 'state'\nstate_name_values = set(COPD_Incid_merged_ozone_df['State Name'].dropna().unique())\nstate_values = set(COPD_Incid_merged_ozone_df['state'].dropna().unique())\n\n# Find differences\ndiff_state_name_not_in_state = state_name_values - state_values\ndiff_state_not_in_state_name = state_values - state_name_values\n\n# Print the differences\nprint(\"Values in 'State Name' but not in 'state':\")\nprint(diff_state_name_not_in_state)\n\nprint(\"\\nValues in 'state' but not in 'State Name':\")\nprint(diff_state_not_in_state_name)\n\n# Check for null values\nprint(\"\\nNull values in 'State Name':\")\nprint(COPD_Incid_merged_ozone_df['State Name'].isnull().sum())\n\nprint(\"\\nNull values in 'state':\")\nprint(COPD_Incid_merged_ozone_df['state'].isnull().sum())\n\n# Drop the 'state' column\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['state'])\n\n# Renaming columns for clarity\nCOPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.rename(columns={\n 'Max': 'ozone_max',\n 'Min': 'ozone_min',\n 'Mean': 'ozone_mean',\n 'Median': 'ozone_median',\n 'Std': 'ozone_std',\n})\n\n# Calculate correlations\ncorrelation_matrix = COPD_Incid_merged_ozone_df[[\n 'COPD_average', 'lower', 'upper',\n 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'\n]].corr()\n\n# Save correlation matrix to CSV\ncorrelation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_correlation_matrix.csv')\n\n# Plot Correlation Heatmap\nplt.figure(figsize=(12, 13))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)\nplt.title('COPD Incidence vs. Ozone Correlation Heatmap')\nplt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_ozone.png')\n# plt.show()\n", "lang" : "python", "owner" : "111111", "confidential" : "FALSE" diff --git a/totalCOPD_breathright1.py b/totalCOPD_breathright1.py new file mode 100644 index 0000000..8e1d251 --- /dev/null +++ b/totalCOPD_breathright1.py @@ -0,0 +1,126 @@ +import os +import pandas as pd +import re + +# Paths to your data +Non_Smoke_merged_df = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/Non_Smoke_merged_df.csv" +great_pm25_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_pm25.csv', low_memory=False) +great_ozone_combined_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/great_combined_daily_ozone.csv', low_memory=False) + +# Read all the csv into pandas dataframe in memory +#great_ozone_combined_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local']) +Non_Smoke_merged_df = pd.read_csv(Non_Smoke_merged_df) +#great_pm25_combined_df = pd.read_csv(great_pm25_combined_df) + +print(Non_Smoke_merged_df.columns) +print(Non_Smoke_merged_df['COPD_prevalence_percentage_total'].head()) + + +# Convert 'Date Local' to datetime format +great_ozone_combined_df['Date Local'] = pd.to_datetime(great_ozone_combined_df['Date Local'], errors='coerce') + +print(Non_Smoke_merged_df.columns) + +# Rename columns to be consistent +#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True) + +# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows. +#COPD_incidence_long = COPD_incidence_df.melt( + #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'], + #var_name='year', + #value_name='Mortality' +#) + +# Extract the year from the 'year' column using string operations and convert it to an integer. +#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int) + +# Function to split the Mortality Rate column +import pandas as pd + +def split_COPD_prevalence_percentage_total_column_into_three(COPD_prevalence_percentage_total): + if pd.isna(COPD_prevalence_percentage_total): + return pd.Series([None, None, None], index=['Average', 'Min', 'Max']) + + COPD_prevalence_percentage_total = str(COPD_prevalence_percentage_total) + + # Check for range format + match = re.match(r'(\d+\.\d+) \((\d+\.\d+), (\d+\.\d+)\)', COPD_prevalence_percentage_total) + if match: + avg, min_val, max_val = match.groups() + return pd.Series([float(avg), float(min_val), float(max_val)], index=['Average', 'Min', 'Max']) + + # Handle single value format + try: + avg = float(COPD_prevalence_percentage_total) + return pd.Series([avg, avg, avg], index=['Average', 'Min', 'Max']) + except ValueError: + return pd.Series([None, None, None], index=['Average', 'Min', 'Max']) +# Apply the function to the DataFrame +Non_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three) + + + +# Ensure 'COPD Incidence' is a string and handle NaN values +Non_Smoke_merged_df['COPD_prevalence_percentage_total'] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].astype(str) + +# Apply the function to the DataFrame +Non_Smoke_merged_df[['Average', 'Min', 'Max']] = Non_Smoke_merged_df['COPD_prevalence_percentage_total'].apply(split_COPD_prevalence_percentage_total_column_into_three) + +# Drop the original 'Mortality' column if no longer needed +#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In']) + +print("COPD_prevalence_percentage_total DataFrame columns:", Non_Smoke_merged_df.columns) + +# Convert the daily ozone into yearly data +great_ozone_combined_df['year'] = great_ozone_combined_df['Date Local'].dt.year + +# Group by additional columns and 'year' +grouped = great_ozone_combined_df.groupby(['State Name', 'year'])['Arithmetic Mean'] + +# Compute statistics +stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index() + +# Rename columns for clarity +#stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std'] + +# Convert columns to string in both DataFrames +#stats_df['county'] = stats_df['County Name'].str.lower() +stats_df['state'] = stats_df['State Name'].str.lower() +stats_df['year'] = stats_df['year'].astype(int) + +#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower() +Non_Smoke_merged_df['year'] = Non_Smoke_merged_df['year'].astype(int) + +# Check unique values in each DataFrame for the merge columns +print("Unique State Names in Non_Smoke_merged_df:") +print(Non_Smoke_merged_df['State Name'].unique()) + +print("Unique State Names in stats_df:") +print(stats_df['State Name'].unique()) + +print("Unique Years in Non_Smoke_merged_df:") +print(Non_Smoke_merged_df['year'].unique()) + +print("Unique Years in stats_df:") +print(stats_df['year'].unique()) + + +# Print the results +print("Ozone aggregated yearly data:", stats_df) +print("COPD Incidence data header:", Non_Smoke_merged_df.head()) + +# Merge the statistics ozone DataFrame with the COPD_long DataFrame +totalCOPD_ozone_merged_df = pd.merge(Non_Smoke_merged_df, stats_df, on=['State Name', 'year'], how='inner') + +print("Merged DataFrame:", totalCOPD_ozone_merged_df.head()) + +# Save to a CSV file +totalCOPD_ozone_merged_df.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv', index=False) + + +print(totalCOPD_ozone_merged_df) +print(totalCOPD_ozone_merged_df.columns) + + + + diff --git a/totalCOPD_breathright2.py b/totalCOPD_breathright2.py new file mode 100644 index 0000000..d00091a --- /dev/null +++ b/totalCOPD_breathright2.py @@ -0,0 +1,256 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# Load the CSV file into a DataFrame +#file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv' +#totalCOPD_ozone_merged_df = pd.read_csv(file_path) + + +# Verify the data types after conversion +#print(totalCOPD_ozone_merged_df.dtypes) + + +# Display the first few rows of the DataFrame +#print(totalCOPD_ozone_merged_df.head()) +#print(totalCOPD_ozone_merged_df.columns) + +# Assuming 'Min_x' and 'Min_y' are already in the DataFrame +#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y'] + +# Display the first few rows to check the new column +#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head()) + +# Calculate basic statistics for the difference +#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean() +#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median() +#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std() + +# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0 +#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0}) + +# Recalculate the difference +#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y'] + +# Display the updated DataFrame +#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head()) + +# Calculate basic statistics +#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean() +#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median() +#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std() + +#print(f"Mean of the difference: {diff_mean}") +#print(f"Median of the difference: {diff_median}") +#print(f"Standard deviation of the difference: {diff_std}") + +# Drop the Min_x column +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x']) + +# Rename columns +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'}) + + +#def do_breatheright_correlation_analysis(): + # Read in the merged CSV file with ozone and lung disease data + #totalCOPD_ozone_merged_df = pd.read_csv(f"/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv") + #print(totalCOPD_ozone_merged_df.head()) + #print(totalCOPD_ozone_merged_df.columns) + +# Drop the unnecessary columns +# ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y']) + +# Figuring out which columns to drop +# Values in 'county_x' but not in 'county_y' +#county_x_not_in_county_y = set(nosmokeCOPD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna())) +#print("Values in 'county_x' but not in 'county_y':") +#print(county_x_not_in_county_y) + +# Values in 'county_y' but not in 'county_x' +#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna())) +#print("\nValues in 'county_y' but not in 'county_x':") +#print(county_y_not_in_county_x) + +# Check for null values +#print("\nNull values in 'county_x':") +#print(ILD_merged_pm25_df['county_x'].isnull().sum()) + +#print("\nNull values in 'county_y':") +#print(ILD_merged_pm25_df['county_y'].isnull().sum()) + +# Drop the 'county_x' column +#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x']) + +# Convert state names in 'State Name' to lowercase +#totalCOPD_ozone_merged_df['State Name'] = totalCOPD_ozone_merged_df['State Name'].str.lower() + +# Convert state names in 'state' to lowercase +#totalCOPD_ozone_merged_df['state'] = totalCOPD_ozone_merged_df['state'].str.lower() + +# Get unique values in 'State Name' and 'state' +#state_name_values = set(totalCOPD_ozone_merged_df['State Name'].dropna().unique()) +#state_values = set(totalCOPD_ozone_merged_df['state'].dropna().unique()) + +# Find differences +#diff_state_name_not_in_state = state_name_values - state_values +#diff_state_not_in_state_name = state_values - state_name_values + +# Print the differences +#print("Values in 'State Name' but not in 'state':") +#print(diff_state_name_not_in_state) + +#print("\nValues in 'state' but not in 'State Name':") +#print(diff_state_not_in_state_name) + +# Check for null values +#print("\nNull values in 'State Name':") +#print(totalCOPD_ozone_merged_df['State Name'].isnull().sum()) + +#print("\nNull values in 'state':") +#print(totalCOPD_ozone_merged_df['state'].isnull().sum()) + +# Drop the 'state' column +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state']) + +# Renaming columns for clarity +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={ + #'Max': 'ozone_max', + #'Min': 'ozone_min', + #'Mean': 'ozone_mean', + #'Median': 'ozone_median', + #'Std': 'ozone_std', +#}) + +# Calculate correlations +#correlation_matrix = totalCOPD_ozone_merged_df[[ + #'Average', 'Min', 'Max', + #'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std' +#]].corr() + +# Convert columns to numeric where applicable +#numeric_columns = ['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'] +#totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce') + +# Verify the data types after conversion +#print(totalCOPD_ozone_merged_df.dtypes) + +# Save correlation matrix to CSV +#correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv') + +# Plot Correlation Heatmap +#plt.figure(figsize=(12, 13)) +#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0) +#plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap') +#plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png') +# plt.show() + + +# Check for missing values in relevant columns +#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum()) + + +#print(totalCOPD_ozone_merged_df.dtypes) + + +#print(totalCOPD_ozone_merged_df[['average', 'min', 'max', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10)) + + + + +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# Load the CSV file into a DataFrame +file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD_ozone_merged_df.csv' +totalCOPD_ozone_merged_df = pd.read_csv(file_path) + +# Verify the data types after conversion +print(totalCOPD_ozone_merged_df.dtypes) + +# Display the first few rows of the DataFrame +print(totalCOPD_ozone_merged_df.head()) +print(totalCOPD_ozone_merged_df.columns) + +# Assuming 'Min_x' and 'Min_y' are already in the DataFrame +#totalCOPD_ozone_merged_df['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df['Min_x'] - totalCOPD_ozone_merged_df['Min_y'] + +# Display the first few rows to check the new column +#print(totalCOPD_ozone_merged_df[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head()) + +# Calculate basic statistics for the difference +#diff_mean = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].mean() +#diff_median = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].median() +#diff_std = totalCOPD_ozone_merged_df['difference_Min_x_Min_y'].std() + +# Fill NaN values in Min_x or Min_y with a specific value, e.g., 0 +#totalCOPD_ozone_merged_df_filled = totalCOPD_ozone_merged_df.fillna({'Min_x': 0, 'Min_y': 0}) + +# Recalculate the difference +#totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'] = totalCOPD_ozone_merged_df_filled['Min_x'] - totalCOPD_ozone_merged_df_filled['Min_y'] + +# Display the updated DataFrame +#print(totalCOPD_ozone_merged_df_filled[['Min_x', 'Min_y', 'difference_Min_x_Min_y']].head()) + +# Calculate basic statistics +#diff_mean = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].mean() +#diff_median = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].median() +#diff_std = totalCOPD_ozone_merged_df_filled['difference_Min_x_Min_y'].std() + +#print(f"Mean of the difference: {diff_mean}") +#print(f"Median of the difference: {diff_median}") +#print(f"Standard deviation of the difference: {diff_std}") + +# Drop the Min_x and Max_x columns +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['Min_x', 'Max_x']) + +# Rename columns +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={'Min_y': 'Min', 'Max_y': 'Max'}) + +# Ensure the renaming has taken place +print(totalCOPD_ozone_merged_df.columns) + +# Drop the 'state' column +totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.drop(columns=['state']) + +# Renaming columns for clarity +#totalCOPD_ozone_merged_df = totalCOPD_ozone_merged_df.rename(columns={ + #'Max': 'ozone_max', + #'Min': 'ozone_min', + #'Mean': 'ozone_mean', + #'Median': 'ozone_median', + #'Std': 'ozone_std', +#}) + +# Convert columns to numeric where applicable +numeric_columns = ['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'] +totalCOPD_ozone_merged_df[numeric_columns] = totalCOPD_ozone_merged_df[numeric_columns].apply(pd.to_numeric, errors='coerce') + +# Verify the data types after conversion +print(totalCOPD_ozone_merged_df.dtypes) + +# Calculate correlations +correlation_matrix = totalCOPD_ozone_merged_df[[ + 'Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std' +]].corr() + +# Save correlation matrix to CSV +correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/COPDtotalozone_correlation_matrix.csv') + +# Plot Correlation Heatmap +plt.figure(figsize=(12, 13)) +sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0) +plt.title('COPD Total Incidence vs. Ozone Correlation Heatmap') +plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/Non Smoke/totalCOPD1_correlation_heatmap_ozone.png') +# plt.show() + +# Check for missing values in relevant columns +print(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].isnull().sum()) + +print(totalCOPD_ozone_merged_df.dtypes) + +print(totalCOPD_ozone_merged_df[['Average', 'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std']].sample(10)) + + + +