earth-artificial-intelligence · IK-173 · Aug 16, 2024 · Aug 16, 2024
diff --git a/COPD_Incid_ozone_breathright_correlation.py b/COPD_Incid_ozone_breathright_correlation.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Location of the merged COPD-incidence / ozone table
+file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv'
+
+# Columns that the correlation analysis below does not use
+columns_to_discard = ['COPD_min', 'COPD_max', 'county', 'County Name']
+
+# Read the table and discard those columns in a single expression
+COPD_Incid_merged_ozone_df = pd.read_csv(file_path).drop(columns=columns_to_discard)
+
+# Preview the first rows and list the columns that remain
+print(COPD_Incid_merged_ozone_df.head())
+print(COPD_Incid_merged_ozone_df.columns)
+
+def do_breatheright_correlation_analysis():
+    """Load the merged COPD-incidence / ozone CSV, print a preview, and return it.
+
+    The frame is bound to a local name, so the module-level
+    ``COPD_Incid_merged_ozone_df`` is not modified by calling this function.
+
+    Returns:
+        pandas.DataFrame: the table exactly as read from disk.
+    """
+    # Plain string literal: the path contains no placeholders, so no f-prefix is needed
+    merged_df = pd.read_csv("/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv")
+    print(merged_df.head())
+    print(merged_df.columns)
+    # Hand the frame back so a caller can actually use what was loaded
+    return merged_df
+
+# Drop the unnecessary columns
+# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y'])
+
+# Figuring out which columns to drop
+# Values in 'county_x' but not in 'county_y'
+#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))
+#print("Values in 'county_x' but not in 'county_y':")
+#print(county_x_not_in_county_y)
+
+# Values in 'county_y' but not in 'county_x'
+#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))
+#print("\nValues in 'county_y' but not in 'county_x':")
+#print(county_y_not_in_county_x)
+
+# Check for null values
+#print("\nNull values in 'county_x':")
+#print(ILD_merged_pm25_df['county_x'].isnull().sum())
+
+#print("\nNull values in 'county_y':")
+#print(ILD_merged_pm25_df['county_y'].isnull().sum())
+
+# Drop the 'county_x' column
+#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])
+
+# --- State-column sanity check, then correlation of COPD incidence with ozone statistics ---
+# These statements sit at module level (column 0), so they run when the script runs;
+# they are not part of do_breatheright_correlation_analysis().
+
+# Convert state names in 'State Name' to lowercase
+COPD_Incid_merged_ozone_df['State Name'] = COPD_Incid_merged_ozone_df['State Name'].str.lower()
+
+# Convert state names in 'state' to lowercase
+COPD_Incid_merged_ozone_df['state'] = COPD_Incid_merged_ozone_df['state'].str.lower()
+
+# Get unique non-null values in 'State Name' and 'state' (both already lowercased above)
+state_name_values = set(COPD_Incid_merged_ozone_df['State Name'].dropna().unique())
+state_values = set(COPD_Incid_merged_ozone_df['state'].dropna().unique())
+
+# Find differences in each direction
+diff_state_name_not_in_state = state_name_values - state_values
+diff_state_not_in_state_name = state_values - state_name_values
+
+# Print the differences (for inspection only; nothing below branches on them)
+print("Values in 'State Name' but not in 'state':")
+print(diff_state_name_not_in_state)
+
+print("\nValues in 'state' but not in 'State Name':")
+print(diff_state_not_in_state_name)
+
+# Check for null values
+print("\nNull values in 'State Name':")
+print(COPD_Incid_merged_ozone_df['State Name'].isnull().sum())
+
+print("\nNull values in 'state':")
+print(COPD_Incid_merged_ozone_df['state'].isnull().sum())
+
+# Drop the 'state' column; 'State Name' is the one that is kept
+COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.drop(columns=['state'])
+
+# Renaming the generic statistic columns so they are identifiable as ozone values
+COPD_Incid_merged_ozone_df = COPD_Incid_merged_ozone_df.rename(columns={
+    'Max': 'ozone_max',
+    'Min': 'ozone_min',
+    'Mean': 'ozone_mean',
+    'Median': 'ozone_median',
+    'Std': 'ozone_std',
+})
+
+# Calculate pairwise correlations between the COPD columns and the ozone statistics
+# (DataFrame.corr() with no arguments uses pandas' default method)
+# NOTE(review): 'lower' and 'upper' presumably come from the incidence source file — confirm
+correlation_matrix = COPD_Incid_merged_ozone_df[[
+    'COPD_average', 'lower', 'upper',
+    'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std'
+]].corr()
+
+# Save correlation matrix to CSV
+correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+plt.figure(figsize=(12, 13))
+# annot=True writes each coefficient in its cell; vmin/vmax/center pin the colour scale to [-1, 1] around 0
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+plt.title('COPD Incidence vs. Ozone Correlation Heatmap')
+# The figure is written to disk; interactive display is left disabled below
+plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_ozone.png')
+# plt.show()
+
diff --git a/COPD_Incid_ozone_breathright_data_prep.py b/COPD_Incid_ozone_breathright_data_prep.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+import re
+
+# Paths to your data
+ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
+COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
+pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"
+
+# Read all the csv into pandas dataframe in memory
+# NOTE(review): pm25_df is not referenced again in this script — confirm whether this read is still needed.
+ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
+COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)
+pm25_df = pd.read_csv(pm25_data_path)
+
+# Show which columns the incidence file provides
+print(COPD_incidence_df.columns)
+
+# Convert 'Date Local' to datetime format; errors='coerce' turns unparseable values into NaT
+ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce')
+
+# Rename columns to be consistent
+#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)
+
+# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
+#COPD_incidence_long = COPD_incidence_df.melt(
+    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
+    #var_name='year',
+    #value_name='Mortality'
+#)
+
+# Extract the year from the 'year' column using string operations and convert it to an integer.
+#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int)
+
+# Function to split the 'COPD Incidence' column
+def split_COPD_Incidence_column_into_three(COPD_Incidence):
+    """Split one "avg (min, max)" cell into three floats.
+
+    Args:
+        COPD_Incidence: Cell text such as "12.5 (10.1, 14.9)". Whole numbers
+            ("12 (10, 14)") and extra spaces around the numbers, the
+            parentheses or the comma are accepted.
+
+    Returns:
+        pandas.Series indexed by 'COPD_average', 'COPD_min', 'COPD_max'.
+        It holds floats when the text starts with the expected shape, and
+        three None values otherwise, including when the input is not a
+        string (for example a float NaN).
+    """
+    labels = ['COPD_average', 'COPD_min', 'COPD_max']
+    # re.match raises TypeError on non-string input, so treat such cells as unparseable
+    if not isinstance(COPD_Incidence, str):
+        return pd.Series([None, None, None], index=labels)
+    # A number is a run of digits with an optional fractional part
+    number = r'(\d+(?:\.\d+)?)'
+    match = re.match(rf'\s*{number}\s*\(\s*{number}\s*,\s*{number}\s*\)', COPD_Incidence)
+    if match:
+        return pd.Series([float(part) for part in match.groups()], index=labels)
+    return pd.Series([None, None, None], index=labels)
+
+# Cast 'COPD Incidence' to text so every cell can go through the regex parser;
+# a missing value becomes the text 'nan', which the parser does not match and maps to None.
+COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)
+
+
+# Apply the function to split the 'COPD Incidence' column into three numeric columns
+COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)
+
+# Drop the original 'COPD Incidence' column if no longer needed
+#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])
+
+print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)
+
+# Convert the daily ozone into yearly data
+ozone_df['year'] = ozone_df['Date Local'].dt.year
+
+# Group the daily 'Arithmetic Mean' readings by state, county and year
+grouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']
+
+# Compute statistics for each group
+stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
+
+# Rename columns for clarity
+stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']
+
+# Add lowercase copies of the county and state names, and make 'year' an integer in both DataFrames
+stats_df['county'] = stats_df['County Name'].str.lower()
+stats_df['state'] = stats_df['State Name'].str.lower()
+stats_df['year'] = stats_df['year'].astype(int)
+
+#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()
+COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)
+
+# Print the results
+print("Ozone aggregated yearly data:", stats_df)
+print("COPD Incidence data header:", COPD_incidence_df.head())
+
+# Merge the yearly ozone statistics with the COPD incidence DataFrame.
+# The inner join keeps only ('State Name', 'year') pairs present in both frames.
+# NOTE(review): stats_df has one row per county, so each incidence row is repeated for
+# every county row of the same state and year — confirm this is intended.
+# NOTE(review): the join on 'State Name' is case-sensitive — confirm both sources use the same casing.
+COPD_Incid_merged_ozone_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')
+
+print("Merged DataFrame:", COPD_Incid_merged_ozone_df.head())
+
+# Save to a CSV file (plain string literal: the path has no placeholders, so no f-prefix)
+COPD_Incid_merged_ozone_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv', index=False)
+
+
+
+
+
+
+
diff --git a/COPD_Incid_pm25_breathright_correlation.py b/COPD_Incid_pm25_breathright_correlation.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Location of the merged COPD-incidence / PM2.5 table
+file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv'
+
+# Columns that the correlation analysis below does not use
+columns_to_discard = ['COPD_min', 'COPD_max', 'county', 'County Name']
+
+# Read the table and discard those columns in a single expression
+COPD_Incid_merged_pm25_df = pd.read_csv(file_path).drop(columns=columns_to_discard)
+
+# Preview the first rows and list the columns that remain
+print(COPD_Incid_merged_pm25_df.head())
+print(COPD_Incid_merged_pm25_df.columns)
+
+def do_breatheright_correlation_analysis():
+    """Load the merged COPD-incidence / PM2.5 CSV, print a preview, and return it.
+
+    The frame is bound to a local name, so the module-level
+    ``COPD_Incid_merged_pm25_df`` is not modified by calling this function.
+
+    Returns:
+        pandas.DataFrame: the table exactly as read from disk.
+    """
+    # Plain string literal: the path contains no placeholders, so no f-prefix is needed
+    merged_df = pd.read_csv("/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv")
+    print(merged_df.head())
+    print(merged_df.columns)
+    # Hand the frame back so a caller can actually use what was loaded
+    return merged_df
+
+# Drop the unnecessary columns
+# COPD_merged_pm25_df = COPD_merged_pm25_df.drop(columns=["county_x", 'State Name', 'county_y', 'State Name_y'])
+
+# Figuring out which columns to drop
+# Values in 'county_x' but not in 'county_y'
+#county_x_not_in_county_y = set(ILD_merged_pm25_df['county_x'].dropna()).difference(set(ILD_merged_pm25_df['county_y'].dropna()))
+#print("Values in 'county_x' but not in 'county_y':")
+#print(county_x_not_in_county_y)
+
+# Values in 'county_y' but not in 'county_x'
+#county_y_not_in_county_x = set(ILD_merged_pm25_df['county_y'].dropna()).difference(set(ILD_merged_pm25_df['county_x'].dropna()))
+#print("\nValues in 'county_y' but not in 'county_x':")
+#print(county_y_not_in_county_x)
+
+# Check for null values
+#print("\nNull values in 'county_x':")
+#print(ILD_merged_pm25_df['county_x'].isnull().sum())
+
+#print("\nNull values in 'county_y':")
+#print(ILD_merged_pm25_df['county_y'].isnull().sum())
+
+# Drop the 'county_x' column
+#ILD_merged_pm25_df = ILD_merged_pm25_df.drop(columns=['county_x'])
+
+# --- State-column sanity check, then correlation of COPD incidence with PM2.5 statistics ---
+# These statements sit at module level (column 0), so they run when the script runs;
+# they are not part of do_breatheright_correlation_analysis().
+
+# Convert state names in 'State Name' to lowercase
+COPD_Incid_merged_pm25_df['State Name'] = COPD_Incid_merged_pm25_df['State Name'].str.lower()
+
+# Convert state names in 'state' to lowercase
+COPD_Incid_merged_pm25_df['state'] = COPD_Incid_merged_pm25_df['state'].str.lower()
+
+# Get unique non-null values in 'State Name' and 'state' (both already lowercased above)
+state_name_values = set(COPD_Incid_merged_pm25_df['State Name'].dropna().unique())
+state_values = set(COPD_Incid_merged_pm25_df['state'].dropna().unique())
+
+# Find differences in each direction
+diff_state_name_not_in_state = state_name_values - state_values
+diff_state_not_in_state_name = state_values - state_name_values
+
+# Print the differences (for inspection only; nothing below branches on them)
+print("Values in 'State Name' but not in 'state':")
+print(diff_state_name_not_in_state)
+
+print("\nValues in 'state' but not in 'State Name':")
+print(diff_state_not_in_state_name)
+
+# Check for null values
+print("\nNull values in 'State Name':")
+print(COPD_Incid_merged_pm25_df['State Name'].isnull().sum())
+
+print("\nNull values in 'state':")
+print(COPD_Incid_merged_pm25_df['state'].isnull().sum())
+
+# Drop the 'state' column; 'State Name' is the one that is kept
+COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.drop(columns=['state'])
+
+# Renaming the generic statistic columns so they are identifiable as PM2.5 values
+COPD_Incid_merged_pm25_df = COPD_Incid_merged_pm25_df.rename(columns={
+    'Max': 'pm25_max',
+    'Min': 'pm25_min',
+    'Mean': 'pm25_mean',
+    'Median': 'pm25_median',
+    'Std': 'pm25_std',
+})
+
+# Calculate pairwise correlations between the COPD columns and the PM2.5 statistics
+# (DataFrame.corr() with no arguments uses pandas' default method)
+# NOTE(review): 'lower' and 'upper' presumably come from the incidence source file — confirm
+correlation_matrix = COPD_Incid_merged_pm25_df[[
+    'COPD_average', 'lower', 'upper',
+    'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std'
+]].corr()
+
+# Save correlation matrix to CSV
+correlation_matrix.to_csv(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_correlation_matrix.csv')
+
+# Plot Correlation Heatmap
+plt.figure(figsize=(12, 13))
+# annot=True writes each coefficient in its cell; vmin/vmax/center pin the colour scale to [-1, 1] around 0
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
+plt.title('COPD Incidence vs. PM2.5 Correlation Heatmap')
+# The figure is written to disk; interactive display is left disabled below
+plt.savefig(f'/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_correlation_heatmap_pm25.png')
+# plt.show()
+
diff --git a/COPD_Incid_pm25_breathright_data_prep.py b/COPD_Incid_pm25_breathright_data_prep.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd
+import re
+
+# Paths to your data
+ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
+COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
+pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"
+
+# Read all the csv into pandas dataframe in memory
+# NOTE(review): ozone_df is not referenced again in this script — confirm whether this read is still needed.
+ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
+COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)
+pm25_df = pd.read_csv(pm25_data_path)
+
+# Show which columns the incidence file provides
+print(COPD_incidence_df.columns)
+
+# Convert 'Date Local' to datetime format; errors='coerce' turns unparseable values into NaT
+pm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce')
+
+# Rename columns to be consistent
+#COPD_incidence_df.rename(columns={'Location': 'County Name'}, inplace=True)
+
+# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
+#COPD_incidence_long = COPD_incidence_df.melt(
+    #id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
+    #var_name='year',
+    #value_name='Mortality'
+#)
+
+# Extract the year from the 'year' column using string operations and convert it to an integer.
+#ILD_long['year'] = ILD_long['year'].str.extract(r'(\d{4})').astype(int)
+
+# Function to split the 'COPD Incidence' column
+def split_COPD_Incidence_column_into_three(COPD_Incidence):
+    """Split one "avg (min, max)" cell into three floats.
+
+    Args:
+        COPD_Incidence: Cell text such as "12.5 (10.1, 14.9)". Whole numbers
+            ("12 (10, 14)") and extra spaces around the numbers, the
+            parentheses or the comma are accepted.
+
+    Returns:
+        pandas.Series indexed by 'COPD_average', 'COPD_min', 'COPD_max'.
+        It holds floats when the text starts with the expected shape, and
+        three None values otherwise, including when the input is not a
+        string (for example a float NaN).
+    """
+    labels = ['COPD_average', 'COPD_min', 'COPD_max']
+    # re.match raises TypeError on non-string input, so treat such cells as unparseable
+    if not isinstance(COPD_Incidence, str):
+        return pd.Series([None, None, None], index=labels)
+    # A number is a run of digits with an optional fractional part
+    number = r'(\d+(?:\.\d+)?)'
+    match = re.match(rf'\s*{number}\s*\(\s*{number}\s*,\s*{number}\s*\)', COPD_Incidence)
+    if match:
+        return pd.Series([float(part) for part in match.groups()], index=labels)
+    return pd.Series([None, None, None], index=labels)
+
+# Cast 'COPD Incidence' to text so every cell can go through the regex parser;
+# a missing value becomes the text 'nan', which the parser does not match and maps to None.
+COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)
+
+
+# Apply the function to split the 'COPD Incidence' column into three numeric columns
+COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)
+
+# Drop the original 'COPD Incidence' column if no longer needed
+#COPD_incidence_df = COPD_incidence_df.drop(columns=['COPD In'])
+
+print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)
+
+# Convert the daily PM2.5 readings into yearly data
+pm25_df['year'] = pm25_df['Date Local'].dt.year
+
+# Group the daily 'Arithmetic Mean' readings by state, county and year
+grouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']
+
+# Compute statistics for each group
+stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
+
+# Rename columns for clarity
+stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']
+
+# Add lowercase copies of the county and state names, and make 'year' an integer in both DataFrames
+stats_df['county'] = stats_df['County Name'].str.lower()
+stats_df['state'] = stats_df['State Name'].str.lower()
+stats_df['year'] = stats_df['year'].astype(int)
+
+#COPD_incidence_df['county'] = COPD_incidence_df['County Name'].str.strip().str.lower()
+COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)
+
+# Print the results (the label names PM2.5 because stats_df is built from pm25_df)
+print("PM2.5 aggregated yearly data:", stats_df)
+print("COPD Incidence data header:", COPD_incidence_df.head())
+
+# Merge the yearly PM2.5 statistics with the COPD incidence DataFrame.
+# The inner join keeps only ('State Name', 'year') pairs present in both frames.
+# NOTE(review): stats_df has one row per county, so each incidence row is repeated for
+# every county row of the same state and year — confirm this is intended.
+# NOTE(review): the join on 'State Name' is case-sensitive — confirm both sources use the same casing.
+COPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')
+
+print("Merged DataFrame:", COPD_Incid_merged_pm25_df.head())
+
+# Save to a CSV file (plain string literal: the path has no placeholders, so no f-prefix)
+COPD_Incid_merged_pm25_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False)
+
+
+
+
+
+
+