Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

totalCOPD_heatmap_workflow #2

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions COPD_Incid_ozone_breathright_correlation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the merged COPD-incidence / ozone table read below.
file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv'

# Read the table and immediately discard the columns that are not wanted in
# the preview printed below.
COPD_Incid_merged_ozone_df = pd.read_csv(file_path).drop(
    labels=['COPD_min', 'COPD_max', 'county', 'County Name'],
    axis='columns',
)

# Sanity check: show the leading rows, then the remaining column labels.
print(COPD_Incid_merged_ozone_df.head())
print(COPD_Incid_merged_ozone_df.columns)

def do_breatheright_correlation_analysis(
    data_dir='/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD',
):
    """Correlate COPD incidence with yearly ozone statistics and plot a heatmap.

    Reads ``COPD_Incid_ozone_merged_df.csv`` from *data_dir*, prints a few
    diagnostics comparing the ``'State Name'`` and ``'state'`` columns,
    computes the pairwise correlation matrix of the COPD and ozone columns,
    and writes the matrix (CSV) and a heatmap (PNG) back into *data_dir*.

    Args:
        data_dir: Directory that holds the merged CSV and receives the two
            output files. Defaults to the directory that used to be
            hard-coded, so calling with no arguments behaves as before.

    Returns:
        The correlation matrix as a ``pandas.DataFrame``.

    Raises:
        KeyError: Raised by pandas when an expected column is missing from
            the merged CSV.
    """
    # NOTE(review): nothing in the visible part of this file calls this
    # function -- confirm it is invoked from somewhere.
    # Local import so the file's top-level import block stays untouched.
    from pathlib import Path

    base_dir = Path(data_dir)

    merged_df = pd.read_csv(base_dir / 'COPD_Incid_ozone_merged_df.csv')
    print(merged_df.head())
    print(merged_df.columns)

    # Lower-case both state columns so the comparison below ignores case.
    merged_df['State Name'] = merged_df['State Name'].str.lower()
    merged_df['state'] = merged_df['state'].str.lower()

    # Report values present in one state column but not in the other.
    state_name_values = set(merged_df['State Name'].dropna().unique())
    state_values = set(merged_df['state'].dropna().unique())

    print("Values in 'State Name' but not in 'state':")
    print(state_name_values - state_values)

    print("\nValues in 'state' but not in 'State Name':")
    print(state_values - state_name_values)

    # Report how many entries are missing in each state column.
    print("\nNull values in 'State Name':")
    print(merged_df['State Name'].isnull().sum())

    print("\nNull values in 'state':")
    print(merged_df['state'].isnull().sum())

    # Keep only 'State Name'; 'state' is dropped after the comparison above.
    merged_df = merged_df.drop(columns=['state'])

    # Prefix the generic statistic names so they read as ozone statistics.
    merged_df = merged_df.rename(columns={
        'Max': 'ozone_max',
        'Min': 'ozone_min',
        'Mean': 'ozone_mean',
        'Median': 'ozone_median',
        'Std': 'ozone_std',
    })

    correlation_matrix = merged_df[[
        'COPD_average', 'lower', 'upper',
        'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median', 'ozone_std',
    ]].corr()

    correlation_matrix.to_csv(base_dir / 'COPD_Incid_ozone_correlation_matrix.csv')

    # Fixed colour range of [-1, 1] centred on 0, matching the range of a
    # correlation coefficient.
    plt.figure(figsize=(12, 13))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('COPD Incidence vs. Ozone Correlation Heatmap')
    plt.savefig(base_dir / 'COPD_Incid_correlation_heatmap_ozone.png')
    # The figure is deliberately left open so a caller can still plt.show() it.

    return correlation_matrix

91 changes: 91 additions & 0 deletions COPD_Incid_ozone_breathright_data_prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
import pandas as pd
import re

# Input files for the ozone / COPD-incidence preparation step.
ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
# Path kept for reference only: this script never reads the PM2.5 table.
pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"

# Load only the tables this script actually uses. The PM2.5 CSV used to be
# read here as well, but nothing below referenced the resulting frame.
ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])
COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)

print(COPD_incidence_df.columns)

# Second pass over 'Date Local': any value that cannot be interpreted as a
# date becomes NaT instead of raising.
ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], errors='coerce')

# Split a combined "value (low, high)" COPD-incidence string into three numbers.
def split_COPD_Incidence_column_into_three(COPD_Incidence):
    """Parse a string shaped like ``'12.3 (10.1, 14.5)'`` into three floats.

    Args:
        COPD_Incidence: The raw cell value. Anything that is not a string
            (for example a float NaN) is treated as unparseable.

    Returns:
        A ``pandas.Series`` indexed by ``'COPD_average'``, ``'COPD_min'`` and
        ``'COPD_max'``. The three entries are floats when the text matches
        the expected shape and ``None`` otherwise.
    """
    labels = ['COPD_average', 'COPD_min', 'COPD_max']

    # The fractional part is optional so whole numbers such as "12 (10, 14)"
    # are parsed instead of being turned into missing values.
    number = r'(\d+(?:\.\d+)?)'
    pattern = number + r' \(' + number + r', ' + number + r'\)'

    # Guard against non-string cells: re.match would raise TypeError on them.
    found = re.match(pattern, COPD_Incidence) if isinstance(COPD_Incidence, str) else None
    if found is None:
        return pd.Series([None, None, None], index=labels)

    return pd.Series([float(part) for part in found.groups()], index=labels)

# Cast the raw 'COPD Incidence' cells to str so the regex-based splitter is
# always handed text.
COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)

# Expand every 'COPD Incidence' string into three numeric columns.
COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = (
    COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)
)

print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)

# Derive the calendar year of each daily ozone record.
ozone_df['year'] = ozone_df['Date Local'].dt.year

# One group per (state, county, year), restricted to the 'Arithmetic Mean' column.
grouped = ozone_df.groupby(by=['State Name', 'County Name', 'year'])['Arithmetic Mean']

# Yearly summary statistics, with the group keys turned back into columns.
stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std'])
stats_df = stats_df.reset_index()

# Capitalised statistic names; the three key columns keep their labels.
stats_df.columns = ['State Name', 'County Name', 'year'] + ['Max', 'Min', 'Mean', 'Median', 'Std']

# Lower-cased copies of the place names, and an integer year on both frames.
stats_df['county'] = stats_df['County Name'].str.lower()
stats_df['state'] = stats_df['State Name'].str.lower()
stats_df['year'] = stats_df['year'].astype(int)

COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)

# Show both inputs of the merge.
print("Ozone aggregated yearly data:", stats_df)
print("COPD Incidence data header:", COPD_incidence_df.head())

# Inner join: keep only the (state, year) pairs present in both tables.
COPD_Incid_merged_ozone_df = pd.merge(
    left=COPD_incidence_df,
    right=stats_df,
    how='inner',
    on=['State Name', 'year'],
)

print("Merged DataFrame:", COPD_Incid_merged_ozone_df.head())

# Persist the merged table, without the row index.
COPD_Incid_merged_ozone_df.to_csv(
    '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_ozone_merged_df.csv',
    index=False,
)







102 changes: 102 additions & 0 deletions COPD_Incid_pm25_breathright_correlation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the merged COPD-incidence / PM2.5 table read below.
file_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv'

# Read the table and immediately discard the columns that are not wanted in
# the preview printed below.
COPD_Incid_merged_pm25_df = pd.read_csv(file_path).drop(
    labels=['COPD_min', 'COPD_max', 'county', 'County Name'],
    axis='columns',
)

# Sanity check: show the leading rows, then the remaining column labels.
print(COPD_Incid_merged_pm25_df.head())
print(COPD_Incid_merged_pm25_df.columns)

def do_breatheright_correlation_analysis(
    data_dir='/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD',
):
    """Correlate COPD incidence with yearly PM2.5 statistics and plot a heatmap.

    Reads ``COPD_Incid_pm25_merged_df.csv`` from *data_dir*, prints a few
    diagnostics comparing the ``'State Name'`` and ``'state'`` columns,
    computes the pairwise correlation matrix of the COPD and PM2.5 columns,
    and writes the matrix (CSV) and a heatmap (PNG) back into *data_dir*.

    Args:
        data_dir: Directory that holds the merged CSV and receives the two
            output files. Defaults to the directory that used to be
            hard-coded, so calling with no arguments behaves as before.

    Returns:
        The correlation matrix as a ``pandas.DataFrame``.

    Raises:
        KeyError: Raised by pandas when an expected column is missing from
            the merged CSV.
    """
    # NOTE(review): nothing in the visible part of this file calls this
    # function -- confirm it is invoked from somewhere.
    # Local import so the file's top-level import block stays untouched.
    from pathlib import Path

    base_dir = Path(data_dir)

    merged_df = pd.read_csv(base_dir / 'COPD_Incid_pm25_merged_df.csv')
    print(merged_df.head())
    print(merged_df.columns)

    # Lower-case both state columns so the comparison below ignores case.
    merged_df['State Name'] = merged_df['State Name'].str.lower()
    merged_df['state'] = merged_df['state'].str.lower()

    # Report values present in one state column but not in the other.
    state_name_values = set(merged_df['State Name'].dropna().unique())
    state_values = set(merged_df['state'].dropna().unique())

    print("Values in 'State Name' but not in 'state':")
    print(state_name_values - state_values)

    print("\nValues in 'state' but not in 'State Name':")
    print(state_values - state_name_values)

    # Report how many entries are missing in each state column.
    print("\nNull values in 'State Name':")
    print(merged_df['State Name'].isnull().sum())

    print("\nNull values in 'state':")
    print(merged_df['state'].isnull().sum())

    # Keep only 'State Name'; 'state' is dropped after the comparison above.
    merged_df = merged_df.drop(columns=['state'])

    # Prefix the generic statistic names so they read as PM2.5 statistics.
    merged_df = merged_df.rename(columns={
        'Max': 'pm25_max',
        'Min': 'pm25_min',
        'Mean': 'pm25_mean',
        'Median': 'pm25_median',
        'Std': 'pm25_std',
    })

    correlation_matrix = merged_df[[
        'COPD_average', 'lower', 'upper',
        'pm25_max', 'pm25_min', 'pm25_mean', 'pm25_median', 'pm25_std',
    ]].corr()

    correlation_matrix.to_csv(base_dir / 'COPD_Incid_pm25_correlation_matrix.csv')

    # Fixed colour range of [-1, 1] centred on 0, matching the range of a
    # correlation coefficient.
    plt.figure(figsize=(12, 13))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('COPD Incidence vs. PM2.5 Correlation Heatmap')
    plt.savefig(base_dir / 'COPD_Incid_correlation_heatmap_pm25.png')
    # The figure is deliberately left open so a caller can still plt.show() it.

    return correlation_matrix

91 changes: 91 additions & 0 deletions COPD_Incid_pm25_breathright_data_prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
import pandas as pd
import re

# Input files for the PM2.5 / COPD-incidence preparation step.
# Path kept for reference only: this script never reads the ozone table.
ozone_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv"
COPD_incidence_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv"
pm25_data_path = "/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv"

# Load only the tables this script actually uses. The ozone CSV used to be
# read (and date-parsed) here as well, but nothing below referenced the
# resulting frame.
COPD_incidence_df = pd.read_csv(COPD_incidence_data_path)
pm25_df = pd.read_csv(pm25_data_path)

print(COPD_incidence_df.columns)

# Parse 'Date Local'; any value that cannot be interpreted as a date becomes
# NaT instead of raising.
pm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], errors='coerce')

# Split a combined "value (low, high)" COPD-incidence string into three numbers.
def split_COPD_Incidence_column_into_three(COPD_Incidence):
    """Parse a string shaped like ``'12.3 (10.1, 14.5)'`` into three floats.

    Args:
        COPD_Incidence: The raw cell value. Anything that is not a string
            (for example a float NaN) is treated as unparseable.

    Returns:
        A ``pandas.Series`` indexed by ``'COPD_average'``, ``'COPD_min'`` and
        ``'COPD_max'``. The three entries are floats when the text matches
        the expected shape and ``None`` otherwise.
    """
    labels = ['COPD_average', 'COPD_min', 'COPD_max']

    # The fractional part is optional so whole numbers such as "12 (10, 14)"
    # are parsed instead of being turned into missing values.
    number = r'(\d+(?:\.\d+)?)'
    pattern = number + r' \(' + number + r', ' + number + r'\)'

    # Guard against non-string cells: re.match would raise TypeError on them.
    found = re.match(pattern, COPD_Incidence) if isinstance(COPD_Incidence, str) else None
    if found is None:
        return pd.Series([None, None, None], index=labels)

    return pd.Series([float(part) for part in found.groups()], index=labels)

# Cast the raw 'COPD Incidence' cells to str so the regex-based splitter is
# always handed text.
COPD_incidence_df['COPD Incidence'] = COPD_incidence_df['COPD Incidence'].astype(str)

# Expand every 'COPD Incidence' string into three numeric columns.
COPD_incidence_df[['COPD_average', 'COPD_min', 'COPD_max']] = COPD_incidence_df['COPD Incidence'].apply(split_COPD_Incidence_column_into_three)

print("COPD_Incidence DataFrame columns:", COPD_incidence_df.columns)

# Derive the calendar year of each daily PM2.5 record.
pm25_df['year'] = pm25_df['Date Local'].dt.year

# One group per (state, county, year), restricted to the 'Arithmetic Mean' column.
grouped = pm25_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']

# Yearly summary statistics, with the group keys turned back into columns.
stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()

# Capitalised statistic names; the three key columns keep their labels.
stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']

# Lower-cased copies of the place names, and an integer year on both frames.
stats_df['county'] = stats_df['County Name'].str.lower()
stats_df['state'] = stats_df['State Name'].str.lower()
stats_df['year'] = stats_df['year'].astype(int)

COPD_incidence_df['year'] = COPD_incidence_df['year'].astype(int)

# Show both inputs of the merge. The first label used to say "Ozone", a
# copy-paste leftover: this table holds the PM2.5 statistics.
print("PM2.5 aggregated yearly data:", stats_df)
print("COPD Incidence data header:", COPD_incidence_df.head())

# Inner join: keep only the (state, year) pairs present in both tables.
COPD_Incid_merged_pm25_df = pd.merge(COPD_incidence_df, stats_df, on=['State Name', 'year'], how='inner')

print("Merged DataFrame:", COPD_Incid_merged_pm25_df.head())

# Persist the merged table, without the row index.
COPD_Incid_merged_pm25_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_Incid_pm25_merged_df.csv', index=False)







Loading