Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

solved #325

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
2,389 changes: 2,389 additions & 0 deletions .ipynb_checkpoints/lab-dw-data-cleaning-and-formatting-checkpoint.ipynb

Large diffs are not rendered by default.

Binary file added data_cleaning/.DS_Store
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# clean column names
import pandas as pd

# Normalise the column names: lower-case them, replace spaces with hyphens,
# and expand the abbreviated "st" column to "state".
# The "st" check runs after lower-casing and against the whole name, so the
# "ST" column is really renamed while names that merely contain "st"
# (e.g. "customer") are left untouched.
normalized_names = (column.lower().replace(" ", "-") for column in df.columns)
df.columns = ["state" if name == "st" else name for name in normalized_names]
83 changes: 83 additions & 0 deletions data_cleaning/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@


%%writefile data_cleaning/__clean_column_names__.py
# clean column names
import pandas as pd

# Normalise the column names: lower-case them, replace spaces with hyphens,
# and expand the abbreviated "st" column to "state".
# The "st" check runs after lower-casing and against the whole name, so the
# "ST" column is really renamed while names that merely contain "st"
# (e.g. "customer") are left untouched.
normalized_names = (column.lower().replace(" ", "-") for column in df.columns)
df.columns = ["state" if name == "st" else name for name in normalized_names]


%%writefile data_cleaning/clean_gender_column.py
# Clean the gender column.
def clean_gender_column(df):
    """Normalise the ``gender`` column of *df* to the codes "M" and "F".

    Known spellings are matched case-insensitively and ignoring surrounding
    whitespace ("Male", "female", "Femal", "Female", "m", ...). Non-string
    values (e.g. NaN) and unrecognised strings are left unchanged.

    Args:
        df: DataFrame with a ``gender`` column.

    Returns:
        The same DataFrame object; its ``gender`` column is replaced in place.
    """
    gender_codes = {
        "m": "M",
        "male": "M",
        "f": "F",
        "femal": "F",
        "female": "F",
    }

    def gender_category(value):
        # Missing values are not strings; pass them through untouched.
        if not isinstance(value, str):
            return value
        return gender_codes.get(value.strip().casefold(), value)

    df["gender"] = df["gender"].apply(gender_category)
    return df


%%writefile data_cleaning/clv_to_intr.py
# Convert CLV from percentage strings to numbers.
def clv_to_intr(df):
    """Convert the ``customer-lifetime-value`` column from "12.3%" strings to floats.

    String values have surrounding whitespace and "%" characters removed and
    are parsed with ``float``. Non-string values (numbers, NaN) are left
    unchanged. Despite the function name, the result is a float, not an int.

    Args:
        df: DataFrame with a ``customer-lifetime-value`` column.

    Returns:
        The same DataFrame object; the column is replaced in place.

    Raises:
        ValueError: if a string value is not a valid number once the "%"
            characters are removed.
    """
    def remove_percentage_char(value):
        if isinstance(value, str):
            # Strip whitespace first so a trailing "% " does not break float().
            return float(value.strip().strip("%"))
        return value

    df["customer-lifetime-value"] = df["customer-lifetime-value"].apply(remove_percentage_char)
    return df



%%writefile data_cleaning/clean_number_of_complains.py
# Convert the number of open complaints into an integer.
def clean_number_of_complains(value):
    """Extract the middle field of a slash-separated string as an int.

    For example "1/5/00" becomes 5. Non-string values and strings without a
    "/" separator are returned unchanged.

    Args:
        value: A single cell value from the complaints column.

    Returns:
        The integer after the first "/", or *value* itself when it is not a
        slash-separated string.

    Raises:
        ValueError: if the field after the first "/" is not an integer.
    """
    if not isinstance(value, str):
        return value
    fields = value.split("/")
    if len(fields) < 2:
        # Not in the slash-separated format; leave the value as it is
        # instead of failing with an IndexError.
        return value
    return int(fields[1])

# Run the cleaner over every cell of the complaints column and store the result.
open_complaints = df["number-of-open-complaints"]
df["number-of-open-complaints"] = open_complaints.apply(clean_number_of_complains)



%%writefile data_cleaning/drop_full_null_columns.py
# Drop the rows in which every value is null; rows holding at least one
# non-null value are kept.
df = df.dropna(axis="index", how="all")



%%writefile data_cleaning/null_gender_to_mode.py
# Fill null values in gender with the most frequent value (the mode).
# mode() returns an empty Series when the column has no non-null value, so
# only fill when a mode exists instead of failing on the [0] lookup.
gender_mode = df["gender"].mode()
if not gender_mode.empty:
    df["gender"] = df["gender"].fillna(gender_mode.iloc[0])



%%writefile data_cleaning/null_cv_to_median.py
# Replace missing customer-lifetime-value entries with the column median.
clv = df["customer-lifetime-value"]
df["customer-lifetime-value"] = clv.fillna(clv.median())



%%writefile data_cleaning/convert_full_df_to_int.py
# Convert every numeric column to int while keeping the other columns.
# Rebinding df to the select_dtypes() result alone would silently drop all
# non-numeric columns, so only the names of the numeric columns are taken
# from it and the cast is applied to the full frame.
# NOTE: astype("int") raises if a numeric column still contains NaN.
numeric_columns = df.select_dtypes(include=["number"]).columns
df = df.astype({column: "int" for column in numeric_columns})



%%writefile data_cleaning/remove_duplicates.py
# Remove duplicate rows (keeping the first occurrence) and save the
# de-duplicated data to a new file.
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so its result must be kept; otherwise the CSV would still hold duplicates.
df_duplicates = df.drop_duplicates(keep="first")

df_duplicates.to_csv("./data/df_duplicates_lab.csv")

4 changes: 4 additions & 0 deletions data_cleaning/clean_column_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# clean column names
import pandas as pd

# Normalise the column names: lower-case them, replace spaces with hyphens,
# and expand the abbreviated "st" column to "state".
# The "st" check runs after lower-casing and against the whole name, so the
# "ST" column is really renamed while names that merely contain "st"
# (e.g. "customer") are left untouched.
normalized_names = (column.lower().replace(" ", "-") for column in df.columns)
df.columns = ["state" if name == "st" else name for name in normalized_names]
14 changes: 14 additions & 0 deletions data_cleaning/clean_gender_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Clean the gender column.
def clean_gender_column(df):
    """Normalise the ``gender`` column of *df* to the codes "M" and "F".

    Known spellings are matched case-insensitively and ignoring surrounding
    whitespace ("Male", "female", "Femal", "Female", "m", ...). Non-string
    values (e.g. NaN) and unrecognised strings are left unchanged.

    Args:
        df: DataFrame with a ``gender`` column.

    Returns:
        The same DataFrame object; its ``gender`` column is replaced in place.
    """
    gender_codes = {
        "m": "M",
        "male": "M",
        "f": "F",
        "femal": "F",
        "female": "F",
    }

    def gender_category(value):
        # Missing values are not strings; pass them through untouched.
        if not isinstance(value, str):
            return value
        return gender_codes.get(value.strip().casefold(), value)

    df["gender"] = df["gender"].apply(gender_category)
    return df
8 changes: 8 additions & 0 deletions data_cleaning/clean_number_of_complains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Convert the number of open complaints into an integer.
def clean_number_of_complains(value):
    """Extract the middle field of a slash-separated string as an int.

    For example "1/5/00" becomes 5. Non-string values and strings without a
    "/" separator are returned unchanged.

    Args:
        value: A single cell value from the complaints column.

    Returns:
        The integer after the first "/", or *value* itself when it is not a
        slash-separated string.

    Raises:
        ValueError: if the field after the first "/" is not an integer.
    """
    if not isinstance(value, str):
        return value
    fields = value.split("/")
    if len(fields) < 2:
        # Not in the slash-separated format; leave the value as it is
        # instead of failing with an IndexError.
        return value
    return int(fields[1])

# Run the cleaner over every cell of the complaints column and store the result.
open_complaints = df["number-of-open-complaints"]
df["number-of-open-complaints"] = open_complaints.apply(clean_number_of_complains)
9 changes: 9 additions & 0 deletions data_cleaning/clv_to_intr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Convert CLV from percentage strings to numbers.
def clv_to_intr(df):
    """Convert the ``customer-lifetime-value`` column from "12.3%" strings to floats.

    String values have surrounding whitespace and "%" characters removed and
    are parsed with ``float``. Non-string values (numbers, NaN) are left
    unchanged. Despite the function name, the result is a float, not an int.

    Args:
        df: DataFrame with a ``customer-lifetime-value`` column.

    Returns:
        The same DataFrame object; the column is replaced in place.

    Raises:
        ValueError: if a string value is not a valid number once the "%"
            characters are removed.
    """
    def remove_percentage_char(value):
        if isinstance(value, str):
            # Strip whitespace first so a trailing "% " does not break float().
            return float(value.strip().strip("%"))
        return value

    df["customer-lifetime-value"] = df["customer-lifetime-value"].apply(remove_percentage_char)
    return df
2 changes: 2 additions & 0 deletions data_cleaning/convert_full_df_to_int.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Convert every numeric column to int while keeping the other columns.
# Rebinding df to the select_dtypes() result alone would silently drop all
# non-numeric columns, so only the names of the numeric columns are taken
# from it and the cast is applied to the full frame.
# NOTE: astype("int") raises if a numeric column still contains NaN.
numeric_columns = df.select_dtypes(include=["number"]).columns
df = df.astype({column: "int" for column in numeric_columns})
2 changes: 2 additions & 0 deletions data_cleaning/drop_full_null_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Drop the rows in which every value is null; rows holding at least one
# non-null value are kept.
df = df.dropna(axis="index", how="all")
2 changes: 2 additions & 0 deletions data_cleaning/null_cv_to_median.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Replace missing customer-lifetime-value entries with the column median.
clv = df["customer-lifetime-value"]
df["customer-lifetime-value"] = clv.fillna(clv.median())
2 changes: 2 additions & 0 deletions data_cleaning/null_gender_to_mode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Fill null values in gender with the most frequent value (the mode).
# mode() returns an empty Series when the column has no non-null value, so
# only fill when a mode exists instead of failing on the [0] lookup.
gender_mode = df["gender"].mode()
if not gender_mode.empty:
    df["gender"] = df["gender"].fillna(gender_mode.iloc[0])
6 changes: 6 additions & 0 deletions data_cleaning/remove_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Remove duplicate rows (keeping the first occurrence) and save the
# de-duplicated data to a new file.
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so its result must be kept; otherwise the CSV would still hold duplicates.
df_duplicates = df.drop_duplicates(keep="first")

df_duplicates.to_csv("./data/df_duplicates_lab.csv")
Loading