diff --git a/README.md b/README.md index 139c379..6b680bf 100644 --- a/README.md +++ b/README.md @@ -79,21 +79,21 @@ To set up and run the Data Clean-up Tool, follow these steps: - [x] Add enforcement of types. Say a column shows only numbers, let's say money. Enforce that you can't edit a text into there, only numeric values are allowed (https://dash.plotly.com/datatable/typing) - - [ ] Add a formatting setting that formats columns to a specified prefereance. For example, cost column will show $ sign and number type enforcement along with commas when needed (https://dash.plotly.com/datatable/typing) + - [x] Add a formatting setting that formats columns to a specified prefereance. For example, cost column will show $ sign and number type enforcement along with commas when needed (https://dash.plotly.com/datatable/typing) - [x] Adding or removing columns and rows - [ ] Update parse_content function to include 'xslx, xml, html" and "pdf" if we can (pdf is a bonus feature) - [x] Combine two or more data of the same format into one file - [z] After "Enforcing" dtypes or formatting, those cells are then highlighted (https://dash.plotly.com/datatable/conditional-formatting). We could also use for other use cases when highlighting is required. 
We should have a legend that says what each higlight color means - [x] dtype highlighting - [x] Highlighting None, NaN, or Empty String Values - - [ ] formatting highlighting + - [x] formatting highlighting - [z] Make legend for filtering operations/syntax (https://dash.plotly.com/datatable/filtering) - [ ] Testing (https://dash.plotly.com/testing) - Bonus Features: - [ ] Make a tab option for graphs (https://dash.plotly.com/dash-core-components/tab) - - [ ] Highlight Changes: Display changed cells in a different color for easier tracking + - [x] Highlight Changes: Display changed cells in a different color for easier tracking - [x] Add loading animation (https://dash.plotly.com/dash-core-components/loading) - [x] Make columns selection through a checkbox (https://dash.plotly.com/datatable/editable) - [ ] Displaying Errors with dash.no_update (https://dash.plotly.com/advanced-callbacks) diff --git a/callback_graph.png b/callback_graph.png deleted file mode 100644 index 43945da..0000000 Binary files a/callback_graph.png and /dev/null differ diff --git a/dashboard/__init__.py b/dashboard/__init__.py index 3f53e86..6f1bb90 100644 --- a/dashboard/__init__.py +++ b/dashboard/__init__.py @@ -10,19 +10,16 @@ import dashboard.utils.userPreferences as UserPreferences import dashboard.utils.dataAnalysis as DataAnalysis from .layout import layout +import json +import re cache = diskcache.Cache("./cache") long_callback_manager = DiskcacheManager(cache) - -# This is the main app object app = Dash(__name__, suppress_callback_exceptions=True) -# Improves load time by not loading all callbacks at once. 
5-10% improvement -# app.config.suppress_callback_exceptions = True - app.layout = layout -# region handleFile +# region handleFile ###################### UPLOAD FILE ###################### @callback( @@ -40,9 +37,39 @@ def upload_file(prevData, files, fileNames): if files is None: raise exceptions.PreventUpdate + return HandleFile.importFiles(prevData, files, fileNames) + +###################### DOWNLOAD FILE ###################### +@callback( + Output("download-file", "data"), + Output("notifications-container", "children", allow_duplicate=True), + Input("btn-download", "n_clicks"), + State('editable-table', 'data'), + State('editable-table', 'columns'), + State('file-type-select', 'value'), + prevent_initial_call=True, +) +def download_file(_, data, columns, fileType): + if (data == None or columns == None): + print("Nothing to export") + raise exceptions.PreventUpdate + + notification = dmc.Notification( + title="File Exported Successfuly!", + id="simple-notify", + color="green", + action="show", + autoClose=3000, + message='', + icon=DashIconify(icon="akar-icons:circle-alert"), + ) + + return HandleFile.exportFile(data, columns, fileType), notification +# endregion - return HandleFile.importFiles(prevData, files, fileNames) + +# region dataAnalysis ###################### Data Analytics ###################### @callback( @@ -94,22 +121,129 @@ def highlight_cells(submit_btn, highlight_empty_cells, highlight_dtype_cells, co if submit_btn: return new_highlighting + +# endregion + + +# region userPreferences + +###################### ENFORCE DATATYPES (OPEN MODAL) ###################### +@callback( + Output("enforce-dtypes-modal", "opened"), + Input("btn-enforce-dtypes", "n_clicks"), + Input("dtype-modal-close-button", "n_clicks"), + Input("dtype-modal-submit-button", "n_clicks"), + State("enforce-dtypes-modal", "opened"), + prevent_initial_call=True, +) +def enforce_dtypes_modal(nc1, nc2, nc3, opened): + return not opened + + +###################### ENFORCE 
DATATYPES (FILL MODAL WITH COLUMNS) ###################### +@callback( + Output("column-type-selector", "children"), + Input("enforce-dtypes-modal", "opened"), + State("editable-table", "columns"), + prevent_initial_call=True, +) +def populate_datatype_selection(opened, columns): + if not opened or not columns: + return dmc.Text("Upload a file to enforce datatypes!", style={"color": "black", "fontWeight": "bold", "textAlign": "center"}) + + return UserPreferences.populate_datatype_selection(opened, columns) + + +###################### ENFORCE DATATYPES (SUBMIT MODAL) ###################### +@callback( + Output('editable-table', 'columns'), + Input('dtype-modal-submit-button', 'n_clicks'), + State('column-type-selector', 'children'), + State('editable-table', 'columns'), + prevent_initial_call=True +) +def update_column_datatypes(_, modal_children, columns): + if not columns: + raise exceptions.PreventUpdate + + dropdown_values = UserPreferences.extract_dropdown_values(modal_children) + + # We are able to iterate over columns and dropdown_values simultaneously because they are both in the same order + for col, dtype in zip(columns, dropdown_values): + if dtype: + col['type'] = dtype + + return columns + +###################### ENFORCE FORMATTING (OPEN MODAL) ###################### +@callback( + Output("enforce-formatting-modal", "opened"), + Input("btn-enforce-format", "n_clicks"), + Input("formatting-modal-close-button", "n_clicks"), + Input("formatting-modal-submit-button", "n_clicks"), + State("enforce-formatting-modal", "opened"), + prevent_initial_call=True, +) +def enforce_dtypes_modal(nc1, nc2, nc3, opened): + return not opened + +###################### ENFORCE FORMATTING (FILL MODAL WITH COLUMNS) ###################### +@callback( + Output("column-format-selector", "children"), + Input("enforce-formatting-modal", "opened"), + State("editable-table", "columns"), + State('formatting-store', 'data'), + prevent_initial_call=True, +) +def 
populate_format_selection(opened, columns, formatting_options): + if not opened or not columns: + return dmc.Text("Upload a file to enforce formatting!", style={"color": "black", "fontWeight": "bold", "textAlign": "center"}) + + return UserPreferences.populate_format_selection(opened, columns, formatting_options) + +###################### ENFORCE FORMATTING (SUBMIT MODAL) ###################### +@callback( + Output('formatting-store', 'data'), + Input('formatting-modal-submit-button', 'n_clicks'), + State('column-format-selector', 'children'), + State('editable-table', 'columns'), + prevent_initial_call=True +) +def update_column_formatting(_, modal_children, columns): + if not columns: + raise exceptions.PreventUpdate + + format_values = UserPreferences.extract_input_values(modal_children) + + # Create a dictionary with column names as keys and formatting options as values + column_formats = { + col['name']: fmt_val for col, fmt_val in zip(columns, format_values) if fmt_val + } + + return json.dumps(column_formats) + + +# endregion + +# region dataCleaner ###################### REMOVE DUPLICATE ROWS ###################### @callback( Output('editable-table', 'data'), + Output('initial-table-data', 'data', allow_duplicate=True), Output('notifications-container', 'children', allow_duplicate=True), State('editable-table', 'data'), Input('btn-remove-duplicates', 'n_clicks'), prevent_initial_call=True ) def remove_duplicate_rows(data, n_clicks): - if data is None and n_clicks is None: + if data is None or n_clicks is None: raise exceptions.PreventUpdate df = pd.DataFrame.from_dict(data) - df.drop_duplicates(inplace=True) + # Exclude the 'ID' column when dropping duplicates + df.drop_duplicates(subset=[col for col in df.columns if col != 'ID'], inplace=True) # Count how many rows were removed rows_removed = len(data) - len(df) @@ -124,7 +258,7 @@ def remove_duplicate_rows(data, n_clicks): message="", icon=DashIconify(icon="akar-icons:circle-alert") ) - return no_update, 
notification + return no_update, no_update, notification else: notification = dmc.Notification( @@ -137,120 +271,79 @@ def remove_duplicate_rows(data, n_clicks): icon=DashIconify(icon="akar-icons:circle-check"), ) - return df.to_dict('records'), notification + return df.to_dict('records'), df.to_dict('records'), notification -###################### DOWNLOAD FILE ###################### + +###################### CHECK EMPTY/CORRUPT CELLS [CLEANING OPERATION] ###################### @callback( - Output("download-file", "data"), - Output("notifications-container", "children", allow_duplicate=True), - Input("btn-download", "n_clicks"), - State('editable-table', 'data'), + Output('editable-table', 'data', allow_duplicate=True), + Output('noncomplient-indices-3', 'data'), + Output('notifications-container', 'children', allow_duplicate=True), + Output('btn-confirm-changes-container', 'children', allow_duplicate=True), + [Input('btn-check-empty-corrupt-cells', 'n_clicks')], State('editable-table', 'columns'), - State('file-type-select', 'value'), - prevent_initial_call=True, + State('editable-table', 'data'), + prevent_initial_call=True ) -def download_file(_, data, columns, fileType): - if (data == None or columns == None): - print("Nothing to export") +def show_noncomplient_empty_data(n_clicks, columns, data): + if columns is None or data is None or n_clicks is None: raise exceptions.PreventUpdate + + return DataCleaner.show_noncomplient_empty_data(columns, data) - notification = dmc.Notification( - title="File Exported Successfuly!", - id="simple-notify", - color="green", - action="show", - autoClose=3000, - message='', - icon=DashIconify(icon="akar-icons:circle-alert"), - ) - - return HandleFile.exportFile(data, columns, fileType), notification - -# endregion - -# region datacleaner - -# @app.long_callback( -# Output("editable-table", "data"), -# Output("log-textbox", "children"), -# Input("clean-data-button", "n_clicks"), -# State("editable-table", "data"), -# 
State("editable-table", "columns"), -# State("auto-clean-checkbox", "checked"), -# running=[(Output("clean-data-button", "disabled"), True, False), -# (Output("cancel-button", "disabled"), False, True) -# ], -# cancel=[Input("cancel-button", "n_clicks")], -# manager=long_callback_manager, -# prevent_initial_call=True, -# ) -# def cleanData(_, data, columns, isAutoClean): -# # todo manual clean -# # todo get and use user preferences -# # todo clean up logging -# # reconsider what to report based on frontend needs -# userPreferences = {"*": "int"} -# if (isAutoClean): -# data, message, changedCells, emptyCells, needsAttention = DataCleaner.cleanDataAuto( -# data, columns, userPreferences) -# message = f"changed{changedCells}, empty{emptyCells}, needsAttention{needsAttention}" -# print(message) -# return data, message - -# print("Not implemented") -# raise exceptions.NonExistentEventException - -# endregion -###################### ENFORCE DATATYPES (OPEN MODAL) ###################### +###################### CLEAN EMPTY/CORRUPT CELLS [CLEANING OPERATION] highlighting ###################### @callback( - Output("enforce-dtypes-modal", "opened"), - Input("btn-enforce-dtypes", "n_clicks"), - Input("dtype-modal-close-button", "n_clicks"), - Input("dtype-modal-submit-button", "n_clicks"), - State("enforce-dtypes-modal", "opened"), - prevent_initial_call=True, + Output('editable-table', 'style_data_conditional', allow_duplicate=True), + [Input('noncomplient-indices-3', 'data')], + State('editable-table', 'columns'), + State('editable-table', 'data'), + prevent_initial_call=True ) -def enforce_dtypes_modal(nc1, nc2, nc3, opened): - return not opened +def style_noncompliant_empty_cells(cache, columns, data): + if not cache: + raise exceptions.PreventUpdate + return DataCleaner.style_noncompliant_empty_cells(columns, data) -###################### ENFORCE DATATYPES (FILL MODAL WITH COLUMNS) ###################### + + +###################### CHECK CELLS FORMATTING [CLEANING 
OPERATION] ###################### @callback( - Output("column-type-selector", "children"), - Input("enforce-dtypes-modal", "opened"), - State("editable-table", "columns"), - prevent_initial_call=True, + Output('editable-table', 'data', allow_duplicate=True), + Output('noncomplient-indices-2', 'data'), + Output('notifications-container', 'children', allow_duplicate=True), + Output('btn-confirm-changes-container', 'children', allow_duplicate=True), + [Input('btn-check-cells-formatting', 'n_clicks')], + State('formatting-store', 'data'), # State to hold formatting options + State('editable-table', 'columns'), + State('editable-table', 'data'), + prevent_initial_call=True ) -def populate_datatype_selection(opened, columns): - if not opened or not columns: - return dmc.Text("Upload a file to enforce datatypes!", style={"color": "black", "fontWeight": "bold", "textAlign": "center"}) - - return UserPreferences.populate_datatype_selection(opened, columns) +def show_noncompliant_format_data(n_clicks, formatting_store_data, columns, data): + if columns is None or data is None or n_clicks is None: + raise exceptions.PreventUpdate + + return DataCleaner.show_noncompliant_format_data(formatting_store_data, columns, data) -###################### ENFORCE DATATYPES (SUBMIT MODAL) ###################### +###################### CLEAN CELLS FORMATTING [CLEANING OPERATION] highlighting ###################### @callback( - Output('editable-table', 'columns'), - Input('dtype-modal-submit-button', 'n_clicks'), - State('column-type-selector', 'children'), + Output('editable-table', 'style_data_conditional', allow_duplicate=True), + [Input('noncomplient-indices-2', 'data')], State('editable-table', 'columns'), + State('editable-table', 'data'), + State('formatting-store', 'data'), # State to hold formatting options prevent_initial_call=True ) -def update_column_datatypes(_, modal_children, columns): - if not columns: +def style_noncompliant_format_cells(cache, columns, data, 
formatting_store_data): + if not cache: raise exceptions.PreventUpdate - dropdown_values = UserPreferences.extract_dropdown_values(modal_children) + return DataCleaner.style_noncompliant_format_cells(columns, data, formatting_store_data) - # We are able to iterate over columns and dropdown_values simultaneously because they are both in the same order - for col, dtype in zip(columns, dropdown_values): - if dtype: - col['type'] = dtype - - return columns ###################### CHECK CELLS DATATYPE [CLEANING OPERATION] ###################### @callback( @@ -263,89 +356,11 @@ def update_column_datatypes(_, modal_children, columns): State('editable-table', 'data'), prevent_initial_call=True ) -def show_noncomplient_data(n_clicks, columns, data): +def show_noncomplient_dtype_data(n_clicks, columns, data): if columns is None or data is None or n_clicks is None: raise exceptions.PreventUpdate - df = pd.DataFrame.from_dict(data) - non_compliant_rows = set() # To track rows with non-compliant data - - for col in columns: - # Ensure the column has the 'type' key - if 'type' not in col: - continue - - if col['type'] == 'text': - def is_convertible_to_numeric(val): - if val is None: - return False - try: - # Try to convert to float - float(val) - return True - except (TypeError, ValueError): - return False - - # mask = df[col['name']].apply(lambda x: not isinstance(x, str) or is_convertible_to_numeric(x)) - mask = df[col['name']].apply(lambda x: x is not None and (not isinstance(x, str) or is_convertible_to_numeric(x))) - - - - elif col['type'] == 'numeric': - def is_numeric(val): - if val is None: - return False - - # If val is already numeric (float or int) - if isinstance(val, (float, int)): - return True - - # If val is a string, attempt to convert to float after removing hyphens - if isinstance(val, str): - try: - float(val.replace('-', '')) - return True - except (TypeError, ValueError): - return False - return False - - # mask = df[col['name']].apply(lambda x: not 
is_numeric(x)) - mask = df[col['name']].apply(lambda x: x is not None and (not is_numeric(x))) - - - elif col['type'] == 'datetime': - # mask = df[col['name']].apply(lambda x: not isinstance(x, pd.Timestamp)) - mask = df[col['name']].apply(lambda x: x is not None and (not isinstance(x, pd.Timestamp))) - else: - continue - - # Find non-compliant indices and add them to the set - non_compliant_indices = mask[mask].index.tolist() - for idx in non_compliant_indices: - non_compliant_rows.add(idx) # Add row index to the set - - # Filter the dataframe to keep only rows with non-compliant data - df_filtered = df[df.index.isin(non_compliant_rows)] - # print(df_filtered) - - if df_filtered.empty: - print("No non-compliant data found") - - notification = dmc.Notification( - title="No non-complient data found!", - id="simple-notify", - color="yellow", - action="show", - message="", - autoClose=3000, - icon=DashIconify(icon="akar-icons:circle-alert") - ) - return no_update, no_update, notification, no_update - - confirm_button = dmc.Button("Confirm Changes", id="btn-confirm-changes", style={"backgroundColor": "#12B886"}), - - # return df_filtered.to_dict('records'), [] - return df_filtered.to_dict('records'), df_filtered.index.tolist(), [], confirm_button + return DataCleaner.show_noncomplient_dtype_data(columns, data) ###################### CLEAN CELLS DATATYPE [CLEANING OPERATION] highlighting ###################### @@ -356,68 +371,14 @@ def is_numeric(val): State('editable-table', 'data'), prevent_initial_call=True ) -def style_noncompliant_cells(cache, columns, data): +def style_noncompliant_dtype_cells(cache, columns, data): if not cache: raise exceptions.PreventUpdate - df = pd.DataFrame.from_dict(data) - style_data_conditional = [] - - for col in columns: - if 'type' not in col: - continue - - if col['type'] == 'text': - def is_convertible_to_numeric(val): - if val is None: - return False - try: - # Try to convert to float - float(val) - return True - except (TypeError, 
ValueError): - return False - - # mask = df[col['name']].apply(lambda x: not isinstance(x, str) or is_convertible_to_numeric(x)) - mask = df[col['name']].apply(lambda x: x is not None and (not isinstance(x, str) or is_convertible_to_numeric(x))) - color = '#fde047' # Adjusted color for non-string data in a text column - - elif col['type'] == 'numeric': - def is_numeric(val): - if val is None: - return False - - if isinstance(val, (float, int)): - return True - - if isinstance(val, str): - try: - float(val.replace('-', '')) - return True - except (TypeError, ValueError): - return False - return False - - # mask = df[col['name']].apply(lambda x: not is_numeric(x)) - mask = df[col['name']].apply(lambda x: x is not None and (not is_numeric(x))) - color = '#6ee7b7' # Adjusted color for non-numeric data in a numeric column - - elif col['type'] == 'datetime': - # mask = df[col['name']].apply(lambda x: not isinstance(x, pd.Timestamp)) - mask = df[col['name']].apply(lambda x: x is not None and (not isinstance(x, pd.Timestamp))) - color = '#c4b5fd' # Adjusted color for non-datetime data in a datetime column - else: - continue + return DataCleaner.style_noncompliant_dtype_cells(columns, data) - non_compliant_indices = mask[mask].index.tolist() - for idx in non_compliant_indices: - style_data_conditional.append({ - 'if': {'row_index': idx, 'column_id': col['name']}, - 'backgroundColor': color, - }) - - return style_data_conditional +# endregion # ###################### CLEAN CELLS DATATYPE [CONFIRM BUTTON] (persist changes) ###################### @callback( @@ -490,5 +451,38 @@ def reset_table(n_clicks, initial_data): return initial_data, [], [] +# @app.long_callback( +# Output("editable-table", "data"), +# Output("log-textbox", "children"), +# Input("clean-data-button", "n_clicks"), +# State("editable-table", "data"), +# State("editable-table", "columns"), +# State("auto-clean-checkbox", "checked"), +# running=[(Output("clean-data-button", "disabled"), True, False), +# 
(Output("cancel-button", "disabled"), False, True) +# ], +# cancel=[Input("cancel-button", "n_clicks")], +# manager=long_callback_manager, +# prevent_initial_call=True, +# ) +# def cleanData(_, data, columns, isAutoClean): +# # todo manual clean +# # todo get and use user preferences +# # todo clean up logging +# # reconsider what to report based on frontend needs +# userPreferences = {"*": "int"} +# if (isAutoClean): +# data, message, changedCells, emptyCells, needsAttention = DataCleaner.cleanDataAuto( +# data, columns, userPreferences) +# message = f"changed{changedCells}, empty{emptyCells}, needsAttention{needsAttention}" +# print(message) +# return data, message + +# print("Not implemented") +# raise exceptions.NonExistentEventException + + if __name__ == '__main__': app.run(debug=True) + + diff --git a/dashboard/assets/data/test.xlsx b/dashboard/assets/data/test.xlsx index e35cc14..4a863a3 100644 Binary files a/dashboard/assets/data/test.xlsx and b/dashboard/assets/data/test.xlsx differ diff --git a/dashboard/layout.py b/dashboard/layout.py index f711179..d0ec708 100644 --- a/dashboard/layout.py +++ b/dashboard/layout.py @@ -9,8 +9,10 @@ dcc.Store(id='initial-table-data'), dcc.Store(id='initial-table-columns'), dcc.Store(id='noncomplient-indices'), + dcc.Store(id='noncomplient-indices-2'), + dcc.Store(id='noncomplient-indices-3'), + dcc.Store(id='formatting-store'), - # Sidebar # Sidebar html.Div([ html.Div([ @@ -18,7 +20,7 @@ src="./assets/images/logo.jpeg", alt="USCS", width=40), dmc.Title(f"United States Cold Storage", order=5,), ], style={"display": "flex", "justifyContent": "center", "alignItems": "center", "gap": "1rem", "marginBottom": "1rem", "borderBottom": "1px solid #ccc", 'padding': "1rem"}), - + html.Div(id='store-output'), html.Div([ dmc.Menu([ dmc.MenuLabel("Data Analysis", style={"padding-left": "5px"}), @@ -44,7 +46,7 @@ label="Opens a modal for cell higlighting options", children=dmc.Button("Highlight Cells", id="btn-higlight-cells", 
variant="subtle", leftIcon=DashIconify(icon="bx:highlight"),), ), - dmc.Modal( # This is the modal that will open when the enforce datatypes button is clicked + dmc.Modal( # This is the modal that will open when the highlight cells button is clicked title="Choose options for cell highlighting", id="higlight-cells-modal", zIndex=10000, @@ -116,6 +118,28 @@ label="Distinguish cells that don't match their columns enforced formatting, set in user preferences", children=dmc.Button("Enforce Formatting", id="btn-enforce-format", variant="subtle", leftIcon=DashIconify(icon="streamline:interface-edit-write-2-change-document-edit-modify-paper-pencil-write-writing"),) ), + dmc.Modal( # This is the modal that will open when the enforce formatting button is clicked + title="Input a specified format for each column", + id="enforce-formatting-modal", + zIndex=10000, + size="70rem", + children=[ + html.Div(id='column-format-selector'), + dmc.Space(h=20), + dmc.Group( + [ + dmc.Button("Submit", id="formatting-modal-submit-button"), + dmc.Button( + "Close", + color="red", + variant="outline", + id="formatting-modal-close-button", + ), + ], + position="right", + ), + ], + ), dmc.Space(h=20), dmc.MenuLabel("Cleaning Operations", style={"padding-left": "5px"}), dmc.Tooltip( @@ -125,8 +149,8 @@ position="right", transition="fade", transitionDuration=300, - label="Distinguish and iterate over empty and corrupt cells", - children=dmc.Button("Check Empty/Corrupt Cells", id="btn-check-empty-corrupt-cells", variant="subtle", leftIcon=DashIconify(icon="iconoir:info-empty"),) + label="Removes duplicate rows from the imported data", + children=dmc.Button("Remove Duplicates", id="btn-remove-duplicates", variant="subtle", leftIcon=DashIconify(icon="bx:duplicate"),) ), dmc.Tooltip( withArrow=True, @@ -135,10 +159,9 @@ position="right", transition="fade", transitionDuration=300, - label="Removes duplicate rows from the imported data", - children=dmc.Button("Remove Duplicates", 
id="btn-remove-duplicates", variant="subtle", leftIcon=DashIconify(icon="bx:duplicate"),) + label="Distinguish and iterate over empty and corrupt cells", + children=dmc.Button("Check Empty/Corrupt Cells", id="btn-check-empty-corrupt-cells", variant="subtle", leftIcon=DashIconify(icon="iconoir:info-empty"),) ), - dmc.Tooltip( withArrow=True, width=200, @@ -159,17 +182,7 @@ transitionDuration=300, label="Distinguish cells that don't match their columns enforced formatting, set in user preferences", children=dmc.Button("Check Cells Formatting", id="btn-check-cells-formatting", variant="subtle", leftIcon=DashIconify(icon="mdi:checkbox-outline"),), - ), - dmc.Tooltip( - withArrow=True, - width=200, - multiline=True, - position="right", - transition="fade", - transitionDuration=300, - label="Check all cells for any issues", - children=dmc.Button("Clean All", id="btn-clean-all", variant="subtle", color="red", leftIcon=DashIconify(icon="material-symbols:cleaning-services-outline"),), - ), + ) ]), ], style={"fontSize": "26px"}), ], className="sidebar"), @@ -267,6 +280,21 @@ 'backgroundColor': '#c4b5fd', 'margin': '0.5rem', })), + dmc.Tooltip( + withArrow=True, + width=200, + multiline=True, + position="right", + transition="fade", + transitionDuration=300, + label="Cells with noncomplient formatting", + children=html.Div(style={ + 'display': 'inline-block', + 'width': '20px', + 'height': '20px', + 'backgroundColor': '#93c5fd', + 'margin': '0.5rem', + })), ], style={"display": "flex", "backgroundColor": "grey"}), diff --git a/dashboard/utils/dataAnalysis.py b/dashboard/utils/dataAnalysis.py index b15e658..3e471b4 100644 --- a/dashboard/utils/dataAnalysis.py +++ b/dashboard/utils/dataAnalysis.py @@ -8,19 +8,19 @@ def get_data_analysis(data): empty_corrupt_values = df.isna().sum() num_rows, num_columns = df.shape - num_duplicate_rows = df.duplicated().sum() + # Assuming 'ID' is the name of your unique identifier column + num_duplicate_rows = df.duplicated(subset=[col for col 
in df.columns if col != 'ID']).sum() if (empty_corrupt_values.sum() != 0): return [ html.Li([ "Data: ", - html.Span(f'{"{:,}".format(df.size)}', style={ + html.Span(f'{"{:,}".format(df.count().sum())}', style={ 'color': '#007BFF', 'fontWeight': 'bold', 'padding': '0 5px', 'borderRadius': '5px' }) - # i want ot get rid of the bulletd points ], style={"listStyleType": "none"}), html.Li([ "Rows: ", diff --git a/dashboard/utils/dataCleaner.py b/dashboard/utils/dataCleaner.py index 4693911..1f72e8f 100644 --- a/dashboard/utils/dataCleaner.py +++ b/dashboard/utils/dataCleaner.py @@ -1,5 +1,283 @@ import pandas as pd from dash.exceptions import PreventUpdate +from dash import no_update +import re +import json +import dash_mantine_components as dmc +from dash_iconify import DashIconify + +def show_noncomplient_empty_data(columns, data): + + df = pd.DataFrame.from_dict(data) + non_compliant_rows = set() # To track rows with non-compliant data + + for col in columns: + # Find non-compliant indices and add them to the set + non_compliant_indices = df[col['name']].apply(lambda x: pd.isna(x)) + non_compliant_rows.update(non_compliant_indices[non_compliant_indices].index.tolist()) + + # Filter the dataframe to keep only rows with non-compliant data + df_filtered = df.loc[list(non_compliant_rows)] + + if df_filtered.empty: + notification = dmc.Notification( + title="No empty/corrupt data found!", + id="simple-notify", + color="yellow", + action="show", + message="", + autoClose=3000, + icon=DashIconify(icon="akar-icons:circle-alert") + ) + return no_update, no_update, notification, no_update + + confirm_button = dmc.Button("Confirm Changes", id="btn-confirm-changes", style={"backgroundColor": "#12B886"}) + + return df_filtered.to_dict('records'), df_filtered.index.tolist(), [], confirm_button + + +def style_noncompliant_empty_cells(columns, data): + + df = pd.DataFrame.from_dict(data) + style_data_conditional = [] + + for col in columns: + # Find non-compliant indices + 
non_compliant_indices = df[col['name']].apply(pd.isna) + non_compliant_rows = non_compliant_indices[non_compliant_indices].index.tolist() + + for idx in non_compliant_rows: + style_data_conditional.append({ + 'if': {'row_index': idx, 'column_id': col['name']}, + 'backgroundColor': '#f87171', # Red background to highlight non-compliant cells + }) + + return style_data_conditional + + +def show_noncompliant_format_data(formatting_store_data, columns, data): + + if formatting_store_data is None: + notification = dmc.Notification( + title="No formatting options found!", + id="simple-notify", + color="yellow", + action="show", + message="", + autoClose=3000, + icon=DashIconify(icon="akar-icons:circle-alert") + ) + return no_update, no_update, notification, no_update + + # Load the stored formatting options + formatting_options = json.loads(formatting_store_data) + df = pd.DataFrame.from_dict(data) + non_compliant_rows = set() # To track rows with non-compliant data + + for col in columns: + col_name = col['name'] + # Ensure the column has a formatting pattern stored + if col_name in formatting_options: + pattern = formatting_options[col_name] + regex = re.compile(pattern) + + # Find non-compliant indices and add them to the set + non_compliant_indices = df[col_name].apply(lambda x: not regex.match(str(x)) if x else False) + non_compliant_rows.update(non_compliant_indices[non_compliant_indices].index.tolist()) + + # Filter the dataframe to keep only rows with non-compliant data + df_filtered = df.loc[list(non_compliant_rows)] + + if df_filtered.empty: + notification = dmc.Notification( + title="No format non-complient data found!", + id="simple-notify", + color="yellow", + action="show", + message="", + autoClose=3000, + icon=DashIconify(icon="akar-icons:circle-alert") + ) + return no_update, no_update, notification, no_update + + confirm_button = dmc.Button("Confirm Changes", id="btn-confirm-changes", style={"backgroundColor": "#12B886"}) + + return 
def _is_missing(value):
    """Return True for ``None`` and float ``NaN``.

    pandas replaces absent values with ``NaN`` when it builds a numeric
    column, so both spellings are treated as "no value entered".
    """
    # NaN is the only float that does not compare equal to itself.
    return value is None or (isinstance(value, float) and value != value)


def _parses_as_float(value):
    """Return True when ``float(value)`` succeeds."""
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True


def _violates_text(value):
    """A text cell is non-compliant if it is not a str or looks numeric."""
    return not isinstance(value, str) or _parses_as_float(value)


def _violates_numeric(value):
    """A numeric cell is non-compliant if it is not a number or numeric str."""
    if isinstance(value, (float, int)):
        return False
    if isinstance(value, str):
        # Hyphens are stripped before parsing, as the original check did.
        return not _parses_as_float(value.replace('-', ''))
    return True


def _violates_datetime(value):
    """A datetime cell is non-compliant unless it is a ``pd.Timestamp``."""
    # NOTE(review): values that arrive as plain strings are always flagged
    # by this rule - confirm callers convert datetime columns beforehand.
    return not isinstance(value, pd.Timestamp)


# Column ``type`` -> (violation predicate, highlight colour).
_DTYPE_RULES = {
    'text': (_violates_text, '#fde047'),
    'numeric': (_violates_numeric, '#6ee7b7'),
    'datetime': (_violates_datetime, '#c4b5fd'),
}


def _iter_dtype_violations(columns, df):
    """Yield ``(column_id, color, row_indices)`` for every typed column.

    Columns without a recognised ``type`` and columns that are absent from
    ``df`` (for example when the table holds no rows) are skipped.
    """
    for col in columns:
        rule = _DTYPE_RULES.get(col.get('type'))
        if rule is None:
            continue

        # Cell data and style rules are keyed by the column id; fall back to
        # the display name for column specs that carry no id.
        column_id = col.get('id', col['name'])
        if column_id not in df.columns:
            continue

        violates, color = rule
        mask = df[column_id].apply(
            lambda value: not _is_missing(value) and violates(value)
        ).astype(bool)
        yield column_id, color, mask[mask].index.tolist()


def style_noncompliant_format_cells(columns, data, formatting_store_data):
    """Build style rules for cells that do not match their column's regex.

    Args:
        columns: Column specs (dicts with ``name`` and optionally ``id``).
        data: Table rows, in any form accepted by ``pd.DataFrame.from_dict``.
        formatting_store_data: JSON string mapping column name to a regex
            pattern, or ``None`` when nothing has been stored.

    Returns:
        ``no_update`` when ``formatting_store_data`` is ``None``; otherwise a
        list of conditional-style dicts, one per non-compliant cell.
    """
    if formatting_store_data is None:
        return no_update

    stored = json.loads(formatting_store_data)
    formatting_options = stored if isinstance(stored, dict) else {}
    df = pd.DataFrame.from_dict(data)
    style_data_conditional = []

    for col in columns:
        col_name = col['name']
        column_id = col.get('id', col_name)
        pattern = formatting_options.get(col_name)

        # Nothing to check without a pattern or without the column's data.
        if not pattern or column_id not in df.columns:
            continue

        try:
            regex = re.compile(pattern)
        except (re.error, TypeError):
            # A half-typed or otherwise invalid pattern must not break the
            # whole table; that column is simply left unhighlighted.
            continue

        def breaks_format(value, regex=regex):
            # Blank cells are not format violations; 0 is a real value.
            if _is_missing(value) or (isinstance(value, str) and value == ''):
                return False
            return regex.match(str(value)) is None

        mask = df[column_id].apply(breaks_format).astype(bool)
        for idx in mask[mask].index.tolist():
            style_data_conditional.append({
                'if': {'row_index': idx, 'column_id': column_id},
                'backgroundColor': '#93c5fd',
            })

    return style_data_conditional


def show_noncomplient_dtype_data(columns, data):
    """Filter the table down to rows holding datatype-non-compliant cells.

    Args:
        columns: Column specs (dicts with ``name``, optional ``id``/``type``).
        data: Table rows, in any form accepted by ``pd.DataFrame.from_dict``.

    Returns:
        A 4-tuple. When offending rows exist: the filtered records, their
        original row indices, an empty list and a one-element tuple holding
        the "Confirm Changes" button. When none exist: ``no_update`` in every
        slot except the third, which carries a notification component.
    """
    df = pd.DataFrame.from_dict(data)
    non_compliant_rows = set()  # Row indices with at least one bad cell

    for _column_id, _color, row_indices in _iter_dtype_violations(columns, df):
        non_compliant_rows.update(row_indices)

    # Keep only rows with non-compliant data, preserving table order.
    df_filtered = df[df.index.isin(non_compliant_rows)]

    if df_filtered.empty:
        notification = dmc.Notification(
            title="No datatype non-complient data found!",
            id="simple-notify",
            color="yellow",
            action="show",
            message="",
            autoClose=3000,
            icon=DashIconify(icon="akar-icons:circle-alert")
        )
        return no_update, no_update, notification, no_update

    # Kept as a one-element tuple: that is the shape this function has
    # always returned for the button slot.
    confirm_button = (
        dmc.Button(
            "Confirm Changes",
            id="btn-confirm-changes",
            style={"backgroundColor": "#12B886"},
        ),
    )

    return df_filtered.to_dict('records'), df_filtered.index.tolist(), [], confirm_button


def style_noncompliant_dtype_cells(columns, data):
    """Build style rules highlighting cells that violate their column type.

    Args:
        columns: Column specs (dicts with ``name``, optional ``id``/``type``).
        data: Table rows, in any form accepted by ``pd.DataFrame.from_dict``.

    Returns:
        A list of conditional-style dicts, one per non-compliant cell, using
        a different background colour per column type.
    """
    df = pd.DataFrame.from_dict(data)
    style_data_conditional = []

    for column_id, color, row_indices in _iter_dtype_violations(columns, df):
        for idx in row_indices:
            style_data_conditional.append({
                'if': {'row_index': idx, 'column_id': column_id},
                'backgroundColor': color,
            })

    return style_data_conditional
def _definition_item(symbol, description):
    """Return one half-width grid cell pairing a regex token with its meaning."""
    row = html.Div(
        style={"display": "flex", "alignItems": "center", "justifyContent": "start", "gap": "10px"},
        children=[dmc.Kbd(symbol), html.Span(description)]
    )
    return dmc.Col(row, xs=12, md=6)


def _example_row(pattern, explanation, case):
    """Return one table row: pattern, explanation and a highlighted sample."""
    return html.Tr([
        html.Td(dmc.Kbd(pattern)),
        html.Td(explanation),
        html.Td(case)
    ])


# Regex tokens shown under "Common Definitions", in display order.
definition_items = [
    _definition_item(symbol, description)
    for symbol, description in (
        (".", "Matches any character except a newline"),
        ("^", "Matches the start of the string"),
        ("$", "Matches the end of the string or just before the newline"),
        ("*", "Matches 0 or more repetitions of the preceding RE"),
        ("+", "Matches 1 or more repetitions of the preceding RE"),
        ("?", "Matches 0 or 1 repetitions of the preceding RE"),
        ("{m,n}", "Matches from m to n repetitions of the preceding RE"),
        ("[...]", "Matches any character inside the square brackets"),
        ("[^...]", "Matches any character not inside the square brackets"),
        ("(?=...)", "Matches if ... matches next, but doesn’t consume any of the string (lookahead assertion)"),
        ("\\d", "Matches any Unicode decimal digit"),
        ("\\w", "Matches Unicode word characters"),
    )
]


# Column headings of the examples table.
header_row = html.Tr([html.Th(title) for title in ("Pattern", "Explanation", "Case")])

# Examples whose sample cell holds two highlighted strings.
_alternation_rows = [
    _example_row(
        "ab(cd|ef)",
        "Matches 'abcd' or 'abef'.",
        [dmc.Highlight("'abcdxyz', ", highlight="abcd"), dmc.Highlight("'abefxyz'", highlight="abef")],
    ),
    _example_row(
        "(a|b)c",
        "Matches 'ac' or 'bc'.",
        [dmc.Highlight("'acxyz', ", highlight="ac"), dmc.Highlight("'bcxyz'", highlight="bc")],
    ),
]

# Examples with a single sample: (pattern, explanation, sample text, highlighted part).
_single_sample_rows = [
    _example_row(pattern, explanation, dmc.Highlight(sample, highlight=marked))
    for pattern, explanation, sample, marked in (
        (r"\d{4}-\d{2}-\d{2}", "Matches dates in the format YYYY-MM-DD.",
         "'2021-01-01'", "2021-01-01"),
        (r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$", "Matches a simple email address.",
         "'user@example.com'", "user@example.com"),
        (r"https?://(?:www\.)?\w+\.\w+", "Matches HTTP and HTTPS URLs.",
         "'http://www.example.com'", "http://www.example.com"),
        (r"\(\d{3}\)\s\d{3}-\d{4}", "Matches US phone number with area code in brackets.",
         "'(123) 456-7890'", "(123) 456-7890"),
        (r"^\d{5}-\d{4}$", "Matches US ZIP code in 5-digit + 4 format.",
         "'12345-6789'", "12345-6789"),
        (r"#[a-fA-F0-9]{6}", "Matches hexadecimal color codes.",
         "'#1a2b3c'", "#1a2b3c"),
        (r"^[A-Z][a-z]+\s[A-Z][a-z]+$", "Matches people's names.",
         "'John Doe'", "John Doe"),
        (r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b",
         "Matches email addresses (case-insensitive) with subdomains.",
         "'first.last@example.co.uk'", "first.last@example.co.uk"),
        (r"^\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}$", "Matches credit card numbers.",
         "'1234 5678 9012 3456'", "1234 5678 9012 3456"),
        (r"\b\d{1,3}(,\d{3})*\b", "Matches numbers with commas for every thousand.",
         "'1,234,567'", "1,234,567"),
    )
]

# All example rows, in display order.
data_rows = _alternation_rows + _single_sample_rows

# The assembled examples table.
regex_table = dmc.Table(
    striped=True,
    highlightOnHover=True,
    children=[
        html.Thead(header_row),
        html.Tbody(data_rows)
    ]
)
def extract_input_values(children):
    """Collect the ``value`` of every TextInput nested directly in a Div.

    Args:
        children: Serialized component children (dicts with ``type`` and
            ``props``), as a list, a single dict, or ``None``.

    Returns:
        A list with one entry per TextInput found, in order. An input that
        carries no ``value`` prop contributes ``None`` so positions stay
        aligned with the inputs.
    """
    if isinstance(children, dict):
        # A lone child is serialized as a dict instead of a list.
        children = [children]

    input_values = []

    for child in children or []:
        if not (isinstance(child, dict) and child.get('type') == 'Div'):
            continue

        inner_children = child.get('props', {}).get('children') or []
        if isinstance(inner_children, dict):
            inner_children = [inner_children]
        elif isinstance(inner_children, str):
            # Plain-text content cannot hold an input.
            continue

        for inner_child in inner_children:
            if isinstance(inner_child, dict) and inner_child.get('type') == 'TextInput':
                input_values.append(inner_child.get('props', {}).get('value'))

    return input_values


def populate_format_selection(opened, columns, formatting_options):
    """Build the format-settings children: a regex guide plus one input per column.

    Args:
        opened: Not used by this function.
        columns: Column specs; each dict must provide ``name``.
        formatting_options: JSON string mapping column name to a stored
            pattern, or a falsy value when nothing is stored.

    Returns:
        A list of components: the instructional area, a spacer, then one
        labelled text input per column pre-filled with its stored pattern.
    """
    stored = json.loads(formatting_options) if formatting_options else None
    stored_formats = stored if isinstance(stored, dict) else {}

    children = [create_regex_instructional_area(), dmc.Space(h=20)]

    for col_details in columns or []:
        col_name = col_details['name']

        input_text = dmc.TextInput(
            id={'type': 'format-input', 'index': col_name},
            # Pre-fill with the stored pattern; None when there is none.
            value=stored_formats.get(col_name),
            placeholder="Enter format",
            style={'width': '20rem'}
        )

        children.append(
            html.Div(
                [html.Label(col_name), input_text],
                style={"display": "flex", "justifyContent": "space-between",
                       "alignItems": "center", "padding": "0.5rem", "borderBottom": "1px solid #000"}
            )
        )

    return children


def create_regex_instructional_area():
    """Return the alert holding the regex guide.

    The alert has a title row with a "Learn More" link to the Python ``re``
    documentation and two accordion sections: common definitions and
    common examples.
    """
    heading = html.Div(
        style={
            "display": "flex",
            "justifyContent": "space-around",
            "alignItems": "center",
            "marginBottom": "1rem"
        },
        children=[
            dmc.Title(order=4, children="Understanding Regular Expressions"),
            html.A(
                "Learn More",
                href="https://docs.python.org/3/library/re.html",
                target="_blank",
                style={
                    "textDecoration": "none",
                    "color": "inherit",
                    "padding": "10px 20px",
                    "border": "1px solid",
                    "borderRadius": "4px",
                }
            )
        ]
    )

    definitions_section = dmc.AccordionItem(
        [
            dmc.AccordionControl("Common Definitions"),
            dmc.AccordionPanel(
                dmc.Grid(
                    children=definition_items,
                    style={"margin": "0 auto"}
                ),
                style={"textAlign": "center"}
            )
        ],
        value="definitions",
    )

    examples_section = dmc.AccordionItem(
        [
            dmc.AccordionControl("Common Examples"),
            dmc.AccordionPanel(
                dmc.List(
                    children=regex_table
                )
            )
        ],
        value="examples",
    )

    return dmc.Alert(
        children=[
            heading,
            dmc.AccordionMultiple(children=[definitions_section, examples_section]),
        ],
        style={"maxWidth": "70rem", "margin": "0 auto"}
    )