From 332971efd3d912736a16cf105c3cb25840e1e583 Mon Sep 17 00:00:00 2001 From: Tony Hirst Date: Fri, 30 Aug 2024 16:56:23 +0100 Subject: [PATCH] Update reporting --- nb_quality_profile/nb_visualiser.py | 80 ++++++++++++---- nb_quality_profile/notebook_profiler.py | 120 +++++++++++++++++------- setup.py | 4 +- 3 files changed, 153 insertions(+), 51 deletions(-) diff --git a/nb_quality_profile/nb_visualiser.py b/nb_quality_profile/nb_visualiser.py index b893b55..b93cd81 100644 --- a/nb_quality_profile/nb_visualiser.py +++ b/nb_quality_profile/nb_visualiser.py @@ -14,6 +14,8 @@ # # Simple Notebook Visualiser # +# __USE `active_ipynb` tag for cells viewed in Jupytext notebook that are not meant for py file__ +# # Simple notebook visualiser for one or more Jupyter notebooks. # # Visualises markdown and code cells, with block size determined by code cell line count and estimated screen line count for markdown cells. @@ -27,6 +29,8 @@ import jupytext from .text_quality import md_readtime from pathlib import Path +from .notebook_profiler import process_notebook_file +from pandas import concat, DataFrame def nb_vis(cell_map, img_file='', linewidth = 5, w=20, gap=None, gap_boost=1, gap_colour='lightgrey', retval='', @@ -140,7 +144,7 @@ def _count_screen_lines(txt, width=LINE_WIDTH): n_screen_lines = len(_ll) return n_screen_lines - def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs): + def _nb_big_parse_nb(fn=None, text_formats=True, raw='', **kwargs): """Parse a notebook and generate the nb_vis cell map for it.""" cell_map = [] @@ -186,7 +190,9 @@ def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs): if 'rounded_minutes' in kwargs and kwargs['rounded_minutes']: if 'reading_time' in text_report: text_report['reading_time'] = math.ceil(text_report['reading_time']/60) - return { 'cell_map':cell_map, 'imports':list(set(imports)), 'text_report':text_report } + big_report = process_notebook_file(fn) + return { 'cell_map':cell_map, 
'imports':list(set(imports)), + 'text_report':text_report, "big_report":big_report } def _dir_walker(path='.', exclude = 'default', text_formats=True): """Profile all the notebooks in a specific directory, list of directories, or individual files.""" @@ -201,6 +207,8 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): nb_multidir_cell_map = {} nb_multidir_imports = {} nb_multidir_text_report = {} + nb_multidir_big_report = {} + very_big_report_df = DataFrame() # Ensure path is a list to handle single paths and lists uniformly if isinstance(path, str): @@ -232,15 +240,31 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): cell_map = reports['cell_map'] imports = reports['imports'] text_report = reports['text_report'] + big_report_df = reports["big_report"] + big_report=big_report_df.to_dict('records') + big_report_df["path"] = str(Path(fn).parent) + big_report_df["name"] = Path(fn).name if cell_map: nb_multidir_cell_map = {**nb_multidir_cell_map, fn: cell_map} if imports: nb_multidir_imports = {**nb_multidir_imports, fn: imports} if text_report: nb_multidir_text_report = {**nb_multidir_text_report, fn: text_report} - return {"cell_map": nb_multidir_cell_map, - "imports": nb_multidir_imports, - "text_report": nb_multidir_text_report} + if big_report: + nb_multidir_big_report = {**nb_multidir_big_report, fn: big_report} + if not big_report_df.empty: + very_big_report_df = concat( + [very_big_report_df, big_report_df], + ignore_index=True, + sort=False, + ) + return { + "cell_map": nb_multidir_cell_map, + "imports": nb_multidir_imports, + "text_report": nb_multidir_text_report, + "big_report": nb_multidir_big_report, + "big_report_df": very_big_report_df, + } # Also: we need to be able to switch on and off which reports are run # Need to think about handling this properly e.g. 
in context of plugins @@ -249,15 +273,21 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): cell_map = reports['cell_map'] imports = reports['imports'] text_report = reports['text_report'] + big_report = reports["big_report"] + big_report_df = reports["big_report_df"] else: reports = _nb_big_parse_nb(path, text_formats, raw=raw, **kwargs) cell_map = {path: reports['cell_map']} imports = {path: reports['imports']} text_report = {path: reports['text_report']} + big_report = reports["big_report"] + big_report_df = reports["big_report_df"] return {"cell_map": cell_map, "imports": imports, - "text_report": text_report} + "text_report": text_report, + "big_report": big_report, + "big_report_df": big_report_df} def nb_vis_parse_nb(path='.', img_file='', linewidth = 5, w=20, text_formats=True, retval='', raw='', **kwargs): @@ -288,13 +318,13 @@ def nb_imports_parse_nb(path='.', text_formats=True, for i in imports: packages = [p.split('.')[0] for p in imports[i]] all_packages = all_packages + packages - + if verbose: print(f"Imports in {i}: {', '.join(packages)}") - + # Scatterplot for p in imports[i]: - #x.append("\n".join(str(i).split("/"))) + # x.append("\n".join(str(i).split("/"))) # Limit length of filename displayed x.append(str(i).split("/")[-1].replace(".ipynb", "")[:40]) y.append(p) @@ -311,8 +341,8 @@ def nb_imports_parse_nb(path='.', text_formats=True, # stdlib packages std_lib = {p for p in all_packages if place_module(p) == "STDLIB"} - #Project names are defined by a project’s setup script, - #and they are used to identify projects on PyPI. + # Project names are defined by a project’s setup script, + # and they are used to identify projects on PyPI. 
third_party = {p for p in all_packages if place_module(p) == "THIRDPARTY"} third_party_packages_required = {pkg_resources.Requirement(p).project_name for p in all_packages if place_module(p) == "THIRDPARTY"} if verbose: @@ -324,32 +354,48 @@ def nb_imports_parse_nb(path='.', text_formats=True, if installed: import importlib - fails = [p for p in all_packages if not importlib.util.find_spec(p)] + fails = [p for p in all_packages if p and not importlib.util.find_spec(p)] # TO DO - what was the following supposed to check? # maybe dependencies? - #fails_required = {pkg_resources.Requirement(p).project_name for p in fails} + # fails_required = {pkg_resources.Requirement(p).project_name for p in fails} if verbose: if fails: print(f"The following packages cannot be imported: {', '.join(fails)}") - #print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}") + # print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}") else: print("All packages can be imported.") return (imports, all_packages, std_lib, third_party, fails) # For package details: - #import pkg_resources + # import pkg_resources # https://setuptools.pypa.io/en/latest/pkg_resources.html - #print([p.project_name for p in pkg_resources.working_set]) + # print([p.project_name for p in pkg_resources.working_set]) # We can also pull out things like package requirements, etc. 
# pkg_resources.working_set.require('pandas') # pkg_resources.Requirement('pandas').project_name +from .notebook_profiler import ( + reporter, + report_template_dir, + report_template_nb, + multi_level_reporter, +) def nb_text_parse_nb(path='.', text_formats=True, reading_rate=100, rounded_minutes=False, raw=''): """Parse markdown text in notebook(s).""" reports = nb_big_parse_nb(path, text_formats, reading_rate=reading_rate, rounded_minutes=rounded_minutes, raw=raw) - print(reports['text_report']) + print("\nTEXT REPORT\n",reports['text_report']) + print("\n\nIMPORTS REPORT\n",reports["imports"]) + #print("\n\nBIG REPORT\n", reports["big_report"], "\n\n") + + # print(reporter(reports["big_report_df"], report_template_full)) + print( + multi_level_reporter( + reports["big_report_df"], report_template_dir, report_template_nb + ) + ) + # print(reports) # + tags=["active-ipynb"] diff --git a/nb_quality_profile/notebook_profiler.py b/nb_quality_profile/notebook_profiler.py index f11f80d..0bbe207 100644 --- a/nb_quality_profile/notebook_profiler.py +++ b/nb_quality_profile/notebook_profiler.py @@ -38,6 +38,8 @@ # # There is a potential for making IPython magics for some of the reporting functions (for example, `radon` or `wily` reports) to provide live feedback / reporting during the creation of content in a notebook. # +# *Code has been extracted from this notebook into the `nb_quality_profile` package. If experiments are made updating code in this notebook, it will need mirroring in the actual package...* +# # ### Notebooks # # In the first instance, reports are generated for code cell inputs and markdown cells; code outputs and raw cells are not considered. Code appearing in markdown cells is identified as code-like but not analysed in terms of code complexity etc. 
@@ -60,7 +62,7 @@ # # + -#Last using numpy 1.x +# Last using numpy 1.x # #%pip install --upgrade numpy<2 spacy==3.7.5 pandas==2.2.2 scikit-learn==1.4.0 # - @@ -568,8 +570,8 @@ def process_notebook_md_doc(doc): # Running the `process_notebook_md_doc()` function on a `doc` object returns a single row dataframe containing summary statistics calculated over the full markdown content of the notebook. -#counts, readability = text_stats_summary(doc) -#extras = process_extras(doc) +# counts, readability = text_stats_summary(doc) +# extras = process_extras(doc) # + tags=["active-ipynb"] # process_notebook_md_doc(full_doc) @@ -736,7 +738,7 @@ def process_notebook_md(nb, fn=''): for i, cell in enumerate(nb.cells): if cell['cell_type']=='markdown': _metrics = process_notebook_md_doc( nlp( cell['source'] )) - _metrics['cell_count'] = i + _metrics['cell_index'] = i _metrics['cell_type'] = 'md' #cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) @@ -768,7 +770,7 @@ def process_notebook_file(fn): with open(fn,'r') as f: try: nb = nbformat.reads(f.read(), as_version=4) - cell_reports = process_notebook_md(nb, fn=fn) + cell_reports = process_notebook(nb, fn=fn) except: print(f'FAILED to process {fn}') cell_reports = pd.DataFrame() @@ -1114,19 +1116,19 @@ def process_notebook_code_text(txt): def process_notebook(nb, fn=''): """Process all the markdown and code cells in a notebook.""" cell_reports = pd.DataFrame() - + for i, cell in enumerate(nb.cells): if cell['cell_type']=='markdown': _metrics = process_notebook_md_doc( nlp( cell['source'] )) - _metrics['cell_count'] = i + _metrics['cell_index'] = i _metrics['cell_type'] = 'md' - #cell_reports = cell_reports.append(_metrics, sort=False) + # cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) elif cell['cell_type']=='code': _metrics = 
process_notebook_code_text(cell['source'] ) - _metrics['cell_count'] = i + _metrics["cell_index"] = i _metrics['cell_type'] = 'code' - #cell_reports = cell_reports.append(_metrics, sort=False) + # cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) cell_reports['filename'] = fn cell_reports.reset_index(drop=True, inplace=True) @@ -1193,7 +1195,7 @@ def process_notebook(nb, fn=''): # Let's make a start on a complete report template... -report_template_full = ''' +report_template_dir = ''' In directory `{path}` there were {nb_count} notebooks. - total markdown wordcount {n_words} words across {n_md_cells} markdown cells @@ -1204,34 +1206,90 @@ def process_notebook(nb, fn=''): ''' +report_template_nb = """ +Report for {name} + +- total markdown wordcount {n_words} words across {n_md_cells} markdown cells +- total code line count of {n_total_code_lines} lines of code across {n_code_cells} code cells + - {n_code_lines} code lines, {n_single_line_comment_code_lines} comment lines and {n_blank_code_lines} blank lines + +Estimated total reading time of {reading_time_mins} minutes. + +""" # Now let's add those extra requirements to the the feedstock generator: +def notebook_report_feedstock(ddf, grouper=None): + """Create a feedstock dict for report generation. 
Keyed by directory path and optionally by name.""" + if grouper is None: + grouper = ["path"] + + ddf_dict = ( + ddf.groupby(grouper)[ + [ + "n_words", + "reading_time_mins", + "reading_time_s", + "n_code_lines", + "n_single_line_comment_code_lines", + "n_total_code_lines", + "n_blank_code_lines", + ] + ] + .sum() + .to_dict(orient="index") + ) + + notebook_counts = ddf.groupby(grouper)["filename"].nunique().to_dict() + notebook_counts = {k: {"nb_count": notebook_counts[k]} for k in notebook_counts} + + report_dict = always_merger.merge(ddf_dict, notebook_counts) + + code_cell_counts = ddf[ddf["cell_type"] == "code"].groupby(grouper).size().to_dict() + md_cell_counts = ddf[ddf["cell_type"] == "md"].groupby(grouper).size().to_dict() -def notebook_report_feedstock(ddf): - """Create a feedstock dict for report generation. Keyed by directory path.""" - ddf_dict = ddf.groupby(['path'])[['n_words', 'reading_time_mins', 'reading_time_s', - 'n_code_lines', 'n_single_line_comment_code_lines', - 'n_total_code_lines','n_blank_code_lines']].sum().to_dict(orient='index') - - notebook_counts_by_dir = ddf.groupby(['path'])['filename'].nunique().to_dict() - notebook_counts_by_dir = {k:{'nb_count':notebook_counts_by_dir[k]} for k in notebook_counts_by_dir} - - report_dict = always_merger.merge(ddf_dict, notebook_counts_by_dir ) - - code_cell_counts = ddf[ddf['cell_type']=='code'].groupby(['path']).size().to_dict() - md_cell_counts = ddf[ddf['cell_type']=='md'].groupby(['path']).size().to_dict() - for k in report_dict: - report_dict[k]['path'] = k - report_dict[k]['n_code_cells'] = code_cell_counts[k] if k in code_cell_counts else 'NA' - report_dict[k]['n_md_cells'] = md_cell_counts[k] if k in md_cell_counts else 'NA' - + report_dict[k]["path"] = k[0] if isinstance(k, tuple) else k + if isinstance(k, tuple) and len(k) > 1: + report_dict[k]["name"] = k[1] + report_dict[k]["n_code_cells"] = ( + code_cell_counts[k] if k in code_cell_counts else "NA" + ) + report_dict[k]["n_md_cells"] = 
( + md_cell_counts[k] if k in md_cell_counts else "NA" + ) + return report_dict +from collections import defaultdict + +def multi_level_reporter(df, dir_template, item_template, path_filter=""): + """Generate a multi-level report with directory and item level information.""" + dir_feedstock = notebook_report_feedstock(df, grouper=["path"]) + item_feedstock = notebook_report_feedstock(df, grouper=["path", "name"]) + + report_txt = "" + item_reports = defaultdict(str) + + # Generate item-level reports + for item in item_feedstock: + if path_filter in item[0]: # item[0] is the path + item_report = item_template.format(**item_feedstock[item]) + item_reports[item[0]] += "\n" + item_report + + # Generate directory-level reports with nested item reports + for directory in dir_feedstock: + if path_filter in directory: + dir_report = dir_template.format(**dir_feedstock[directory]) + report_txt += "\n\n" + dir_report + if directory in item_reports: + report_txt += "\n" + item_reports[directory] + + return report_txt + # Create a wrapper function for generating the report text: -def reporter(df, template, path_filter=''): +def reporter(df, template, path_filter='', nb_report=True): feedstock = notebook_report_feedstock(df) report_txt='' for d in feedstock: @@ -1484,5 +1542,3 @@ def cell_attribs(cells, colour='cell_type', size='n_screen_lines'): # Also a count of empty cells? # # Is this moving towards some sort of notebook linter? 
- - diff --git a/setup.py b/setup.py index 12ff676..2670fbe 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="nb_quality_profile", packages=["nb_quality_profile"], - version="0.3.0", + version="0.3.1", author="Tony Hirst", author_email="tony.hirst@gmail.com", description="Tools for profiling Jupyter notebook quality and visualing notebook structure.", @@ -24,7 +24,7 @@ "pandas==2.2.1", "matplotlib==3.7.5","numpy<2", "spacy==3.7.5", - "scikit-learn==1.4.0", + "scikit-learn==1.3.2", "scipy==1.11.4", "readtime", "list-imports", "pytest-codeblocks",