From 332971efd3d912736a16cf105c3cb25840e1e583 Mon Sep 17 00:00:00 2001 From: Tony Hirst Date: Fri, 30 Aug 2024 16:56:23 +0100 Subject: [PATCH] Update reporting --- nb_quality_profile/nb_visualiser.py | 80 ++++++++++++---- nb_quality_profile/notebook_profiler.py | 120 +++++++++++++++++------- setup.py | 4 +- 3 files changed, 153 insertions(+), 51 deletions(-) diff --git a/nb_quality_profile/nb_visualiser.py b/nb_quality_profile/nb_visualiser.py index b893b55..b93cd81 100644 --- a/nb_quality_profile/nb_visualiser.py +++ b/nb_quality_profile/nb_visualiser.py @@ -14,6 +14,8 @@ # # Simple Notebook Visualiser # +# __USE `active_ipynb` tag for cells viewed in Jupytext notebook that are not meant for py file__ +# # Simple notebook visualiser for one or more Jupyter notebooks. # # Visualises markdown and code cells, with block size determined by code cell line count and estimated screen line count for markdown cells. @@ -27,6 +29,8 @@ import jupytext from .text_quality import md_readtime from pathlib import Path +from .notebook_profiler import process_notebook_file +from pandas import concat, DataFrame def nb_vis(cell_map, img_file='', linewidth = 5, w=20, gap=None, gap_boost=1, gap_colour='lightgrey', retval='', @@ -140,7 +144,7 @@ def _count_screen_lines(txt, width=LINE_WIDTH): n_screen_lines = len(_ll) return n_screen_lines - def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs): + def _nb_big_parse_nb(fn=None, text_formats=True, raw='', **kwargs): """Parse a notebook and generate the nb_vis cell map for it.""" cell_map = [] @@ -186,7 +190,9 @@ def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs): if 'rounded_minutes' in kwargs and kwargs['rounded_minutes']: if 'reading_time' in text_report: text_report['reading_time'] = math.ceil(text_report['reading_time']/60) - return { 'cell_map':cell_map, 'imports':list(set(imports)), 'text_report':text_report } + big_report = process_notebook_file(fn) + return { 'cell_map':cell_map, 
'imports':list(set(imports)), + 'text_report':text_report, "big_report":big_report } def _dir_walker(path='.', exclude = 'default', text_formats=True): """Profile all the notebooks in a specific directory, list of directories, or individual files.""" @@ -201,6 +207,8 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): nb_multidir_cell_map = {} nb_multidir_imports = {} nb_multidir_text_report = {} + nb_multidir_big_report = {} + very_big_report_df = DataFrame() # Ensure path is a list to handle single paths and lists uniformly if isinstance(path, str): @@ -232,15 +240,31 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): cell_map = reports['cell_map'] imports = reports['imports'] text_report = reports['text_report'] + big_report_df = reports["big_report"] + big_report=big_report_df.to_dict('records') + big_report_df["path"] = str(Path(fn).parent) + big_report_df["name"] = Path(fn).name if cell_map: nb_multidir_cell_map = {**nb_multidir_cell_map, fn: cell_map} if imports: nb_multidir_imports = {**nb_multidir_imports, fn: imports} if text_report: nb_multidir_text_report = {**nb_multidir_text_report, fn: text_report} - return {"cell_map": nb_multidir_cell_map, - "imports": nb_multidir_imports, - "text_report": nb_multidir_text_report} + if big_report: + nb_multidir_big_report = {**nb_multidir_big_report, fn: big_report} + if not big_report_df.empty: + very_big_report_df = concat( + [very_big_report_df, big_report_df], + ignore_index=True, + sort=False, + ) + return { + "cell_map": nb_multidir_cell_map, + "imports": nb_multidir_imports, + "text_report": nb_multidir_text_report, + "big_report": nb_multidir_big_report, + "big_report_df": very_big_report_df, + } # Also: we need to be able to switch on and off which reports are run # Need to think about handling this properly e.g. 
in context of plugins @@ -249,15 +273,21 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True): cell_map = reports['cell_map'] imports = reports['imports'] text_report = reports['text_report'] + big_report = reports["big_report"] + big_report_df = reports["big_report_df"] else: reports = _nb_big_parse_nb(path, text_formats, raw=raw, **kwargs) cell_map = {path: reports['cell_map']} imports = {path: reports['imports']} text_report = {path: reports['text_report']} + big_report = reports["big_report"] + big_report_df = reports["big_report_df"] return {"cell_map": cell_map, "imports": imports, - "text_report": text_report} + "text_report": text_report, + "big_report": big_report, + "big_report_df": big_report_df} def nb_vis_parse_nb(path='.', img_file='', linewidth = 5, w=20, text_formats=True, retval='', raw='', **kwargs): @@ -288,13 +318,13 @@ def nb_imports_parse_nb(path='.', text_formats=True, for i in imports: packages = [p.split('.')[0] for p in imports[i]] all_packages = all_packages + packages - + if verbose: print(f"Imports in {i}: {', '.join(packages)}") - + # Scatterplot for p in imports[i]: - #x.append("\n".join(str(i).split("/"))) + # x.append("\n".join(str(i).split("/"))) # Limit length of filename displayed x.append(str(i).split("/")[-1].replace(".ipynb", "")[:40]) y.append(p) @@ -311,8 +341,8 @@ def nb_imports_parse_nb(path='.', text_formats=True, # stdlib packages std_lib = {p for p in all_packages if place_module(p) == "STDLIB"} - #Project names are defined by a project’s setup script, - #and they are used to identify projects on PyPI. + # Project names are defined by a project’s setup script, + # and they are used to identify projects on PyPI. 
third_party = {p for p in all_packages if place_module(p) == "THIRDPARTY"} third_party_packages_required = {pkg_resources.Requirement(p).project_name for p in all_packages if place_module(p) == "THIRDPARTY"} if verbose: @@ -324,32 +354,48 @@ def nb_imports_parse_nb(path='.', text_formats=True, if installed: import importlib - fails = [p for p in all_packages if not importlib.util.find_spec(p)] + fails = [p for p in all_packages if p and not importlib.util.find_spec(p)] # TO DO - what was the following supposed to check? # maybe dependencies? - #fails_required = {pkg_resources.Requirement(p).project_name for p in fails} + # fails_required = {pkg_resources.Requirement(p).project_name for p in fails} if verbose: if fails: print(f"The following packages cannot be imported: {', '.join(fails)}") - #print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}") + # print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}") else: print("All packages can be imported.") return (imports, all_packages, std_lib, third_party, fails) # For package details: - #import pkg_resources + # import pkg_resources # https://setuptools.pypa.io/en/latest/pkg_resources.html - #print([p.project_name for p in pkg_resources.working_set]) + # print([p.project_name for p in pkg_resources.working_set]) # We can also pull out things like package requirements, etc. 
# pkg_resources.working_set.require('pandas') # pkg_resources.Requirement('pandas').project_name +from .notebook_profiler import ( + reporter, + report_template_dir, + report_template_nb, + multi_level_reporter, +) def nb_text_parse_nb(path='.', text_formats=True, reading_rate=100, rounded_minutes=False, raw=''): """Parse markdown text in notebook(s).""" reports = nb_big_parse_nb(path, text_formats, reading_rate=reading_rate, rounded_minutes=rounded_minutes, raw=raw) - print(reports['text_report']) + print("\nTEXT REPORT\n",reports['text_report']) + print("\n\nIMPORTS REPORT\n",reports["imports"]) + #print("\n\nBIG REPORT\n", reports["big_report"], "\n\n") + + # print(reporter(reports["big_report_df"], report_template_full)) + print( + multi_level_reporter( + reports["big_report_df"], report_template_dir, report_template_nb + ) + ) + # print(reports) # + tags=["active-ipynb"] diff --git a/nb_quality_profile/notebook_profiler.py b/nb_quality_profile/notebook_profiler.py index f11f80d..0bbe207 100644 --- a/nb_quality_profile/notebook_profiler.py +++ b/nb_quality_profile/notebook_profiler.py @@ -38,6 +38,8 @@ # # There is a potential for making IPython magics for some of the reporting functions (for example, `radon` or `wily` reports) to provide live feedback / reporting during the creation of content in a notebook. # +# *Code has been extracted from this notebook into the `nb_quality_profile` package. If experiments are made updating code in this notebook, it will need mirroring in the actual package...* +# # ### Notebooks # # In the first instance, reports are generated for code cell inputs and markdown cells; code outputs and raw cells are not considered. Code appearing in markdown cells is identified as code-like but not analysed in terms of code complexity etc. 
@@ -60,7 +62,7 @@ # # + -#Last using numpy 1.x +# Last using numpy 1.x # #%pip install --upgrade numpy<2 spacy==3.7.5 pandas==2.2.2 scikit-learn==1.4.0 # - @@ -568,8 +570,8 @@ def process_notebook_md_doc(doc): # Running the `process_notebook_md_doc()` function on a `doc` object returns a single row dataframe containing summary statistics calculated over the full markdown content of the notebook. -#counts, readability = text_stats_summary(doc) -#extras = process_extras(doc) +# counts, readability = text_stats_summary(doc) +# extras = process_extras(doc) # + tags=["active-ipynb"] # process_notebook_md_doc(full_doc) @@ -736,7 +738,7 @@ def process_notebook_md(nb, fn=''): for i, cell in enumerate(nb.cells): if cell['cell_type']=='markdown': _metrics = process_notebook_md_doc( nlp( cell['source'] )) - _metrics['cell_count'] = i + _metrics['cell_index'] = i _metrics['cell_type'] = 'md' #cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) @@ -768,7 +770,7 @@ def process_notebook_file(fn): with open(fn,'r') as f: try: nb = nbformat.reads(f.read(), as_version=4) - cell_reports = process_notebook_md(nb, fn=fn) + cell_reports = process_notebook(nb, fn=fn) except: print(f'FAILED to process {fn}') cell_reports = pd.DataFrame() @@ -1114,19 +1116,19 @@ def process_notebook_code_text(txt): def process_notebook(nb, fn=''): """Process all the markdown and code cells in a notebook.""" cell_reports = pd.DataFrame() - + for i, cell in enumerate(nb.cells): if cell['cell_type']=='markdown': _metrics = process_notebook_md_doc( nlp( cell['source'] )) - _metrics['cell_count'] = i + _metrics['cell_index'] = i _metrics['cell_type'] = 'md' - #cell_reports = cell_reports.append(_metrics, sort=False) + # cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) elif cell['cell_type']=='code': _metrics = 
process_notebook_code_text(cell['source'] ) - _metrics['cell_count'] = i + _metrics["cell_index"] = i _metrics['cell_type'] = 'code' - #cell_reports = cell_reports.append(_metrics, sort=False) + # cell_reports = cell_reports.append(_metrics, sort=False) cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False) cell_reports['filename'] = fn cell_reports.reset_index(drop=True, inplace=True) @@ -1193,7 +1195,7 @@ def process_notebook(nb, fn=''): # Let's make a start on a complete report template... -report_template_full = ''' +report_template_dir = ''' In directory `{path}` there were {nb_count} notebooks. - total markdown wordcount {n_words} words across {n_md_cells} markdown cells @@ -1204,34 +1206,90 @@ def process_notebook(nb, fn=''): ''' +report_template_nb = """ +Report for {name} + +- total markdown wordcount {n_words} words across {n_md_cells} markdown cells +- total code line count of {n_total_code_lines} lines of code across {n_code_cells} code cells + - {n_code_lines} code lines, {n_single_line_comment_code_lines} comment lines and {n_blank_code_lines} blank lines + +Estimated total reading time of {reading_time_mins} minutes. + +""" # Now let's add those extra requirements to the the feedstock generator: +def notebook_report_feedstock(ddf, grouper=None): + """Create a feedstock dict for report generation. 
Keyed by directory path and optionally by name.""" + if grouper is None: + grouper = ["path"] + + ddf_dict = ( + ddf.groupby(grouper)[ + [ + "n_words", + "reading_time_mins", + "reading_time_s", + "n_code_lines", + "n_single_line_comment_code_lines", + "n_total_code_lines", + "n_blank_code_lines", + ] + ] + .sum() + .to_dict(orient="index") + ) + + notebook_counts = ddf.groupby(grouper)["filename"].nunique().to_dict() + notebook_counts = {k: {"nb_count": notebook_counts[k]} for k in notebook_counts} + + report_dict = always_merger.merge(ddf_dict, notebook_counts) + + code_cell_counts = ddf[ddf["cell_type"] == "code"].groupby(grouper).size().to_dict() + md_cell_counts = ddf[ddf["cell_type"] == "md"].groupby(grouper).size().to_dict() -def notebook_report_feedstock(ddf): - """Create a feedstock dict for report generation. Keyed by directory path.""" - ddf_dict = ddf.groupby(['path'])[['n_words', 'reading_time_mins', 'reading_time_s', - 'n_code_lines', 'n_single_line_comment_code_lines', - 'n_total_code_lines','n_blank_code_lines']].sum().to_dict(orient='index') - - notebook_counts_by_dir = ddf.groupby(['path'])['filename'].nunique().to_dict() - notebook_counts_by_dir = {k:{'nb_count':notebook_counts_by_dir[k]} for k in notebook_counts_by_dir} - - report_dict = always_merger.merge(ddf_dict, notebook_counts_by_dir ) - - code_cell_counts = ddf[ddf['cell_type']=='code'].groupby(['path']).size().to_dict() - md_cell_counts = ddf[ddf['cell_type']=='md'].groupby(['path']).size().to_dict() - for k in report_dict: - report_dict[k]['path'] = k - report_dict[k]['n_code_cells'] = code_cell_counts[k] if k in code_cell_counts else 'NA' - report_dict[k]['n_md_cells'] = md_cell_counts[k] if k in md_cell_counts else 'NA' - + report_dict[k]["path"] = k[0] if isinstance(k, tuple) else k + if isinstance(k, tuple) and len(k) > 1: + report_dict[k]["name"] = k[1] + report_dict[k]["n_code_cells"] = ( + code_cell_counts[k] if k in code_cell_counts else "NA" + ) + report_dict[k]["n_md_cells"] = 
( + md_cell_counts[k] if k in md_cell_counts else "NA" + ) + return report_dict +from collections import defaultdict + +def multi_level_reporter(df, dir_template, item_template, path_filter=""): + """Generate a multi-level report with directory and item level information.""" + dir_feedstock = notebook_report_feedstock(df, grouper=["path"]) + item_feedstock = notebook_report_feedstock(df, grouper=["path", "name"]) + + report_txt = "" + item_reports = defaultdict(str) + + # Generate item-level reports + for item in item_feedstock: + if path_filter in item[0]: # item[0] is the path + item_report = item_template.format(**item_feedstock[item]) + item_reports[item[0]] += "\n" + item_report + + # Generate directory-level reports with nested item reports + for directory in dir_feedstock: + if path_filter in directory: + dir_report = dir_template.format(**dir_feedstock[directory]) + report_txt += "\n\n" + dir_report + if directory in item_reports: + report_txt += "\n" + item_reports[directory] + + return report_txt + # Create a wrapper function for generating the report text: -def reporter(df, template, path_filter=''): +def reporter(df, template, path_filter='', nb_report=True): feedstock = notebook_report_feedstock(df) report_txt='' for d in feedstock: @@ -1484,5 +1542,3 @@ def cell_attribs(cells, colour='cell_type', size='n_screen_lines'): # Also a count of empty cells? # # Is this moving towards some sort of notebook linter? 
- - diff --git a/setup.py b/setup.py index 12ff676..2670fbe 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="nb_quality_profile", packages=["nb_quality_profile"], - version="0.3.0", + version="0.3.1", author="Tony Hirst", author_email="tony.hirst@gmail.com", description="Tools for profiling Jupyter notebook quality and visualing notebook structure.", @@ -24,7 +24,7 @@ "pandas==2.2.1", "matplotlib==3.7.5","numpy<2", "spacy==3.7.5", - "scikit-learn==1.4.0", + "scikit-learn==1.3.2", "scipy==1.11.4", "readtime", "list-imports", "pytest-codeblocks",