
Commit

Update reporting
psychemedia committed Aug 30, 2024
1 parent b4662ba commit 332971e
Showing 3 changed files with 153 additions and 51 deletions.
80 changes: 63 additions & 17 deletions nb_quality_profile/nb_visualiser.py
@@ -14,6 +14,8 @@

# # Simple Notebook Visualiser
#
# __USE the `active-ipynb` tag for cells that should run in the Jupytext notebook view but are not meant for the py file__
#
# Simple notebook visualiser for one or more Jupyter notebooks.
#
# Visualises markdown and code cells, with block size determined by code cell line count and estimated screen line count for markdown cells.
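#
# A minimal usage sketch (illustrative only: the `notebooks/` path and
# `nb_profile.png` filename are placeholders, and the keyword values are the
# defaults from the `nb_vis_parse_nb()` signature further down):
#
#     from nb_quality_profile.nb_visualiser import nb_vis_parse_nb
#     # Visualise every notebook found under notebooks/ and save the plot
#     nb_vis_parse_nb("notebooks/", img_file="nb_profile.png", linewidth=5, w=20)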
@@ -27,6 +29,8 @@
import jupytext
from .text_quality import md_readtime
from pathlib import Path
from .notebook_profiler import process_notebook_file
from pandas import concat, DataFrame

def nb_vis(cell_map, img_file='', linewidth = 5, w=20, gap=None,
gap_boost=1, gap_colour='lightgrey', retval='',
@@ -140,7 +144,7 @@ def _count_screen_lines(txt, width=LINE_WIDTH):
n_screen_lines = len(_ll)
return n_screen_lines

def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs):
def _nb_big_parse_nb(fn=None, text_formats=True, raw='', **kwargs):
"""Parse a notebook and generate the nb_vis cell map for it."""

cell_map = []
@@ -186,7 +190,9 @@ def _nb_big_parse_nb(fn='.', text_formats=True, raw='', **kwargs):
if 'rounded_minutes' in kwargs and kwargs['rounded_minutes']:
if 'reading_time' in text_report:
text_report['reading_time'] = math.ceil(text_report['reading_time']/60)
return { 'cell_map':cell_map, 'imports':list(set(imports)), 'text_report':text_report }
big_report = process_notebook_file(fn)
return { 'cell_map':cell_map, 'imports':list(set(imports)),
'text_report':text_report, "big_report":big_report }

def _dir_walker(path='.', exclude = 'default', text_formats=True):
"""Profile all the notebooks in a specific directory, list of directories, or individual files."""
@@ -201,6 +207,8 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True):
nb_multidir_cell_map = {}
nb_multidir_imports = {}
nb_multidir_text_report = {}
nb_multidir_big_report = {}
very_big_report_df = DataFrame()

# Ensure path is a list to handle single paths and lists uniformly
if isinstance(path, str):
@@ -232,15 +240,31 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True):
cell_map = reports['cell_map']
imports = reports['imports']
text_report = reports['text_report']
big_report_df = reports["big_report"]
big_report=big_report_df.to_dict('records')
big_report_df["path"] = str(Path(fn).parent)
big_report_df["name"] = Path(fn).name
if cell_map:
nb_multidir_cell_map = {**nb_multidir_cell_map, fn: cell_map}
if imports:
nb_multidir_imports = {**nb_multidir_imports, fn: imports}
if text_report:
nb_multidir_text_report = {**nb_multidir_text_report, fn: text_report}
return {"cell_map": nb_multidir_cell_map,
"imports": nb_multidir_imports,
"text_report": nb_multidir_text_report}
if big_report:
nb_multidir_big_report = {**nb_multidir_big_report, fn: big_report}
if not big_report_df.empty:
very_big_report_df = concat(
[very_big_report_df, big_report_df],
ignore_index=True,
sort=False,
)
return {
"cell_map": nb_multidir_cell_map,
"imports": nb_multidir_imports,
"text_report": nb_multidir_text_report,
"big_report": nb_multidir_big_report,
"big_report_df": very_big_report_df,
}

# Also: we need to be able to switch on and off which reports are run
# Need to think about handling this properly e.g. in context of plugins
@@ -249,15 +273,21 @@ def _dir_walker(path='.', exclude = 'default', text_formats=True):
cell_map = reports['cell_map']
imports = reports['imports']
text_report = reports['text_report']
big_report = reports["big_report"]
big_report_df = reports["big_report_df"]
else:
reports = _nb_big_parse_nb(path, text_formats, raw=raw, **kwargs)

cell_map = {path: reports['cell_map']}
imports = {path: reports['imports']}
text_report = {path: reports['text_report']}
big_report = reports["big_report"]
big_report_df = reports["big_report_df"]
return {"cell_map": cell_map,
"imports": imports,
"text_report": text_report}
"text_report": text_report,
"big_report": big_report,
"big_report_df": big_report_df}


def nb_vis_parse_nb(path='.', img_file='', linewidth = 5, w=20, text_formats=True, retval='', raw='', **kwargs):
@@ -288,13 +318,13 @@ def nb_imports_parse_nb(path='.', text_formats=True,
for i in imports:
packages = [p.split('.')[0] for p in imports[i]]
all_packages = all_packages + packages

if verbose:
print(f"Imports in {i}: {', '.join(packages)}")

# Scatterplot
for p in imports[i]:
#x.append("\n".join(str(i).split("/")))
# x.append("\n".join(str(i).split("/")))
# Limit length of filename displayed
x.append(str(i).split("/")[-1].replace(".ipynb", "")[:40])
y.append(p)
@@ -311,8 +341,8 @@ def nb_imports_parse_nb(path='.', text_formats=True,

# stdlib packages
std_lib = {p for p in all_packages if place_module(p) == "STDLIB"}
#Project names are defined by a project’s setup script,
#and they are used to identify projects on PyPI.
# Project names are defined by a project’s setup script,
# and they are used to identify projects on PyPI.
third_party = {p for p in all_packages if place_module(p) == "THIRDPARTY"}
third_party_packages_required = {pkg_resources.Requirement(p).project_name for p in all_packages if place_module(p) == "THIRDPARTY"}
if verbose:
@@ -324,32 +354,48 @@ def nb_imports_parse_nb(path='.', text_formats=True,
if installed:
import importlib

fails = [p for p in all_packages if not importlib.util.find_spec(p)]
fails = [p for p in all_packages if p and not importlib.util.find_spec(p)]
# TO DO - what was the following supposed to check?
# maybe dependencies?
#fails_required = {pkg_resources.Requirement(p).project_name for p in fails}
# fails_required = {pkg_resources.Requirement(p).project_name for p in fails}
if verbose:
if fails:
print(f"The following packages cannot be imported: {', '.join(fails)}")
#print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}")
# print(f"Install the following packages to fix broken imports: {', '.join(fails_required)}")
else:
print("All packages can be imported.")

return (imports, all_packages, std_lib, third_party, fails)
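
# A sketch of unpacking the returned tuple (the `notebooks/` path is a
# placeholder, and the `installed`/`verbose` keywords are inferred from the
# flags used in the function body above):
#
#     imports, all_packages, std_lib, third_party, fails = nb_imports_parse_nb(
#         "notebooks/", installed=True, verbose=True
#     )
#     print(sorted(third_party))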

# For package details:
#import pkg_resources
# import pkg_resources
# https://setuptools.pypa.io/en/latest/pkg_resources.html
#print([p.project_name for p in pkg_resources.working_set])
# print([p.project_name for p in pkg_resources.working_set])
# We can also pull out things like package requirements, etc.
# pkg_resources.working_set.require('pandas')
# pkg_resources.Requirement('pandas').project_name

from .notebook_profiler import (
reporter,
report_template_dir,
report_template_nb,
multi_level_reporter,
)

def nb_text_parse_nb(path='.', text_formats=True, reading_rate=100, rounded_minutes=False, raw=''):
"""Parse markdown text in notebook(s)."""
reports = nb_big_parse_nb(path, text_formats, reading_rate=reading_rate, rounded_minutes=rounded_minutes, raw=raw)
print(reports['text_report'])
print("\nTEXT REPORT\n",reports['text_report'])
print("\n\nIMPORTS REPORT\n",reports["imports"])
#print("\n\BIG REPORT\n", reports["big_report"], "\n\n")

# print(reporter(reports["big_report_df"], report_template_full))
print(
multi_level_reporter(
reports["big_report_df"], report_template_dir, report_template_nb
)
)
# print(reports)


# + tags=["active-ipynb"]
120 changes: 88 additions & 32 deletions nb_quality_profile/notebook_profiler.py
@@ -38,6 +38,8 @@
#
# There is a potential for making IPython magics for some of the reporting functions (for example, `radon` or `wily` reports) to provide live feedback / reporting during the creation of content in a notebook.
#
# *Code has been extracted from this notebook into the `nb_quality_profile` package. If code in this notebook is updated experimentally, the changes will need mirroring in the actual package...*
#
# ### Notebooks
#
# In the first instance, reports are generated for code cell inputs and markdown cells; code outputs and raw cells are not considered. Code appearing in markdown cells is identified as code-like but not analysed in terms of code complexity etc.
@@ -60,7 +62,7 @@
#

# +
#Last using numpy 1.x
# Last using numpy 1.x
# #%pip install --upgrade numpy<2 spacy==3.7.5 pandas==2.2.2 scikit-learn==1.4.0
# -

@@ -568,8 +570,8 @@ def process_notebook_md_doc(doc):

# Running the `process_notebook_md_doc()` function on a `doc` object returns a single row dataframe containing summary statistics calculated over the full markdown content of the notebook.

#counts, readability = text_stats_summary(doc)
#extras = process_extras(doc)
# counts, readability = text_stats_summary(doc)
# extras = process_extras(doc)

# + tags=["active-ipynb"]
# process_notebook_md_doc(full_doc)
@@ -736,7 +738,7 @@ def process_notebook_md(nb, fn=''):
for i, cell in enumerate(nb.cells):
if cell['cell_type']=='markdown':
_metrics = process_notebook_md_doc( nlp( cell['source'] ))
_metrics['cell_count'] = i
_metrics['cell_index'] = i
_metrics['cell_type'] = 'md'
#cell_reports = cell_reports.append(_metrics, sort=False)
cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False)
@@ -768,7 +770,7 @@ def process_notebook_file(fn):
with open(fn,'r') as f:
try:
nb = nbformat.reads(f.read(), as_version=4)
cell_reports = process_notebook_md(nb, fn=fn)
cell_reports = process_notebook(nb, fn=fn)
except:
print(f'FAILED to process {fn}')
cell_reports = pd.DataFrame()
@@ -1114,19 +1116,19 @@ def process_notebook_code_text(txt):
def process_notebook(nb, fn=''):
"""Process all the markdown and code cells in a notebook."""
cell_reports = pd.DataFrame()

for i, cell in enumerate(nb.cells):
if cell['cell_type']=='markdown':
_metrics = process_notebook_md_doc( nlp( cell['source'] ))
_metrics['cell_count'] = i
_metrics['cell_index'] = i
_metrics['cell_type'] = 'md'
#cell_reports = cell_reports.append(_metrics, sort=False)
# cell_reports = cell_reports.append(_metrics, sort=False)
cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False)
elif cell['cell_type']=='code':
_metrics = process_notebook_code_text(cell['source'] )
_metrics['cell_count'] = i
_metrics["cell_index"] = i
_metrics['cell_type'] = 'code'
#cell_reports = cell_reports.append(_metrics, sort=False)
# cell_reports = cell_reports.append(_metrics, sort=False)
cell_reports = pd.concat([cell_reports, _metrics], ignore_index=True, sort=False)
cell_reports['filename'] = fn
cell_reports.reset_index(drop=True, inplace=True)
@@ -1193,7 +1195,7 @@ def process_notebook(nb, fn=''):

# Let's make a start on a complete report template...

report_template_full = '''
report_template_dir = '''
In directory `{path}` there were {nb_count} notebooks.
- total markdown wordcount {n_words} words across {n_md_cells} markdown cells
@@ -1204,34 +1206,90 @@ def process_notebook(nb, fn=''):
'''

report_template_nb = """
Report for {name}
- total markdown wordcount {n_words} words across {n_md_cells} markdown cells
- total code line count of {n_total_code_lines} lines of code across {n_code_cells} code cells
- {n_code_lines} code lines, {n_single_line_comment_code_lines} comment lines and {n_blank_code_lines} blank lines
Estimated total reading time of {reading_time_mins} minutes.
"""

# Now let's add those extra requirements to the feedstock generator:
def notebook_report_feedstock(ddf, grouper=None):
"""Create a feedstock dict for report generation. Keyed by directory path and optionally by name."""
if grouper is None:
grouper = ["path"]

ddf_dict = (
ddf.groupby(grouper)[
[
"n_words",
"reading_time_mins",
"reading_time_s",
"n_code_lines",
"n_single_line_comment_code_lines",
"n_total_code_lines",
"n_blank_code_lines",
]
]
.sum()
.to_dict(orient="index")
)

notebook_counts = ddf.groupby(grouper)["filename"].nunique().to_dict()
notebook_counts = {k: {"nb_count": notebook_counts[k]} for k in notebook_counts}

report_dict = always_merger.merge(ddf_dict, notebook_counts)

code_cell_counts = ddf[ddf["cell_type"] == "code"].groupby(grouper).size().to_dict()
md_cell_counts = ddf[ddf["cell_type"] == "md"].groupby(grouper).size().to_dict()

def notebook_report_feedstock(ddf):
"""Create a feedstock dict for report generation. Keyed by directory path."""
ddf_dict = ddf.groupby(['path'])[['n_words', 'reading_time_mins', 'reading_time_s',
'n_code_lines', 'n_single_line_comment_code_lines',
'n_total_code_lines','n_blank_code_lines']].sum().to_dict(orient='index')

notebook_counts_by_dir = ddf.groupby(['path'])['filename'].nunique().to_dict()
notebook_counts_by_dir = {k:{'nb_count':notebook_counts_by_dir[k]} for k in notebook_counts_by_dir}

report_dict = always_merger.merge(ddf_dict, notebook_counts_by_dir )

code_cell_counts = ddf[ddf['cell_type']=='code'].groupby(['path']).size().to_dict()
md_cell_counts = ddf[ddf['cell_type']=='md'].groupby(['path']).size().to_dict()

for k in report_dict:
report_dict[k]['path'] = k
report_dict[k]['n_code_cells'] = code_cell_counts[k] if k in code_cell_counts else 'NA'
report_dict[k]['n_md_cells'] = md_cell_counts[k] if k in md_cell_counts else 'NA'

report_dict[k]["path"] = k[0] if isinstance(k, tuple) else k
if isinstance(k, tuple) and len(k) > 1:
report_dict[k]["name"] = k[1]
report_dict[k]["n_code_cells"] = (
code_cell_counts[k] if k in code_cell_counts else "NA"
)
report_dict[k]["n_md_cells"] = (
md_cell_counts[k] if k in md_cell_counts else "NA"
)

return report_dict
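
# + tags=["active-ipynb"]
# # Sketch: feedstock keyed by (path, name), i.e. one record per notebook;
# # `ddf` is assumed to be the per-cell metrics dataframe (the `big_report_df`
# # assembled by the directory walker).
# notebook_report_feedstock(ddf, grouper=["path", "name"])
# -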

from collections import defaultdict

def multi_level_reporter(df, dir_template, item_template, path_filter=""):
"""Generate a multi-level report with directory and item level information."""
dir_feedstock = notebook_report_feedstock(df, grouper=["path"])
item_feedstock = notebook_report_feedstock(df, grouper=["path", "name"])

report_txt = ""
item_reports = defaultdict(str)

# Generate item-level reports
for item in item_feedstock:
if path_filter in item[0]: # item[0] is the path
item_report = item_template.format(**item_feedstock[item])
item_reports[item[0]] += "\n" + item_report

# Generate directory-level reports with nested item reports
for directory in dir_feedstock:
if path_filter in directory:
dir_report = dir_template.format(**dir_feedstock[directory])
report_txt += "\n\n" + dir_report
if directory in item_reports:
report_txt += "\n" + item_reports[directory]

return report_txt
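
# + tags=["active-ipynb"]
# # Sketch: directory level report with the per-notebook reports nested inside
# # (again assuming `ddf` is the per-cell metrics dataframe).
# print(multi_level_reporter(ddf, report_template_dir, report_template_nb))
# -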


# Create a wrapper function for generating the report text:

def reporter(df, template, path_filter=''):
def reporter(df, template, path_filter='', nb_report=True):
feedstock = notebook_report_feedstock(df)
report_txt=''
for d in feedstock:
@@ -1484,5 +1542,3 @@ def cell_attribs(cells, colour='cell_type', size='n_screen_lines'):
# Also a count of empty cells?
#
# Is this moving towards some sort of notebook linter?


