# stack_usage.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

#  Copyright (C) Simon Wright <simon@pushface.org>

#  This package is free software; you can redistribute it and/or
#  modify it under terms of the BSD 3-Clause License.

# Reads .ci files generated by GCC in response to the switch
# -fcallgraph-info=su,da.

# Reports on the total stack usage by each subprogram and the
# subprograms it calls, recursively.

# Uses PLY (http://www.dabeaz.com/ply/).

import csv
import getopt
import os
import pickle
import ply.lex as lex
import ply.yacc as yacc
import re
import sys
import time

# ----------------------------------------------------------------------
# Object model
# ----------------------------------------------------------------------

class Graph:
    """Contains the interesting content of one .ci file."""
    def __init__(self):
        # title is the unit filename
        self.title = ''
        # internal_nodes contains information on the subprograms in
        # this unit
        self.internal_nodes = []
        # external_nodes contains information on subprograms called
        # from this unit but external to it
        self.external_nodes = []
        # edges contains information on the calls made from this unit
        self.edges = []
    def __str__(self):
        """Returns the title line followed by one line for each
        internal node, external node and edge, in that order."""
        lines = ["graph title: %s" % self.title]
        for group in (self.internal_nodes, self.external_nodes, self.edges):
            lines.extend(str(item) for item in group)
        return "\n".join(lines)

class Node(object):
    """Effectively abstract: the part common to an InternalNode and an
    ExternalNode.

    The parser fills in these attributes:
    @title is the subprogram's symbol, possibly annotated with the
    file name.
    @symbol is the subprogram's symbol (the part of the title after
    the last colon).
    @source is the second field of the node's label (presumably the
    source location of the subprogram).
    """
    def __init__(self):
        # all empty until the parser has processed the node
        self.title = self.symbol = self.source = ''

class InternalNode(Node):
    """A subprogram declared in this unit.

    @static_stack is the amount of stack used in this subprogram
    itself (not including anything it calls).
    @dynamic_objects is the count read from the last field of the
    node's label (presumably the number of dynamically-sized objects).
    """
    def __init__(self):
        super(InternalNode, self).__init__()
        # both are overwritten by the parser if the label is recognised
        self.static_stack = 0
        self.dynamic_objects = 0
    def __str__(self):
        """Returns a one-line description, for diagnostics."""
        fields = (self.title, self.symbol, self.source,
                  self.static_stack, self.dynamic_objects)
        return "intl: title: %s symbol: %s src: %s stack: %d dobjs: %d" \
            % fields

class ExternalNode(Node):
    """A subprogram called from this unit but not defined here.

    Adds nothing to Node beyond its own string form, so Node's
    constructor is inherited as it stands.
    """
    def __str__(self):
        """Returns a one-line description, for diagnostics."""
        fields = (self.title, self.symbol, self.source)
        return "extl: title: %s symbol: %s source: %s" % fields

class Edge:
    """Represents a call from a subprogram in this compilation unit. The
    target may be local (will be an InternalNode) or not (an ExternalNode).
    @source is the symbol of the calling subprogram.
    @target is the symbol of the called subprogram.
    @label is the text of the edge's optional 'label' entry, or None
    if it had none.
    """
    def __init__(self):
        self.source = ''
        self.target = ''
        self.label = None
    def __str__(self):
        """Returns a one-line description, for diagnostics."""
        suffix = ", label: %s" % self.label if self.label else ", NO LABEL"
        return "edge: from: %s to: %s" % (self.source, self.target) + suffix

class Graphs:
    """Accumulates the content of any number of .ci files and
    calculates the total stack depth needed by each subprogram found
    in them."""
    def __init__(self):
        # the Graph read from each .ci file, in the order added
        self.graphs = []
        # sources is a map {symbol: [internal_node, stack_used]};
        # stack_used is None until it has been calculated
        self.sources = {}
        # edges is a map {caller symbol: (callee symbol, ...)}
        self.edges = {}
        # missing contains the symbols that were called but not found
        # (probably would be in the external_nodes); used so missing
        # symbols only reported once
        self.missing = []
    def add_ci_file(self, file):
        """Parse the .ci file named @file and merge its subprograms and
        calls into sources and edges.
        Uses the module-level lexer, parser and verbosity set up by
        main(). Exits (via error()) if the file can't be read.
        """
        try:
            with open(file, 'r') as ci:
                text = ci.read()
        except (IOError, OSError):
            error("couldn't open %s for input.\n" % file)
        # lexer.input() only resets the scan position; without this,
        # line numbers (and an unterminated string) would carry over
        # from the previous file
        lexer.lineno = 1
        lexer.begin('INITIAL')
        lexer.input(text)
        g = parser.parse(lexer=lexer, debug=verbosity)
        if g is None:
            # the parser ran out of input before completing a graph
            warning("no graph found in %s" % file)
            return
        self.graphs.append(g)
        for node in g.internal_nodes:
            if node.symbol in self.sources:
                warning("duplicate subprogram '%s'" % node.symbol)
            else:
                self.sources[node.symbol] = [node, None]
        for edge in g.edges:
            # kept as tuples, as in previously saved data
            targets = self.edges.get(edge.source, ())
            if edge.target not in targets:
                self.edges[edge.source] = tuple(targets) + (edge.target,)
    def _resolve(self, name):
        """Calculate the total stack depth required for @name.
        If sources[name][1] is None, this means we have to
        calculate it as sources[name][0].static_stack + the max of
        all the nodes called (recursively).
        If not None, we've already done this calculation (it contains
        the cached result).
        A callee that isn't in sources is reported (once) and counts
        as 0.
        The depth of recursion can't be known here, so subprograms
        that call each other (directly or indirectly) are treated as
        a group: each gets the sum of the group's static stacks, i.e.
        one activation of each, plus the max of the nodes called
        outside the group.
        """
        if name not in self.sources:
            if name not in self.missing:
                self.missing.append(name)
                warning("callee '%s' not found" % name)
            return 0
        if self.sources[name][1] is None:
            self._resolve_from(name)
        return self.sources[name][1]
    def _resolve_from(self, start):
        """Fill in the cached depth of @start and of every unresolved
        subprogram reachable from it, finding the groups of mutually
        recursive subprograms using Tarjan's algorithm."""
        order = {}    # the order in which names were first visited
        lowest = {}   # earliest-visited name reachable from each name
        pending = []  # names visited but not yet assigned to a group

        def visit(name):
            order[name] = lowest[name] = len(order)
            pending.append(name)
            for callee in self.edges.get(name, ()):
                if callee not in self.sources \
                   or self.sources[callee][1] is not None:
                    # missing, or resolved already: can't be part of
                    # an unresolved cycle
                    continue
                if callee not in order:
                    visit(callee)
                    lowest[name] = min(lowest[name], lowest[callee])
                else:
                    # visited but unresolved, so still pending: this
                    # call closes a cycle
                    lowest[name] = min(lowest[name], order[callee])
            if lowest[name] != order[name]:
                # part of a group whose first member is further up
                return
            # name and everything pending after it form one group
            first = pending.index(name)
            members = pending[first:]
            del pending[first:]
            own = sum(self.sources[m][0].static_stack for m in members)
            deepest = 0
            for m in members:
                for callee in self.edges.get(m, ()):
                    if callee not in members:
                        # already resolved (or missing) by now
                        deepest = max(deepest, self._resolve(callee))
            for m in members:
                self.sources[m][1] = own + deepest

        visit(start)
    def usage(self):
        """Returns the results, sorted by caller, as a list of 2-element
        tuples: [(<name with __ replaced by .>, <depth>)]
        """
        return sorted([(n.replace('__', '.'), self._resolve(n))
                       for n in self.sources],
                      key=lambda el: el[0])

# ----------------------------------------------------------------------
# Parser
# ----------------------------------------------------------------------

def p_start(p):
    '''
    start \
        : GRAPH COLON OPEN_BRACE title graph_contents CLOSE_BRACE
        | GRAPH COLON OPEN_BRACE title CLOSE_BRACE
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The result is a Graph holding the title and the nodes and
    # edges found in the contents.
    graph = Graph()
    graph.title = p[4]
    # Only the first alternative (7 entries in p) has contents, at
    # p[5]; in the second, p[5] is the closing brace.
    if len(p) == 7:
        for item in p[5]:
            # anything else (e.g. the None left by a class) is dropped
            if isinstance(item, InternalNode):
                graph.internal_nodes.append(item)
            elif isinstance(item, ExternalNode):
                graph.external_nodes.append(item)
            elif isinstance(item, Edge):
                graph.edges.append(item)
    p[0] = graph

def p_title(p):
    '''
    title : TITLE COLON STRING
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value of a title is the text of its string.
    text = p[3]
    p[0] = text

def p_graph_contents(p):
    '''
    graph_contents \
        : graph_item graph_contents
        | graph_item
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value is a tuple of the items, in the order they appeared:
    # this item, then those that follow (if any).
    following = p[2] if len(p) == 3 else ()
    p[0] = (p[1],) + following

def p_graph_item(p):
    '''
    graph_item \
        : class
        | node
        | edge
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # Whichever kind of item it was, pass its value on unchanged.
    item = p[1]
    p[0] = item

def p_class(p):
    '''
    class \
        : CLASS OPEN_BRACE \
          CLASSNAME COLON STRING \
          LABEL COLON STRING \
          PARENT COLON STRING \
          VIRTUALS COLON STRING \
          CLOSE_BRACE
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # Classes are recognised but ignored: the value is None.
    p[0] = None

def p_node(p):
    '''
    node : NODE COLON OPEN_BRACE title node_content CLOSE_BRACE
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value is the node built from the content, with the title
    # added; the symbol is what follows the title's last colon (all
    # of it, if there's no colon).
    title = p[4]
    node = p[5]
    node.title = title
    node.symbol = title.rpartition(':')[2]
    p[0] = node

def p_node_content(p):
    '''
    node_content \
        : internal_node
        | external_node
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # Either kind of node is passed on unchanged.
    node = p[1]
    p[0] = node

# Matcher for the 'label' of an internal node (a subprogram in this CI
# file). The label has four fields separated by a literal backslash
# followed by 'n' (not a newline character): a run of non-space
# characters, any text, and two fields each starting with a number.
internal_matcher = re.compile(r'^(\S+)\\n(.*)\\n(\d+).*\\n(\d+).*$')

def p_internal_node(p):
    '''
    internal_node : LABEL COLON STRING
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value is an InternalNode; the label's second field is its
    # source, the two numbers its stack use and dynamic object count.
    # If the label isn't recognised these are left at their defaults.
    label = p[3]
    node = InternalNode()
    node.content = label
    found = internal_matcher.match(label)
    if found is None:
        warning("failed to match internal '%s'" % label)
    else:
        node.source = found.group(2)
        node.static_stack = int(found.group(3))
        node.dynamic_objects = int(found.group(4))
    p[0] = node

# Matcher for the 'label' of an external node (a subprogram not in
# this CI file). The label has two fields separated by a literal
# backslash followed by 'n' (not a newline character): a run of
# non-space characters, then any text.
external_matcher = re.compile(r'^(\S+)\\n(.*)$')

def p_external_node(p):
    '''
    external_node : LABEL COLON STRING SHAPE COLON ELLIPSE
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value is an ExternalNode; the label's second field is its
    # source, left empty if the label isn't recognised.
    label = p[3]
    node = ExternalNode()
    node.content = label
    found = external_matcher.match(label)
    if found is None:
        warning("failed to match external '%s'" % label)
    else:
        node.source = found.group(2)
    p[0] = node

def p_edge(p):
    '''
    edge \
        : EDGE COLON OPEN_BRACE \
          SOURCENAME COLON STRING \
          TARGETNAME COLON STRING \
          LABEL COLON STRING \
          CLOSE_BRACE
        | EDGE COLON OPEN_BRACE \
          SOURCENAME COLON STRING \
          TARGETNAME COLON STRING \
          CLOSE_BRACE
    '''
    # NB, the docstring is the grammar rule, read by PLY.
    # The value is an Edge. Only the symbol (the part after the last
    # colon) of each name is kept:
    # - for Ada sources, 'sourcename' is sometimes filename:source
    # - for targets in this C source file, 'targetname' is
    #   filename:target
    call = Edge()
    call.source = p[6].rpartition(':')[2]
    call.target = p[9].rpartition(':')[2]
    # p has more than 11 entries only in the alternative with a label
    if len(p) > 11:
        call.label = p[12]
    p[0] = call

def p_error(p):
    '''Report a syntax error found by the parser.
    If @p is None (the input ended unexpectedly) a warning is issued
    and parsing is left to finish. Otherwise @p is the offending
    token: its type, line and column (counting from 0) are reported
    via error(), which exits.
    '''
    if p is None:
        warning("That seems to be it.")
        return None
    # rfind() returns -1 if the token is on the first line, which
    # puts the start of the line at 0 as required
    line_start = lexer.lexdata.rfind('\n', 0, p.lexpos) + 1
    column = p.lexpos - line_start
    error("Syntax error at %s on line %d:%d" % (p.type, p.lineno, column))

# ----------------------------------------------------------------------
# Lexer
# ----------------------------------------------------------------------

# See https://www.dabeaz.com/ply/ply.html#ply_nn21 for management of
# colons within strings using states.

# The names of all the tokens the lexer can produce; PLY requires
# this list, and the grammar rules in the parser refer to these names.
tokens = (
    'CLASS',
    'CLASSNAME',
    'CLOSE_BRACE',
    'COLON',
    'EDGE',
    'ELLIPSE',
    'GRAPH',
    'LABEL',
    'NODE',
    'OPEN_BRACE',
    'PARENT',
    'SHAPE',
    'SOURCENAME',
    'STRING',
    'TARGETNAME',
    'TITLE',
    'VIRTUALS',
)

# Lexer states, in addition to PLY's default INITIAL state. The lexer
# is in inString between a string's opening and closing quotes; as
# the state is exclusive, only the rules declared for it (named
# t_inString_... or t_ANY_...) apply there.
states = (
    ('inString', 'exclusive'),
)

# Keywords and punctuation, each defined by a regular expression.
# None is declared for the inString state, so they are only
# recognised outside strings.
# NOTE(review): 'class' is a prefix of 'classname'; this relies on
# PLY trying the longer of these patterns first -- confirm against
# the PLY documentation if the list is changed.
t_CLASS = r'class'
t_CLASSNAME = r'classname'
t_CLOSE_BRACE = r'}'
t_EDGE = r'edge'
t_ELLIPSE = r'ellipse'
t_GRAPH = r'graph'
t_LABEL = r'label'
t_NODE = r'node'
t_OPEN_BRACE = r'{'
t_PARENT = r'parent'
t_SHAPE = r'shape'
t_SOURCENAME = r'sourcename'
t_TARGETNAME = r'targetname'
t_TITLE = r'title'
t_VIRTUALS = r'virtuals'

# colons outside strings
t_INITIAL_COLON = r':'

def t_inString(t):
    r'"'
    # NB, the docstring is the regular expression, read by PLY.
    # An opening quote: switch to the inString state, noting where
    # the string's content starts (just after the quote). Nothing is
    # returned, so no token is produced yet.
    lexer_ = t.lexer
    lexer_.begin('inString')
    lexer_.string_start = lexer_.lexpos

def t_inString_content(t):
    r'[^"]+'
    # NB, the docstring is the regular expression, read by PLY.
    # Everything up to the closing quote is matched in one go (rather
    # than a character at a time) and discarded: nothing is returned,
    # so no token is produced. The string's value is taken from the
    # input when the closing quote is found.

def t_inString_endquote(t):
    r'"'
    # NB, the docstring is the regular expression, read by PLY.
    # A closing quote: produce a STRING token whose value is the
    # input between the quotes, and go back to the INITIAL state.
    lexer_ = t.lexer
    # the scan position is just past the closing quote; leave it off
    text = lexer_.lexdata[lexer_.string_start:lexer_.lexpos - 1]
    # newlines inside the string aren't seen by the newline rule
    lexer_.lineno += text.count('\n')
    lexer_.begin('INITIAL')
    t.type = 'STRING'
    t.value = text
    return t

def t_INITIAL_newline(t):
    r'\n+'
    # NB, the docstring is the regular expression, read by PLY.
    # Newlines outside strings produce no token; the rule exists so
    # that blank lines are counted as well as significant ones.
    skipped = len(t.value)
    t.lexer.lineno = t.lexer.lineno + skipped

# A string containing ignored characters (space, tab and CR), in
# every state. Ignoring them inside a string doesn't alter the
# string's value, which is taken directly from the input once the
# closing quote has been found.
t_ANY_ignore = ' \t\r'

def t_ANY_error(t):
    '''Error handling, in every state: report a character that no
    rule matches, then skip it and carry on.'''
    # t.value is all of the input from the offending character
    # onwards, so report just the first character
    warning("\nIllegal character '%s', line %d"
            % (t.value[0], t.lexer.lineno))
    t.lexer.skip(1)

# ----------------------------------------------------------------------
# Main
# ----------------------------------------------------------------------

def warning(msg):
    """Write @msg, followed by a newline, to standard error."""
    line = "%s\n" % msg
    sys.stderr.write(line)

def error(msg):
    """Write @msg, followed by a newline, to standard error, then
    exit with status 1."""
    line = "%s\n" % msg
    sys.stderr.write(line)
    raise SystemExit(1)

def main():
    """Command line entry point.
    Processes the options, creates the lexer and parser, reads the
    .ci files named on the command line (adding them to previously
    saved data, if requested), optionally saves the accumulated data,
    and writes each subprogram's total stack depth to the CSV file.
    Exits with status 1 if the options are invalid or no input files
    are given.
    """
    # these are used by Graphs.add_ci_file() and p_error()
    global lexer, parser, verbosity

    def usage():
        sys.stderr.write('usage: stack_usage.py [flags] <input .ci files>\n')
        sys.stderr.write('flags:\n')
        sys.stderr.write('-h, --help:        '
                         + 'output this message\n')
        sys.stderr.write('-s, --save=file:   '
                         + 'save data in file\n')
        sys.stderr.write('-l, --load=file:   '
                         + 'restore previously saved data\n')
        sys.stderr.write('-o, --output=file: '
                         + 'file for CSV output (D=stack_usage.csv)\n')
        sys.stderr.write('-d, --diagnostics: '
                         + 'output diagnostic info on parsing\n')

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'hs:l:o:d',
            ('help', 'save=', 'load=', 'output=', 'diagnostics', ))
    except getopt.GetoptError:
        usage()
        sys.exit(1)

    output_file = 'stack_usage.csv'
    save_file = None
    load_file = None
    verbosity = False

    for o, v in opts:
        if o in ('-h', '--help'):
            usage()
            sys.exit()
        elif o in ('-d', '--diagnostics'):
            # passed to the parser as its debug flag
            verbosity = True
        elif o in ('-l', '--load'):
            load_file = v
        elif o in ('-s', '--save'):
            save_file = v
        elif o in ('-o', '--output'):
            output_file = v

    if not args:
        usage()
        sys.exit(1)

    # create the lexer
    lexer = lex.lex()
    # create the parser (global, for p_error())
    parser = yacc.yacc()

    # check for load of previous run
    if load_file is not None:
        # NB: unpickling can run arbitrary code, so only load files
        # that were written by this script's --save
        with open(load_file, "rb") as saved:
            graphs = pickle.load(saved)
    else:
        graphs = Graphs()

    # parse the input files, collect the data
    for f in args:
        graphs.add_ci_file(f)

    # save if requested (before the depths are calculated)
    if save_file is not None:
        with open(save_file, "wb") as saved:
            pickle.dump(graphs, saved)

    with open(output_file, mode='w') as csv_file:
        csv_writer = csv.DictWriter(csv_file, ('Caller', 'Depth'))
        csv_writer.writeheader()
        for caller, depth in graphs.usage():
            csv_writer.writerow({'Caller': caller, 'Depth': depth})

# Run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()