binpash · vagos · Nov 21, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 11, 2024
diff --git a/infrastructure/.gitignore b/infrastructure/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/infrastructure/Makefile b/infrastructure/Makefile
@@ -0,0 +1,22 @@
+# CSV reports generated by the static analyses.
+STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv
+
+# Modules imported by every report generator, plus the glob list they read.
+# A change to any of them must regenerate the reports.
+STATIC_COMMON_DEPS = all_scripts.py project_root.py syntax_analysis.py data/script-globs.json
+
+# Delete a half-written CSV when its generator fails, so that a later `make`
+# does not mistake the truncated file for an up-to-date target.
+.DELETE_ON_ERROR:
+
+static: $(STATIC_OUTPUTS)
+
+# In each rule the generator script stays the FIRST prerequisite: `$<` runs it.
+target/scripts_to_benchmark.csv: scripts_to_benchmark.py $(STATIC_COMMON_DEPS)
+	python3 $< > $@
+
+target/lines_of_code.csv: count_lines_of_code.py $(STATIC_COMMON_DEPS)
+	python3 $< > $@
+
+target/nodes_in_scripts.csv: count_nodes_in_scripts.py $(STATIC_COMMON_DEPS)
+	python3 $< > $@
+
+static-test: tests/test_syntax_analysis.py
+	python3 -m unittest $<
+
+clean-static:
+	rm -f $(STATIC_OUTPUTS)
+
+# Placeholder: no dynamic analyses are wired up yet.
+dynamic:
+
+.PHONY: static dynamic clean-static static-test
diff --git a/infrastructure/all_scripts.py b/infrastructure/all_scripts.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+from typing import Optional
+import json
+
+from project_root import get_project_root
+
+def get_all_scripts(
+    scripts_file: Optional[Path] = None
+) -> dict[str, list[Path]]:
+    """Map every benchmark name to the script paths matched by its globs.
+
+    Args:
+        scripts_file: JSON file shaped like
+            ``{benchmark_name: {"scripts": [glob, ...]}}``; the globs are
+            resolved against the project root.  Defaults to
+            ``infrastructure/data/script-globs.json`` under the project root.
+
+    Returns:
+        A dict from benchmark name to the paths matched by that benchmark's
+        globs.  Globs are expanded in the order listed and the matches of each
+        glob are sorted, so the result is reproducible across runs.  A
+        benchmark whose globs match nothing maps to an empty list.
+    """
+    # Resolve the root once: every get_project_root() call spawns `git`.
+    root = get_project_root()
+    if scripts_file is None:
+        # Resolved lazily so that merely importing this module does not run `git`.
+        scripts_file = root / 'infrastructure/data/script-globs.json'
+    benchmarks: dict[str, dict[str, list[str]]] = json.loads(scripts_file.read_text())
+    return {
+        benchmark_name: [
+            script
+            for script_glob in benchmark['scripts']
+            for script in sorted(root.glob(script_glob))
+        ]
+        for benchmark_name, benchmark in benchmarks.items()
+    }
diff --git a/infrastructure/count_lines_of_code.py b/infrastructure/count_lines_of_code.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+from typing import Optional
+import json
+from subprocess import Popen, PIPE
+
+from all_scripts import get_all_scripts
+from syntax_analysis import parse_shell_script, count_nodes
+from project_root import get_project_root
+
+root = get_project_root()
+for benchmark_name, scripts in get_all_scripts().items():
+    # Start one `cloc` per script before reading any output, so the counters
+    # for a benchmark run concurrently.
+    launched = [
+        (Popen(['cloc', '--json', script], stdout=PIPE), script.relative_to(root))
+        for script in scripts
+    ]
+    for process, relative_script in launched:
+        # communicate() drains stdout and waits for this cloc to exit.
+        raw_report, _ = process.communicate()
+        report = json.loads(raw_report.decode())
+        # One CSV row per script: <path relative to the project root>,<SUM.code>.
+        print(relative_script, report['SUM']['code'], sep=',')
diff --git a/infrastructure/count_nodes_in_scripts.py b/infrastructure/count_nodes_in_scripts.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+from typing import Optional
+import json
+from subprocess import check_output
+from collections import Counter
+
+from all_scripts import get_all_scripts
+from syntax_analysis import parse_shell_script, count_nodes
+from project_root import get_project_root
+
+root = get_project_root()
+for benchmark_name, scripts in get_all_scripts().items():
+    for script in scripts:
+        # Tally the syntactic constructs of this one script.
+        tally = Counter()
+        count_nodes(parse_shell_script(script), tally)
+        # One CSV row per script: <relative path>,<kind>:<count>;<kind>:<count>;...
+        fields = [f'{kind}:{occurrences}' for kind, occurrences in tally.items()]
+        print(script.relative_to(root), ';'.join(fields), sep=',')
diff --git a/infrastructure/data/script-globs.json b/infrastructure/data/script-globs.json
@@ -0,0 +1,38 @@
+{
+    "covid-mts": {
+        "scripts": ["covid-mts/scripts/*.sh"]
+    },
+    "file-enc": {
+        "scripts": ["file-enc/scripts/*.sh"]
+    },
+    "log-analysis": {
+        "scripts": ["log-analysis/scripts/*.sh"]
+    },
+    "max-temp": {
+        "scripts": ["max-temp/scripts/*.sh"]
+    },
+    "media-conv": {
+        "scripts": ["media-conv/scripts/*.sh"]
+    },
+    "nlp": {
+        "scripts": ["nlp/scripts/*.sh"]
+    },
+    "oneliners": {
+        "scripts": ["oneliners/scripts/*.sh"]
+    },
+    "sklearn": {
+        "scripts": ["sklearn/run.sh"]
+    },
+    "riker": {
+        "scripts": ["riker/scripts/*/build.sh"]
+    },
+    "uniq-ips": {
+        "scripts": ["uniq-ips/run.sh"]
+    },
+    "unix50": {
+        "scripts": ["unix50/scripts/*.sh"]
+    },
+    "web-index": {
+        "scripts": ["web-index/scripts/*.sh"]
+    }
+}
diff --git a/infrastructure/project_root.py b/infrastructure/project_root.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+from subprocess import run, CalledProcessError
+from pathlib import Path
+
+def get_project_root():
+    """Return the top-level directory of the enclosing git work tree as a Path.
+
+    The directory is obtained from ``git rev-parse --show-toplevel``, run in
+    the current working directory.
+
+    Raises:
+        Exception: if git exits with a non-zero status; the message carries
+            git's stderr.
+    """
+    git = run(
+        ['git', 'rev-parse', '--show-toplevel'],
+        capture_output=True,
+        text=True,
+    )
+    if git.returncode == 0:
+        # git only emits one trailing newline in the path
+        return Path(git.stdout.removesuffix('\n'))
+    raise Exception(f'could not find project root: `{git.stderr}`')
diff --git a/infrastructure/requirements.txt b/infrastructure/requirements.txt
@@ -0,0 +1,2 @@
+shasta
+libdash
diff --git a/infrastructure/scripts_to_benchmark.py b/infrastructure/scripts_to_benchmark.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+from typing import Optional
+import json
+from subprocess import check_output
+
+from all_scripts import get_all_scripts
+from syntax_analysis import parse_shell_script, count_nodes
+from project_root import get_project_root
+
+root = get_project_root()
+# One CSV row per script: <path relative to the project root>,<benchmark name>.
+for benchmark_name, scripts in get_all_scripts().items():
+    for script in scripts:
+        relative_script = script.relative_to(root)
+        print(f'{relative_script},{benchmark_name}')
diff --git a/infrastructure/syntax_analysis.py b/infrastructure/syntax_analysis.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+
+# Good points of reference:
+# https://github.com/binpash/shasta/blob/main/shasta/ast_node.py
+# https://github.com/binpash/Shseer/blob/8bb9e72f7fe1b4703fc963bfa5d5bd2837e80ab3/src/shseer/symb.py
+
+import sys
+from enum import StrEnum, auto
+from dataclasses import dataclass
+import operator
+from collections import Counter
+import sys
+import libdash
+import functools as ft
+from shasta.json_to_ast import to_ast_node
+from shasta.ast_node import (
+    AndNode,
+    ArgChar,
+    EArgChar,
+    CArgChar,
+    TArgChar,
+    AArgChar, VArgChar, QArgChar, BArgChar,
+    AstNode,
+    CArgChar,
+    CaseNode,
+    CommandNode,
+    DefunNode,
+    DupRedirNode,
+    FileRedirNode,
+    ForNode,
+    HeredocRedirNode,
+    IfNode,
+    NotNode,
+    OrNode,
+    PipeNode,
+    RedirNode,
+    SemiNode,
+    SubshellNode,
+    WhileNode,
+    BackgroundNode,
+    AssignNode,
+)
+from collections import namedtuple
+
+# count_nodes below walks the AST recursively; presumably the limit is raised so
+# that deeply nested scripts do not hit RecursionError -- TODO confirm.
+sys.setrecursionlimit (9001)
+
+# Node With Source: one parsed top-level AST node bundled with the three extra
+# values libdash.parse returns alongside it (see parse_shell_script).
+NodeWSource = namedtuple('NodeWSource', ['node', 'source_syntax', 'linum_before', 'linum'])
+
+first_time = True
+def parse_shell_script(path):
+    """Parse the shell script at ``path`` into a list of NodeWSource records.
+
+    ``path`` may be a str or a pathlib.Path.  Each 4-tuple yielded by
+    libdash.parse becomes one NodeWSource: the raw AST is converted with
+    to_ast_node and the remaining three values are passed through unchanged.
+    """
+    global first_time
+    # The module-level flag is handed to libdash as True only until one parse
+    # has succeeded (presumably it asks libdash to initialise itself -- TODO confirm).
+    raw_asts = libdash.parse(str(path), first_time)
+    first_time = False
+    nodes = []
+    for raw_ast, source, linum_before, linum in raw_asts:
+        converted = to_ast_node(raw_ast)
+        nodes.append(NodeWSource(converted, source, linum_before, linum))
+    return nodes
+
+class NodeVariant(StrEnum):
+    """
+    Kinds of shell syntax that count_nodes tallies.
+
+    Components are listed exhaustively here for documentation purposes, and so
+    that each has a string name (StrEnum's auto() yields the lower-cased member
+    name).
+
+    It is likely that we want to add more variants for command names, so it makes sense
+    to define a node kind for our purposes.
+    """
+    PIPELINE = auto() # command1 | command2
+    BACKGROUND = auto() # command1 &
+    SUBSHELL_COMMAND = auto() # (command)
+#     SEMICOLON_COMMAND = auto() # command1; command2 # this is not useful
+    AND_COMMAND = auto() # command1 && command2
+    OR_COMMAND = auto() # command1 || command2
+    NEGATE_COMMAND = auto() # ! command1
+    WHILE_COMMAND = auto() # while cond; do command; done
+    FOR_COMMAND = auto() # for var in words; do command; done
+    IF_COMMAND = auto() # if cond; then command; fi
+    CASE_COMMAND = auto() # case word in pattern) command;; esac
+    FUNCTION_COMMAND = auto() # fname() { fdefinition }
+
+    ASSIGNMENT = auto() # a=b
+
+    REDIRECTION = auto() # counted once per RedirNode; the individual redirections are the variants below
+    FILE_REDIRECTION = auto() # command >> file.txt
+#     TRUNCATE_FILE_REDIRECTION = auto() # >
+#     TRUNCATE_FORCE_FILE_REDIRECTION = auto() # >|
+#     INPUT_FILE_REDIRECTION = auto() # <
+#     READ_WRITE_FILE_REDIRECTION = auto() # <>
+#     APPEND_FILE_REDIRECTION = auto() # >>
+
+    DUP_REDIRECTION = auto() # >&
+    HEREDOC_REDIRECTION = auto() # <<EOF
+
+    HOME_TILDE_CONTROL = auto() # ~
+    VARIABLE_USE = auto() # $something
+    DOLLAR_PAREN_SHELL_CONTROL = auto() # $()
+    DOLLAR_PAREN_PAREN_ARITH_CONTROL = auto() # $(())
+    QUOTED_CONTROL = auto() # "..." (counted once per QArgChar)
+
+#     STRING_CHAR = auto() # a=stringlit # excluded because I don't know what to do with this
+    ESCAPED_CHAR = auto() # escaped
+    RAW_COMMAND = auto() # like echo
+
+@dataclass(frozen=True)
+class Command:
+    """Hashable counter key identifying a command by its name."""
+
+    # The command name exactly as count_nodes assembled it.
+    name: str
+
+    def __str__(self):
+        # Rendered as command(<name>) when used as a label in the reports.
+        return 'command({})'.format(self.name)
+
+def count_nodes(asts, count: Counter[NodeVariant | Command]):
+    """Recursively tally the syntactic constructs found in ``asts``.
+
+    ``asts`` may be a NodeWSource, a shasta AST node or argument character, or
+    a (possibly nested) list of any of these.  Results are accumulated in place
+    in ``count``: NodeVariant keys count constructs, and Command keys count
+    commands whose first word consists of plain characters only.
+
+    Every argument-character kind that carries nested content (arithmetic,
+    variable, quoted, command substitution) is traversed, so constructs inside
+    quotes or ``$(...)`` are counted as well.
+
+    Raises:
+        Exception: if a value of an unrecognised type is encountered.
+    """
+    match asts:
+        # NodeWSource is a namedtuple, hence also a sequence: it has to be
+        # matched before the generic list pattern at the bottom.
+        case NodeWSource(node=subnode):
+            count_nodes(subnode, count)
+        case CArgChar():
+            # Plain characters are not counted.
+            pass
+        case EArgChar():
+            count[NodeVariant.ESCAPED_CHAR] += 1
+        case TArgChar():
+            count[NodeVariant.HOME_TILDE_CONTROL] += 1
+        case AArgChar(arg=more):
+            count_nodes(more, count)
+            count[NodeVariant.DOLLAR_PAREN_PAREN_ARITH_CONTROL] += 1
+        case VArgChar(arg=more):
+            count[NodeVariant.VARIABLE_USE] += 1
+            count_nodes(more, count)
+        case QArgChar(arg=more):
+            count[NodeVariant.QUOTED_CONTROL] += 1
+            # Descend into the quoted characters, as for AArgChar/VArgChar.
+            count_nodes(more, count)
+        case BArgChar(node=node):
+            count[NodeVariant.DOLLAR_PAREN_SHELL_CONTROL] += 1
+            # Descend into the substituted command, as for the other
+            # argument characters that carry nested content.
+            count_nodes(node, count)
+        case PipeNode(items=subnodes):
+            count_nodes(subnodes, count)
+            count[NodeVariant.PIPELINE] += 1
+        case SubshellNode(body=subnode):
+            count_nodes(subnode, count)
+            count[NodeVariant.SUBSHELL_COMMAND] += 1
+        case NotNode(body=subnode):
+            count_nodes(subnode, count)
+            count[NodeVariant.NEGATE_COMMAND] += 1
+        case RedirNode(node=subnode):
+            count_nodes(subnode, count)
+            count[NodeVariant.REDIRECTION] += 1
+        case BackgroundNode(node=subnode):
+            count_nodes(subnode, count)
+            count[NodeVariant.BACKGROUND] += 1
+        case DefunNode(body=subnode):
+            count_nodes(subnode, count)
+            count[NodeVariant.FUNCTION_COMMAND] += 1
+        case AndNode(left_operand=l, right_operand=r):
+            count_nodes([l, r], count)
+            count[NodeVariant.AND_COMMAND] += 1
+        case OrNode(left_operand=l, right_operand=r):
+            count_nodes([l, r], count)
+            count[NodeVariant.OR_COMMAND] += 1
+        case SemiNode(left_operand=l, right_operand=r):
+            # Sequencing itself is not counted, only its operands.
+            count_nodes([l, r], count)
+        case WhileNode(test=l, body=r):
+            count_nodes([l, r], count)
+            count[NodeVariant.WHILE_COMMAND] += 1
+        case IfNode(cond=t, then_b=thn, else_b=els):
+            count_nodes([t, thn, els], count)
+            count[NodeVariant.IF_COMMAND] += 1
+        case ForNode(argument=lolo_argchar, body=subnode):
+            count_nodes(lolo_argchar, count)
+            count_nodes(subnode, count)
+            count[NodeVariant.FOR_COMMAND] += 1
+        case CaseNode(argument=argchars, cases=cases):
+            count[NodeVariant.CASE_COMMAND] += 1
+            for node in argchars:
+                count_nodes(node, count)
+            # Only the body of each case arm is traversed.
+            for c in cases:
+                count_nodes(c['cbody'], count)
+        case AssignNode(val=argchars):
+            count[NodeVariant.ASSIGNMENT] += 1
+            count_nodes(argchars, count)
+        case FileRedirNode(arg=argchars):
+            count[NodeVariant.FILE_REDIRECTION] += 1
+            count_nodes(argchars, count)
+        case DupRedirNode(arg=argchars):
+            count[NodeVariant.DUP_REDIRECTION] += 1
+            count_nodes(argchars, count)
+        case HeredocRedirNode(arg=argchars):
+            count[NodeVariant.HEREDOC_REDIRECTION] += 1
+            count_nodes(argchars, count)
+        case CommandNode(arguments=lolo_argchar, assignments=assignments, redir_list=redir_list):
+            # A command is attributed to a name only when its first word is
+            # made of plain characters, i.e. the name needs no expansion.
+            if len(lolo_argchar) > 0 and all(isinstance(c, CArgChar) for c in lolo_argchar[0]):
+                command_name = ''.join(str(c) for c in lolo_argchar[0])
+                count[Command(command_name)] += 1
+            count_nodes(assignments, count)
+            count_nodes(redir_list, count)
+            count_nodes(lolo_argchar, count)
+        case [*subnodes]:
+            for node in subnodes:
+                count_nodes(node, count)
+        case other:
+            raise Exception(f"oops: {other} of type {type(other)}")
+
+if __name__ == '__main__':
+    # Analyse the script named on the command line; without an argument fall
+    # back to the historical default, "script.sh" in the current directory.
+    p = sys.argv[1] if len(sys.argv) > 1 else "script.sh"
+    asts = parse_shell_script(p)
+    count = Counter()
+    count_nodes(asts, count)
+    print(count)
diff --git a/infrastructure/target/.gitignore b/infrastructure/target/.gitignore
@@ -0,0 +1 @@
+