Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

syntax analysis #32

Merged
merged 6 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions infrastructure/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__/
22 changes: 22 additions & 0 deletions infrastructure/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# CSV reports produced by the static analysis; each one is the captured
# stdout of a single generator script.
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv

# Files every generator script loads at start-up: each imports all_scripts and
# project_root, and all_scripts reads data/script-globs.json by default.
COMMON_DEPS = all_scripts.py project_root.py data/script-globs.json

# The recipes redirect stdout straight into the target, so a script that fails
# would otherwise leave a truncated CSV behind that make treats as up to date.
.DELETE_ON_ERROR:

static: $(STATIC_OUTPUTS)

# `$<` is the first prerequisite, so the generator script must stay first.
target/scripts_to_benchmark.csv: scripts_to_benchmark.py $(COMMON_DEPS)
	python3 $< > $@

target/lines_of_code.csv: count_lines_of_code.py $(COMMON_DEPS)
	python3 $< > $@

target/nodes_in_scripts.csv: count_nodes_in_scripts.py syntax_analysis.py $(COMMON_DEPS)
	python3 $< > $@

static-test: tests/test_syntax_analysis.py
	python3 -m unittest $<

clean-static:
	rm -f $(STATIC_OUTPUTS)

dynamic:

.PHONY: static dynamic clean-static static-test
21 changes: 21 additions & 0 deletions infrastructure/all_scripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env python3

from pathlib import Path
from typing import Optional
import json

from project_root import get_project_root

def get_all_scripts(
    scripts_file: Optional[Path] = None,
) -> dict[str, list[Path]]:
    """Map each benchmark name to the script paths matched by its globs.

    Args:
        scripts_file: JSON file shaped like
            ``{"<benchmark>": {"scripts": ["<glob>", ...]}, ...}``.
            Defaults to ``infrastructure/data/script-globs.json`` under the
            project root.  The default is resolved on each call rather than
            at import time, so importing this module does not run ``git``.

    Returns:
        A dict from benchmark name to the paths matching that benchmark's
        globs.  Globs are expanded relative to the project root; the matches
        of each glob are sorted so the result does not depend on directory
        listing order.
    """
    root = get_project_root()
    if scripts_file is None:
        scripts_file = root / 'infrastructure/data/script-globs.json'
    benchmarks: dict[str, dict[str, list[str]]] = json.loads(scripts_file.read_text())
    return {
        benchmark_name: [
            script
            for script_glob in benchmark['scripts']
            for script in sorted(root.glob(script_glob))
        ]
        for benchmark_name, benchmark in benchmarks.items()
    }
24 changes: 24 additions & 0 deletions infrastructure/count_lines_of_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python3

from pathlib import Path
from typing import Optional
import json
from subprocess import Popen, PIPE

from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

# Print one `<script path relative to the project root>,<lines of code>` row
# per script, using the `code` total that cloc reports under `SUM`.
root = get_project_root()
for _benchmark, benchmark_scripts in get_all_scripts().items():
    # Launch every cloc process for this benchmark before reading any output,
    # so the processes run side by side.
    launched = [
        (Popen(['cloc', '--json', path], stdout=PIPE), path.relative_to(root))
        for path in benchmark_scripts
    ]
    for cloc_process, relative_path in launched:
        raw_report, _ = cloc_process.communicate()
        report = json.loads(raw_report.decode())
        print(relative_path, report['SUM']['code'], sep=',')
20 changes: 20 additions & 0 deletions infrastructure/count_nodes_in_scripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python3

from pathlib import Path
from typing import Optional
import json
from subprocess import check_output
from collections import Counter

from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

# Print one `<script path relative to the project root>,<kind>:<n>;<kind>:<n>;...`
# row per script, where the kinds are the keys tallied by count_nodes.
root = get_project_root()
for _benchmark, benchmark_scripts in get_all_scripts().items():
    for script_path in benchmark_scripts:
        parsed = parse_shell_script(script_path)
        tally = Counter()
        count_nodes(parsed, tally)
        summary = ';'.join(f'{kind}:{total}' for kind, total in tally.items())
        print(script_path.relative_to(root), summary, sep=',')
38 changes: 38 additions & 0 deletions infrastructure/data/script-globs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
    "covid-mts": {
        "scripts": ["covid-mts/scripts/*.sh"]
    },
    "file-enc": {
        "scripts": ["file-enc/scripts/*.sh"]
    },
    "log-analysis": {
        "scripts": ["log-analysis/scripts/*.sh"]
    },
    "max-temp": {
        "scripts": ["max-temp/scripts/*.sh"]
    },
    "media-conv": {
        "scripts": ["media-conv/scripts/*.sh"]
    },
    "nlp": {
        "scripts": ["nlp/scripts/*.sh"]
    },
    "oneliners": {
        "scripts": ["oneliners/scripts/*.sh"]
    },
    "sklearn": {
        "scripts": ["sklearn/run.sh"]
    },
    "riker": {
        "scripts": ["riker/scripts/*/build.sh"]
    },
    "uniq-ips": {
        "scripts": ["uniq-ips/run.sh"]
    },
    "unix50": {
        "scripts": ["unix50/scripts/*.sh"]
    },
    "web-index": {
        "scripts": ["web-index/scripts/*.sh"]
    }
}
10 changes: 10 additions & 0 deletions infrastructure/project_root.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python3

from subprocess import run, CalledProcessError
from pathlib import Path

def get_project_root() -> Path:
    """Return the top-level directory of the enclosing git repository.

    Runs ``git rev-parse --show-toplevel`` in the current working directory.

    Raises:
        RuntimeError: if the ``git`` executable cannot be started, or if git
            exits with a non-zero status; git's stderr is included in the
            message in the latter case.
    """
    try:
        result = run(['git', 'rev-parse', '--show-toplevel'], capture_output=True, text=True)
    except OSError as err:
        # `run` raises rather than returning when git itself cannot be launched;
        # report that the same way as a failing git invocation.
        raise RuntimeError(f'could not find project root: `{err}`') from err
    if result.returncode != 0:
        raise RuntimeError(f'could not find project root: `{result.stderr}`')
    # git only emits one trailing newline after the path, so remove exactly
    # that one instead of stripping all trailing whitespace.
    return Path(result.stdout.removesuffix('\n'))
2 changes: 2 additions & 0 deletions infrastructure/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
shasta
libdash
15 changes: 15 additions & 0 deletions infrastructure/scripts_to_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

from pathlib import Path
from typing import Optional
import json
from subprocess import check_output

from all_scripts import get_all_scripts
from syntax_analysis import parse_shell_script, count_nodes
from project_root import get_project_root

# Print one `<script path relative to the project root>,<benchmark name>` row
# per script.
root = get_project_root()
scripts_by_benchmark = get_all_scripts()
for benchmark in scripts_by_benchmark:
    for script_path in scripts_by_benchmark[benchmark]:
        print(script_path.relative_to(root), benchmark, sep=',')
204 changes: 204 additions & 0 deletions infrastructure/syntax_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
#!/usr/bin/env python3

# Good points of reference:
# https://github.com/binpash/shasta/blob/main/shasta/ast_node.py
# https://github.com/binpash/Shseer/blob/8bb9e72f7fe1b4703fc963bfa5d5bd2837e80ab3/src/shseer/symb.py

import sys
from enum import StrEnum, auto
from dataclasses import dataclass
import operator
from collections import Counter
import sys
import libdash
import functools as ft
from shasta.json_to_ast import to_ast_node
from shasta.ast_node import (
AndNode,
ArgChar,
EArgChar,
CArgChar,
TArgChar,
AArgChar, VArgChar, QArgChar, BArgChar,
AstNode,
CArgChar,
CaseNode,
CommandNode,
DefunNode,
DupRedirNode,
FileRedirNode,
ForNode,
HeredocRedirNode,
IfNode,
NotNode,
OrNode,
PipeNode,
RedirNode,
SemiNode,
SubshellNode,
WhileNode,
BackgroundNode,
AssignNode,
)
from collections import namedtuple

# Raise the interpreter's recursion ceiling for the whole process.
# NOTE(review): presumably needed because count_nodes recurses once per level
# of AST nesting — confirm.
sys.setrecursionlimit(9001)

# "Node With Source": an AST node together with the source text and the line
# numbers reported alongside it by the parser.
NodeWSource = namedtuple(
    'NodeWSource',
    'node source_syntax linum_before linum',
)

# True until the first successful parse; its current value is forwarded to
# libdash.parse on every call.
first_time = True


def parse_shell_script(path):
    """Parse the shell script at *path* into a list of NodeWSource tuples.

    *path* may be a ``str`` or a ``pathlib.Path``.  Each tuple returned by
    libdash is repackaged with its raw AST converted through ``to_ast_node``.
    """
    global first_time
    # NOTE(review): the second argument is presumably libdash's one-time
    # initialisation switch — confirm against the libdash API.
    parsed = libdash.parse(str(path), first_time)
    first_time = False
    wrapped = []
    for raw_ast, source, linum_before, linum in parsed:
        wrapped.append(
            NodeWSource(
                node=to_ast_node(raw_ast),
                source_syntax=source,
                linum_before=linum_before,
                linum=linum,
            )
        )
    return wrapped

class NodeVariant(StrEnum):
    """
    Kinds of shell syntax that count_nodes tallies.

    Components are listed exhaustively here for documentation purposes,
    and so that we have the string names.

    It is likely that we want to add more variants for command names, so it makes sense
    to define a node kind for our purposes.
    """
    PIPELINE = auto() # command1 | command2
    BACKGROUND = auto() # command1 &
    SUBSHELL_COMMAND = auto() # (command)
    # SEMICOLON_COMMAND = auto() # command1; command2 # this is not useful
    AND_COMMAND = auto() # command1 && command2
    OR_COMMAND = auto() # command1 || command2
    NEGATE_COMMAND = auto() # ! command1
    WHILE_COMMAND = auto() # while cond; do command2; done
    FOR_COMMAND = auto() # for var in words; do command2; done
    IF_COMMAND = auto() # if cond; then command2; fi
    CASE_COMMAND = auto() # case word in pattern) command2;; esac
    FUNCTION_COMMAND = auto() # fname() { fdefinition }

    ASSIGNMENT = auto() # a=b

    REDIRECTION = auto() # a command wrapped with redirections (counted for RedirNode)
    FILE_REDIRECTION = auto() # command >> file.txt
    # TRUNCATE_FILE_REDIRECTION = auto() # >
    # TRUNCATE_FORCE_FILE_REDIRECTION = auto() # >|
    # INPUT_FILE_REDIRECTION = auto() # <
    # READ_WRITE_FILE_REDIRECTION = auto() # <>
    # APPEND_FILE_REDIRECTION = auto() # >>

    DUP_REDIRECTION = auto() # >&
    HEREDOC_REDIRECTION = auto() # <<EOF

    HOME_TILDE_CONTROL = auto() # ~ or ~user
    VARIABLE_USE = auto() # $something
    DOLLAR_PAREN_SHELL_CONTROL = auto() # $()
    DOLLAR_PAREN_PAREN_ARITH_CONTROL = auto() # $(())
    QUOTED_CONTROL = auto() # "quoted text"

    # STRING_CHAR = auto() # a=stringlit # excluded because I don't know what to do with this
    ESCAPED_CHAR = auto() # escaped character, e.g. \x
    RAW_COMMAND = auto() # like echo (count_nodes records Command(name) keys instead)

@dataclass(frozen=True)
class Command:
    """Hashable key identifying a command by its name.

    Instances print as ``command(<name>)``.
    """
    name: str

    def __str__(self):
        return 'command({})'.format(self.name)

def count_nodes(asts, count: Counter[NodeVariant]):
match asts:
case NodeWSource(node=subnode):
count_nodes(subnode, count)
case CArgChar():
pass
case EArgChar():
count[NodeVariant.ESCAPED_CHAR] += 1
case TArgChar():
count[NodeVariant.HOME_TILDE_CONTROL] += 1
case AArgChar(arg=more):
count_nodes(more, count)
count[NodeVariant.DOLLAR_PAREN_PAREN_ARITH_CONTROL] += 1
case VArgChar(arg=more):
count[NodeVariant.VARIABLE_USE] += 1
count_nodes(more, count)
case QArgChar(arg=more):
count[NodeVariant.QUOTED_CONTROL] += 1
case BArgChar(node=node):
count[NodeVariant.DOLLAR_PAREN_SHELL_CONTROL] += 1
case PipeNode(items=subnodes):
count_nodes(subnodes, count)
count[NodeVariant.PIPELINE] += 1
case SubshellNode(body=subnode):
count_nodes(subnode, count)
count[NodeVariant.SUBSHELL_COMMAND] += 1
case NotNode(body=subnode):
count_nodes(subnode, count)
count[NodeVariant.NEGATE_COMMAND] += 1
case RedirNode(node=subnode):
count_nodes(subnode, count)
count[NodeVariant.REDIRECTION] += 1
case BackgroundNode(node=subnode):
count_nodes(subnode, count)
count[NodeVariant.BACKGROUND] += 1
case DefunNode(body=subnode):
count_nodes(subnode, count)
count[NodeVariant.FUNCTION_COMMAND] += 1
case AndNode(left_operand=l, right_operand=r):
count_nodes([l, r], count)
count[NodeVariant.AND_COMMAND] += 1
case OrNode(left_operand=l, right_operand=r):
count_nodes([l, r], count)
count[NodeVariant.OR_COMMAND] += 1
case SemiNode(left_operand=l, right_operand=r):
count_nodes([l, r], count)
case WhileNode(test=l, body=r):
count_nodes([l, r], count)
count[NodeVariant.WHILE_COMMAND] += 1
case IfNode(cond=t, then_b=thn, else_b=els):
count_nodes([t, thn, els], count)
count[NodeVariant.IF_COMMAND] += 1
case ForNode(argument=lolo_argchar, body=subnode):
count_nodes(lolo_argchar, count)
count_nodes(subnode, count)
count[NodeVariant.FOR_COMMAND] += 1
case CaseNode(argument=argchars, cases=cases):
count[NodeVariant.CASE_COMMAND] += 1
for node in argchars:
count_nodes(node, count)
for c in cases:
count_nodes(c['cbody'], count)
case AssignNode(val=argchars):
count[NodeVariant.ASSIGNMENT] += 1
count_nodes(argchars, count)
case FileRedirNode(arg=argchars):
count[NodeVariant.FILE_REDIRECTION] += 1
count_nodes(argchars, count)
case DupRedirNode(arg=argchars):
count[NodeVariant.DUP_REDIRECTION] += 1
count_nodes(argchars, count)
case HeredocRedirNode(arg=argchars):
count[NodeVariant.HEREDOC_REDIRECTION] += 1
count_nodes(argchars, count)
case CommandNode(arguments=lolo_argchar, assignments=assignments, redir_list=redir_list):
if len(lolo_argchar) > 0 and all(isinstance(c, CArgChar) for c in lolo_argchar[0]):
command_name = ''.join(str(c) for c in lolo_argchar[0])
count[Command(command_name)] += 1
count_nodes(assignments, count)
count_nodes(redir_list, count)
count_nodes(lolo_argchar, count)
case [*subnodes]:
for node in subnodes:
count_nodes(node, count)
case other:
raise Exception(f"oops: {other} of type {type(other)}")

if __name__ == '__main__':
    # Ad-hoc run: tally the constructs of ./script.sh and print the counter.
    tally = Counter()
    count_nodes(parse_shell_script("script.sh"), tally)
    print(tally)
1 change: 1 addition & 0 deletions infrastructure/target/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Loading
Loading