-
Notifications
You must be signed in to change notification settings - Fork 5
/
provenance.py
89 lines (66 loc) · 3.03 KB
/
provenance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
import subprocess
def generate_provenance_json(script="unknown", params=None):
    """Generate the provenance in a format which can later be output as valid json.

    Inputs:
        script (string): The name of the script used to trigger the data
            generation/deidentification/synthesis process
        params (dict or None): The parameters used to tune the data generation etc.
            process; should include random seeds and other options as appropriate
            for the method. None (the default) is recorded as an empty dict.
    Returns:
        dict: Details of the script called by the user and any relevant parameters,
            under the keys "script", "commit", "local_modifications" and "parameters"
    """
    # A fresh dict per call: a shared mutable default would let one caller's
    # edits to the returned "parameters" entry leak into every later call.
    if params is None:
        params = {}

    return {
        "script": script,
        "commit": get_git_commit_hash(),
        "local_modifications": get_local_changes(),
        "parameters": params,
    }
def get_git_commit_hash():
    """Get the hash of the latest commit in the directory from which this command was called.

    Returns:
        string: The short hash of the latest commit, or "unknown" if git is not
            installed, the command fails (e.g. not inside a git repository), or
            the output is not in the expected format
    """
    # Use git rev-parse to try to get the current hash, then use regex to check its format.
    # The 7-length hash that we ask for should be fine, and if it goes over then we'll
    # check for that in the regex (git rev-parse will apparently return as many
    # characters as needed for a unique short hash).
    try:
        raw_output = subprocess.check_output(["git", "rev-parse", "--short=7", "HEAD"])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git exited non-zero (e.g. not a repository).
        # OSError: the git executable could not be launched at all (e.g. not installed).
        return "unknown"

    revision = raw_output.strip().decode()

    # Default to "unknown" if the string returned by the git command isn't in the expected format
    if re.fullmatch(r"[a-z0-9]{7,10}", revision) is None:
        return "unknown"
    return revision
def get_local_changes():
    """Determine whether local changes have been made to the current git repository

    Untracked files (lines starting with "??") are not counted as modifications.

    Returns:
        bool or None: bool indicating presence of modifications, or None if the
            status could not be determined (git is not installed, or the command
            failed, e.g. because it was not run in a git repository)
    """
    try:
        raw_output = subprocess.check_output(["git", "status", "--porcelain"])
    except (subprocess.CalledProcessError, OSError):
        # A single None (rather than a tuple) keeps the return shape consistent
        # with the success path and stays falsy for callers that truth-test it.
        return None

    status = raw_output.strip().decode()

    local_mods = False
    # If there have been changes, each line will start with a one or two character key
    # indicating type of change. An empty status yields no lines, so the loop is skipped.
    for line in status.splitlines():
        if re.match(r"[ MADRCUT]{1,5}", line):
            local_mods = True
        elif line.startswith("??"):  # Indicates untracked files - we don't need to record this
            continue
        else:
            print("*****")
            print("Unexpected start of line - does this indicate a local modification?")
            print(line)
            print("*****")
            local_mods = True  # play it safe and indicate that there are local changes
    return local_mods
if __name__ == "__main__":
    # When run as a script, print the provenance record built with default arguments.
    print(generate_provenance_json())