forked from mirador/nhanes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mergedatasets.py
98 lines (78 loc) · 3.02 KB
/
mergedatasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
'''
Runs all the steps necessary to create a merged Mirador dataset
@copyright: Fathom Information Design 2014
'''
import sys, os, subprocess
def load_components():
    """Read the component list from the ``components`` file in the working directory.

    Blank lines and lines whose first character is ``#`` are skipped. Every
    remaining line that splits into exactly two whitespace-separated fields
    is taken as ``<component name> <metadata file>``; lines with any other
    number of fields are silently ignored.

    Returns:
        A two-element list ``[components, metadata]``: ``components`` maps
        each component name to its metadata file name, and ``metadata`` lists
        the metadata file names in file order followed by ``"weights.xml"``.
    """
    components = {}
    metadata = []
    # The context manager closes the file even if reading raises part-way.
    with open('components', 'r') as ifile:
        for line in ifile:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            comp_name, comp_file = parts
            components[comp_name] = comp_file
            metadata.append(comp_file)
    # weights.xml is always appended after the per-component metadata files.
    metadata.append("weights.xml")
    return [components, metadata]
def run_command(cmd):
    """Run a shell command and record its output in the process log.

    The command line and the command's standard output are appended to the
    module-level ``outfile`` between separator lines. If the command exits
    with a non-zero status, a notice is printed, the command and its standard
    error are written to the file named by the module-level
    ``error_filename``, and the script terminates with exit status 1.

    Args:
        cmd: Command line, passed to the shell as a single string.

    Raises:
        SystemExit: With code 1 when the command returns a non-zero status.
    """
    # NOTE(review): cmd runs with shell=True, so any text interpolated into it
    # is interpreted by the shell -- confirm callers only pass trusted values.
    sproc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, encoding='utf8')
    # communicate() drains both pipes while waiting for the child; calling
    # wait() first can deadlock once the child fills an OS pipe buffer.
    out, err = sproc.communicate()

    outfile.write("******************************************************************************************\n")
    outfile.write(cmd + "\n")
    outfile.write(out)
    outfile.write("------------------------------------------------------------------------------------------\n")

    if sproc.returncode:
        print("AN ERROR OCURRED!")
        print("Command: " + cmd)
        print("Error message saved to file " + error_filename)
        with open(error_filename, "w") as errorfile:
            errorfile.write("Command: " + cmd + "\n")
            errorfile.write(err)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing when Python runs without it.
        sys.exit(1)
# Driver: runs every step of the pipeline for the cycle named on the command
# line, logging each step's output under data/mirador/<cycle>/.
if len(sys.argv) < 2:
    # Without this check a missing argument surfaces as a bare IndexError.
    sys.exit("Usage: python " + sys.argv[0] + " <cycle>")
cycle = sys.argv[1]

[components, metadata] = load_components()
all_files = " ".join(metadata)

output_folder = "data/mirador/" + cycle
output_filename = output_folder + "/process.out"
error_filename = output_folder + "/error.out"

print("MAKING MERGED MIRADOR DATASET FOR", cycle, "CYCLE:")

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(output_folder, exist_ok=True)

# outfile stays a module-level name because run_command() writes to it; the
# with-block closes it even when run_command() aborts the script.
with open(output_filename, "w") as outfile:
    # Create (or empty) the error file up front.
    with open(error_filename, "w") as errorfile:
        pass

    print("MERGING METADATA...")
    for comp, xml in components.items():
        cmd = "python mergemeta.py " + xml + " " + cycle + " " + comp + " data/mirador " + output_folder + " varequiv"
        run_command(cmd)

    print("CALCULATING WEIGHTS...")
    cmd = "python makeweights.py " + output_folder + " weights.list weights.csv weights.xml"
    run_command(cmd)

    print("VALIDATING METADATA...")
    for xml in metadata:
        cmd = "python checkmeta.py " + output_folder + "/" + xml
        run_command(cmd)

    print("AGGREGATING DATA...")
    cmd = "python aggregate.py " + output_folder + " " + all_files + " data.tsv"
    run_command(cmd)

    print("CREATING DICTIONARY...")
    outfile.write("CREATING DICTIONARY...\n")
    cmd = "python makedict.py " + output_folder + " " + all_files + " data.tsv dictionary.tsv"
    run_command(cmd)

    print("CREATING GROUPS...")
    outfile.write("CREATING GROUPS...\n")
    cmd = "python makegroups.py " + output_folder + " " + all_files + " groups.xml"
    run_command(cmd)

    print("VALIDATING DATA...")
    cmd = "python checkdata.py " + output_folder + " " + all_files + " data.tsv"
    run_command(cmd)

# Only reached when every step succeeded; run_command() exits on failure.
print(cycle, "DATASET COMPLETED.")
print("Detailed messages saved to file " + output_filename)