-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnzgoal_audit.py
172 lines (139 loc) · 6.46 KB
/
nzgoal_audit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import xml.etree.ElementTree as etree
from datetime import datetime
from urllib2 import Request, urlopen, URLError
import csv
import sys
import re
from optparse import OptionParser
# SPALSH
print '''
-------------------------------------------------
_ ____ ____ _ _ _ ____ ___ _____
| | | _ \/ ___| / \ | | | | _ \_ _|_ _|
| | | | | \___ \ / _ \| | | | | | | | | |
| |___| |_| |___) | / ___ \ |_| | |_| | | | |
|_____|____/|____/ /_/ \_\___/|____/___| |_|
This utility performs the NZGOAL LDS audit by
comparing the LDS RSS feed with the NZGOAL
google form questionnaire as exported in tsv.
For more information. Please see the geodetic wiki
e.g.
python nzgoal_audit.py --help
python nzgoal_audit.py -F '30/06/15'
-T '1/07/16'
-f './NZ Goal Data.tsv'
-------------------------------------------------
'''
# CHECK PYTHON VERSION
if sys.version_info < (2,3) or sys.version_info > (2,8):
print ''''Only Python 2.4 --> 2.7 is supported.
You are running {0}'''.format(sys.version)
sys.exit(-1)
# RESULT CATEGORIES
# Category codes, listed in the order their buckets are created:
#   pub - publish
#   pwr - publish with restrictions
#   dnp - do not publish
#   nid - no id (lds id not in tsv)
CATEGORY_CODES = ('pub', 'pwr', 'dnp', 'nid')

# One independent, initially empty bucket per category code.
result = dict((code, {}) for code in CATEGORY_CODES)

# Human-readable section heading for each category code.
mappings_for_humans = {
    'nid': 'No Corresponding lds id In Forms Spread Sheet (.tsv):',
    'pwr': 'Publish With Restrictions:',
    'pub': 'Publish:',
    'dnp': 'Do Not Publish:',
}
# GET USER INPUTS
# Every input can be given on the command line; anything missing is prompted
# for interactively, in the order: from-date, to-date, tsv path.
DATE_FORMAT = "%d/%m/%y"

parser = OptionParser()
parser.add_option("-F", "--from-date", action="store", dest="date_from", help="Audit from date (dd/mm/yy)")
parser.add_option("-T", "--to-date", action="store", dest="date_to", help="Audit to date (dd/mm/yy)")
parser.add_option("-f", "--tsv-file", action="store", dest="tsv_file", help="tsv file path")
(options, args) = parser.parse_args()


def _parse_audit_date(text, label):
    """Convert a dd/mm/yy string into a date.

    text  -- raw value from the command line or the interactive prompt
    label -- name of the value, used only in the error message

    A malformed value ends the script through parser.error(), which prints
    the usage text plus the message and exits with status 2, instead of
    surfacing a ValueError traceback.
    """
    try:
        return datetime.strptime(text.strip(), DATE_FORMAT).date()
    except ValueError:
        parser.error('{0} {1!r} is not a valid date, expected dd/mm/yy'.format(label, text))


date_from = _parse_audit_date(
    options.date_from or raw_input("Audit Date From (dd/mm/yy): "),
    'Audit from date')
date_to = _parse_audit_date(
    options.date_to or raw_input("Audit Date To (dd/mm/yy): "),
    'Audit to date')

# An inverted range can never match a dataset; report it rather than
# silently producing an empty audit.
if date_from > date_to:
    parser.error('Audit from date ({0}) is after audit to date ({1})'.format(date_from, date_to))

tsv_file = options.tsv_file or raw_input("tsv file path: ")
# PROCESS FORMS SPREADSHEET
# Read in tsv
with open(tsv_file, 'rb') as tsv:
reader = csv.reader(tsv, delimiter='\t')
header = reader.next()
num_cols = len(header)
# Check an id column was added to tsv as per instructions
if header[0].upper() != 'ID':
print 'EXITING - The first column has not be titled "id" as per the instructions'
sys.exit()
# COMPILE FORM DATA (DICTIONARY)
form_data = {}
for row in reader:
ids = re.sub(' +','',row[0])
# last question answered
pos_last_q = [i for i,x in enumerate(row) if x !=''][-1]
str_last_q = header[pos_last_q]
# iterate over ids in id coloum
for id in ids.split(','):
# Categorise based on last answer
if re.match( r'.*RELEASE\.$', str_last_q):
form_data[id]='pub' #publish
elif re.match( r'(.*release to restricted audience.*|.*obtain the relevant rights.*)', str_last_q):
form_data[id]='pwr' #publish with restrictions
elif re.match( r'(^Do not publish.*)', str_last_q):
form_data[id]='dnp' #do not publish
else:
print '''SCRIPT FAILED WHEN MATCHING TSV HEADER TEXT WITH CODE FOR
FOR CATEGORISATION. \n Has the forms outcomes wording changed?'''
sys.exit()
# COMAPRE LDS RSS DATA TO FORM DATA
print '\nAssessing RSS Data:'
for data_type in ('tables', 'layers'):
status = 200
page = 1
feed = '{http://www.w3.org/2005/Atom}'
while status == 200:
if page % 2 == 0:
print '|',
req = Request('http://data.linz.govt.nz/feeds/{0}?page={1}'.format(data_type, page))
try:
response = urlopen(req)
status = response.getcode()
data = response.read()
tree = etree.fromstring(data)
entries = tree.findall(feed+'entry')
for e in entries:
data_set_date = e.find(feed+'published').text.rsplit('T')[0]
data_set_date = datetime.strptime(data_set_date, "%Y-%m-%d").date()
# Only consider data between user input dates
if data_set_date >= date_from and data_set_date <= date_to:
rss_id = e.find(feed+'id').text #<id>tag:data.linz.govt.nz,2016-09:layers:3452</id>
re_match = re.match( r'(.*layers:)([0-9]*)', rss_id, re.I)
rss_id = re_match.group(2)
#find id in form_data
form_record = form_data.get(rss_id, None)
if form_record:
result[form_record].update({rss_id: {'name' : e.find(feed+'title').text, 'date_pub':e.find(feed+'published').text}})
else :
result['nid'].update({rss_id: {'name' : e.find(feed+'title').text, 'date_pub':e.find(feed+'published').text}})
except URLError, e:
status = e.getcode()
response.close()
page += 1
# PRINT THE RESULTS
print '''
\n>>>RESULTS:\nThe script has found all LDS ids of those public datasets
published between the provided dates.The results are categorised based
on the outcomes of the forms questionnaire, except those that did not find
a matching id in the tsv/ spreadsheet. These are out-putted here under
the section "NO CORRESPONDING LDS ID IN FORMS SPREAD SHEET (.TSV)"
'''
for k, v in result.items():
print '{0}{1}{2}{1}'.format('-'*100,'\n', mappings_for_humans.get(k).upper())
print '{0}{1}{2}'.format('lds_id:', '\tDate Pub:', '\t'*2+'Data Set Name:')
for id, data in v.items():
print '{0}:\t{2}\t{1}'.format(id, data['name'].encode('utf8'), data['date_pub'] )