forked from aaronpenne/get_noaa_ghcn_data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_dly.py
227 lines (197 loc) · 7.22 KB
/
get_dly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# -*- coding: utf-8 -*-
"""
Grabs .dly file from the NOAA GHCN FTP server, parses, and reshapes to have one
day per row and element values in the columns. Writes output as CSV.
Author: Aaron Penne
.dly Format In (roughly): .csv Format Out (roughly):
------------------------- --------------------------
Month1 PRCP Day1 Day2 ... Day31 Day1 PRCP SNOW
Month1 SNOW Day1 Day2 ... Day31 Day2 PRCP SNOW
Month2 PRCP Day1 Day2 ... Day31 Day3 PRCP SNOW
Month2 SNOW Day1 Day2 ... Day31 Day4 PRCP SNOW
Starting with 5 core elements (per README)
PRCP = Precipitation (tenths of mm)
SNOW = Snowfall (mm)
SNWD = Snow depth (mm)
TMAX = Maximum temperature (tenths of degrees C)
TMIN = Minimum temperature (tenths of degrees C)
ICD:
------------------------------
Variable Columns Type
------------------------------
ID 1-11 Character
YEAR 12-15 Integer
MONTH 16-17 Integer
ELEMENT 18-21 Character
VALUE1 22-26 Integer
MFLAG1 27-27 Character
QFLAG1 28-28 Character
SFLAG1 29-29 Character
VALUE2 30-34 Integer
MFLAG2 35-35 Character
QFLAG2 36-36 Character
SFLAG2 37-37 Character
. . .
. . .
. . .
VALUE31 262-266 Integer
MFLAG31 267-267 Character
QFLAG31 268-268 Character
SFLAG31 269-269 Character
------------------------------
"""
import pandas as pd
from ftplib import FTP
from io import StringIO
import os
output_dir = os.path.relpath('output')
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
ftp_path_dly_all = '/pub/data/ghcn/daily/all/'
def connect_to_ftp():
"""
Get FTP server and file details
"""
ftp_path_root = 'ftp.ncdc.noaa.gov'
# Access NOAA FTP server
ftp = FTP(ftp_path_root)
message = ftp.login() # No credentials needed
print(message)
return ftp
def get_flags(s):
"""
Get flags, replacing empty flags with '_' for clarity (' S ' becomes '_S_')
"""
m_flag = s.read(1)
m_flag = m_flag if m_flag.strip() else '_'
q_flag = s.read(1)
q_flag = q_flag if q_flag.strip() else '_'
s_flag = s.read(1)
s_flag = s_flag if s_flag.strip() else '_'
return [m_flag + q_flag + s_flag]
def create_dataframe(element, dict_element):
"""
Make dataframes out of the dicts, make the indices date strings (YYYY-MM-DD)
"""
element = element.upper()
df_element = pd.DataFrame(dict_element)
# Add dates (YYYY-MM-DD) as index on df. Pad days with zeros to two places
df_element.index = df_element['YEAR'] + '-' + df_element['MONTH'] + '-' + df_element['DAY'].str.zfill(2)
df_element.index.name = 'DATE'
# Arrange columns so ID, YEAR, MONTH, DAY are at front. Leaving them in for plotting later - https://stackoverflow.com/a/31396042
for col in ['DAY', 'MONTH', 'YEAR', 'ID']:
df_element = move_col_to_front(col, df_element)
# Convert numerical values to float
df_element.loc[:,element] = df_element.loc[:,element].astype(float)
return df_element
def move_col_to_front(element, df):
element = element.upper()
cols = df.columns.tolist()
cols.insert(0, cols.pop(cols.index(element)))
df = df.reindex(columns=cols)
return df
def dly_to_csv(ftp, station_id):
ftp_filename = station_id + '.dly'
# Write .dly file to stream using StringIO using FTP command 'RETR'
s = StringIO()
ftp.retrlines('RETR ' + ftp_path_dly_all + ftp_filename, s.write)
s.seek(0)
# Write .dly file to dir to preserve original # FIXME make optional?
with open(os.path.join(output_dir, ftp_filename), 'wb+') as f:
ftp.retrbinary('RETR ' + ftp_path_dly_all + ftp_filename, f.write)
# Move to first char in file
s.seek(0)
# File params
num_chars_line = 269
num_chars_metadata = 21
element_list = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']
'''
Read through entire StringIO stream (the .dly file) and collect the data
'''
all_dicts = {}
element_flag = {}
prev_year = '0000'
i = 0
while True:
i += 1
'''
Read metadata for each line (one month of data for a particular element per line)
'''
id_station = s.read(11)
year = s.read(4)
month = s.read(2)
day = 0
element = s.read(4)
# If this is blank then we've reached EOF and should exit loop
if not element:
break
'''
Print status
'''
if year != prev_year:
print('Year {} | Line {}'.format(year, i))
prev_year = year
'''
Loop through each day in rest of row, break if current position is end of row
'''
while s.tell() % num_chars_line != 0:
day += 1
# Fill in contents of each dict depending on element type in current row
if day == 1:
try:
first_hit = element_flag[element]
except:
element_flag[element] = 1
all_dicts[element] = {}
all_dicts[element]['ID'] = []
all_dicts[element]['YEAR'] = []
all_dicts[element]['MONTH'] = []
all_dicts[element]['DAY'] = []
all_dicts[element][element.upper()] = []
all_dicts[element][element.upper() + '_FLAGS'] = []
value = s.read(5)
flags = get_flags(s)
if value == '-9999':
continue
all_dicts[element]['ID'] += [station_id]
all_dicts[element]['YEAR'] += [year]
all_dicts[element]['MONTH'] += [month]
all_dicts[element]['DAY'] += [str(day)]
all_dicts[element][element.upper()] += [value]
all_dicts[element][element.upper() + '_FLAGS'] += flags
'''
Create dataframes from dict
'''
all_dfs = {}
for element in list(all_dicts.keys()):
all_dfs[element] = create_dataframe(element, all_dicts[element])
'''
Combine all element dataframes into one dataframe, indexed on date.
'''
# pd.concat automagically aligns values to matching indices, therefore the data is date aligned!
list_dfs = []
for df in list(all_dfs.keys()):
list_dfs += [all_dfs[df]]
df_all = pd.concat(list_dfs, axis=1, sort=False)
df_all.index.name = 'MM/DD/YYYY'
'''
Remove duplicated/broken columns and rows
'''
# https://stackoverflow.com/a/40435354
df_all = df_all.loc[:,~df_all.columns.duplicated()]
df_all = df_all.loc[df_all['ID'].notnull(), :]
'''
Output to CSV, convert everything to strings first
'''
# NOTE: To open the CSV in Excel, go through the CSV import wizard, otherwise it will come out broken
df_out = df_all.astype(str)
df_out.to_csv(os.path.join(output_dir, station_id + '.csv'))
print('\nOutput CSV saved to: {}'.format(os.path.join(output_dir, station_id + '.csv')))
'''
Main
'''
if __name__ == '__main__':
station_id = 'USR0000CCHC'
ftp = connect_to_ftp()
dly_to_csv(ftp, station_id)
ftp.quit()