Fixed downloading bug in USGS
Eli authored and Eli committed Nov 16, 2024
1 parent 5041aec commit c8f422a
Showing 4 changed files with 23 additions and 14 deletions.
16 changes: 11 additions & 5 deletions dms_datastore/download_nwis.py
@@ -211,21 +211,25 @@ def parse_usgs_json(parseinput,outfile,report_empty=False):
     # subloc_yaml = yaml.dump(subloc_dict, default_flow_style=False)

     write_ts_csv(result_df,outfile,site_metadata,chunk_years=False)
-    return unique_qual
+    return result_df


+
+
 def download_station(
     row, dest_dir, start, end, param, overwrite, endfile, successes, failures, skips
 ):

     agency_id = row.agency_id
     station = row.station_id
     param = row.src_var_id
     paramname = row.param
     subloc = row.subloc

     if (station, paramname) in successes:
         return

+
+
     yearname = (
         f"{start.year}_{endfile}"  # if start.year != end.year else f"{start.year}"
@@ -272,18 +276,20 @@ def download_station(
             station_html = response.read().decode().replace("\r", "")
         except:
             station_html = ""  # Catches incomplete read error
-        if len(station_html) > 80 and not "No sites found matching" in station_html:
+        if len(station_html) > 120 and not "No sites found matching" in station_html or "\"timeSeries\":[]" in station_html:
             found = True
-            logger.info(f"Parsing USGS JSON: {path}")
+            logger.info(f"Parsing USGS JSON: {path} param {param}")
             try:
-                parse_usgs_json(station_html,path,report_empty=f"{station} {paramname} ({param})")
+                df = parse_usgs_json(station_html,path,report_empty=f"{station} {paramname} ({param})")
             except Exception as exc:
                 logger.info(f"Parsing of {station} {paramname} ({param}) JSON to csv failed")
                 with open(path, "w") as f:
                     f.write(station_html)
                 _quarantine_file(path)
                 raise
-            successes.add((station, paramname))
+            if df is not None and not df.empty:
+                found = True
+                successes.add((station, paramname))
     if not found:
         logger.debug(f"Station {station} query failed or produced no data")
         if (station, paramname) not in failures:
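Note on the download_nwis.py change: parse_usgs_json now hands back the parsed DataFrame rather than the set of unique qualifiers, and download_station records a (station, paramname) success only when that frame actually has rows. Since `and` binds tighter than `or` in Python, the widened guard reads as "(body is long enough and carries no 'No sites found matching' message) or the body contains an empty "timeSeries":[] array", so empty-series responses still reach the parser and can be reported via report_empty. A minimal sketch of the new gating, assuming parse_usgs_json from this module; the helper name and return convention are illustrative, not the committed code:

from dms_datastore.download_nwis import parse_usgs_json

def record_if_nonempty(station_html, path, station, paramname, param, successes):
    # `and` binds tighter than `or`: (long and no "No sites found") OR empty
    # "timeSeries". Short empty-series bodies still enter the parse branch.
    worth_parsing = (
        len(station_html) > 120
        and "No sites found matching" not in station_html
        or '"timeSeries":[]' in station_html
    )
    if not worth_parsing:
        return False
    df = parse_usgs_json(
        station_html, path, report_empty=f"{station} {paramname} ({param})"
    )
    # The fix: a pair is marked successful only when rows came back, so an
    # empty response no longer suppresses later retries of the same station.
    if df is not None and not df.empty:
        successes.add((station, paramname))
        return True
    return False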
7 changes: 5 additions & 2 deletions dms_datastore/inventory.py
@@ -157,14 +157,17 @@ def repo_data_inventory(fpath,full=True,by="file_pattern"):
     metadf = pd.DataFrame(allmeta)
     metadf['original_filename'] = metadf.filename
     metadf['filename'] = metadf.apply(lambda x: to_wildcard(x.filename,remove_source=True),axis=1)
+    metadf['source'] = metadf['agency']
+
+    metadf.loc[:,'agency'] = station_db.loc[metadf.station_id,'agency'].to_numpy()
     double_year_format = "syear" in metadf.columns

     #meta2 = metadf.groupby(["station_id","subloc","param"]).first()
     if double_year_format:
         # todo: is a groupby necessary for double year format? are there duplicates?
         grouped_meta = metadf.groupby(["station_id","subloc","param"],dropna=False).agg(
             {
-                "agency": lambda ser: reduce(prioritize_source,ser),
+                "agency": ['first'],
                 "agency_id":['first'],
                 "syear":['min'],
                 "eyear":['max'],
@@ -175,7 +178,7 @@ def repo_data_inventory(fpath,full=True,by="file_pattern"):
     else:
         grouped_meta = metadf.groupby(["station_id","subloc","param"],dropna=False).agg(
             {
-                "agency": lambda ser: reduce(prioritize_source,ser),
+                "agency": ['first'],
                 "agency_id":['first'],
                 "year":['min','max'],
                 "filename": ['first'],
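Note on the inventory.py change: the agency string parsed from each filename is preserved in a new source column, agency is then overwritten with the authoritative value from the station database, and, because agency is now uniform within each (station_id, subloc, param) group, the reduce(prioritize_source, ...) aggregator is replaced with a plain 'first'. A self-contained sketch of that pattern on toy data (column values are illustrative, not the repository's schema):

import pandas as pd

# Toy stand-ins for metadf and station_db; values are illustrative only.
metadf = pd.DataFrame({
    "station_id": ["s1", "s1", "s2"],
    "subloc": ["", "", ""],
    "param": ["flow", "flow", "stage"],
    "agency": ["usgs_src", "usgs_src", "dwr_src"],  # as parsed from filenames
    "agency_id": ["111", "111", "222"],
    "year": [1998, 2003, 2010],
    "filename": ["a.csv", "b.csv", "c.csv"],
})
station_db = pd.DataFrame({"agency": ["usgs", "dwr"]}, index=["s1", "s2"])

# Keep the filename-derived value as `source`, then take the authoritative
# agency from the station database, aligned by station_id.
metadf["source"] = metadf["agency"]
metadf.loc[:, "agency"] = station_db.loc[metadf.station_id, "agency"].to_numpy()

# Agency is now uniform per group, so 'first' replaces the custom reducer.
grouped = metadf.groupby(["station_id", "subloc", "param"], dropna=False).agg(
    {"agency": ["first"], "agency_id": ["first"],
     "year": ["min", "max"], "filename": ["first"]}
)
print(grouped)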
13 changes: 7 additions & 6 deletions dms_datastore/populate_repo.py
@@ -438,7 +438,7 @@ def populate(dest, all_agencies=None, varlist=None, partial_update=False):
         for var in varlist:
             if not partial_update:
                 logger.info(
-                    f"Calling populate_repo (1) with agency {agency} variable: {var}"
+                    f"Calling populate_repo (1) with agency {agency} variable: {var} start: 1980-01-01"
                 )
                 populate_repo(
                     agency,
@@ -449,7 +449,7 @@ def populate(dest, all_agencies=None, varlist=None, partial_update=False):
                     ignore_existing=ignore_existing,
                 )
                 logger.info(
-                    f"Calling populate_repo (2) with agency {agency} variable: {var}"
+                    f"Calling populate_repo (2) with agency {agency} variable: {var} start: 2000-01-01"
                 )
                 populate_repo(
                     agency,
@@ -460,9 +460,10 @@ def populate(dest, all_agencies=None, varlist=None, partial_update=False):
                     ignore_existing=ignore_existing,
                 )
                 logger.info(
-                    f"Calling populate_repo (3) with agency {agency} variable: {var}"
+                    f"Calling populate_repo (3) with agency {agency} variable: {var} start: 2020-01-01"
                 )
                 end_download = pd.Timestamp(2039,12,31,23,59) if ((agency == "noaa") and (var == "predictions")) else None
+
                 populate_repo(
                     agency, var, dest, pd.Timestamp(2020, 1, 1), end_download, overwrite=True
                 )
@@ -471,11 +472,11 @@ def populate(dest, all_agencies=None, varlist=None, partial_update=False):
                 os.path.join(dest, f"{agency}*_{var}_*.{ext}")
             )
             logger.info(f"Done with agency {agency} variable: {var}")
-        print(f"Done with agency {agency} for all variables")
+        logger.info(f"Done with agency {agency} for all variables")
         doneagency.append(agency)
-    print("Completed population for these agencies: ")
+    logger.info("Completed population for these agencies: ")
     for agent in doneagency:
-        print(agent)
+        logger.info(agent)


 def purge(dest):
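Note on the populate_repo.py change: the three staged log messages now name their window start dates (1980-01-01, 2000-01-01, 2020-01-01), the progress prints are routed through the logger, and the third window's end is left open except for NOAA tide predictions, which are capped at the end of 2039 so future forecast values are downloaded. A small sketch of just that end-date rule, assuming None means an open-ended download here (the helper name is hypothetical):

import pandas as pd

def third_window_end(agency: str, var: str):
    # NOAA tide predictions extend into the future, so that one series is
    # capped far ahead; everything else downloads open-ended (None).
    return (
        pd.Timestamp(2039, 12, 31, 23, 59)
        if (agency == "noaa") and (var == "predictions")
        else None
    )

assert third_window_end("noaa", "predictions") == pd.Timestamp(2039, 12, 31, 23, 59)
assert third_window_end("usgs", "flow") is None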
1 change: 0 additions & 1 deletion dms_datastore/write_ts.py
@@ -128,7 +128,6 @@ def write_ts_csv(ts,fpath,metadata=None,chunk_years=False,format_version="dwr-dm
             pass
             #print(f"Year already in file name for file {newfname}")
         with open(newfname,'w',newline="\n") as outfile:
-            print(meta_header)
             outfile.write(meta_header)
             tssub.to_csv(outfile,header=True,sep=",",date_format="%Y-%m-%dT%H:%M:%S",**kwargs)
     else:  # not chunk_years