---
title: 'Sensing the Earth: CFT tutorial'
output:
  html_document:
    toc: yes
    df_print: paged
  cleanrmd::html_document_clean:
    theme: style-serif
  pdf_document:
    template: /Users/ty/Downloads/Eisvogel-2.0.0/eisvogel.latex
    toc: yes
    toc_depth: 2
    number_sections: true
    keep_tex: yes
---
\let\oldsection\section
\renewcommand\section{\clearpage\oldsection}
```{r, cache=TRUE, warning=FALSE, message=FALSE}
options(knitr.kable.NA = '')
```
# Climate futures for [Haskell Indian Nations University](https://www.haskell.edu)
## Prepare your environment
1. Install Climate Futures Toolbox
```{r, eval=FALSE}
install.packages("cft")
```
2. Load other packages
```{r, warning=FALSE, message=FALSE}
library(tidyverse)
library(tidync)
library(cft)
library(sf)
library(ggplot2)
library(ggthemes)
library(ggpattern)
library(magick)
library(future)
library(forecast)
library(tidytable)
library(janitor)
options(timeout = 600)
library(ggfortify)
library(reticulate)
library(osmdata)
library(osmextract)
library(changepoint)
library(weathermetrics)
library(TSstudio)
library(ggridges)
library(plotly)
library(htmlwidgets)
library(IRdisplay)
library(knitr)
```
```{r, eval=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
conda_update(conda = "auto")
py_install("numpy")
py_install("matplotlib")
py_install("pandas")
py_install("statsmodels")
py_install("osmnx")
py_install("geopandas")
```
### Set your color palette
1. Download the Haskell university logo
```{r, cache=TRUE, warning=FALSE, message=FALSE, fig.cap="This is the Haskell logo that I downloaded to this working directory from the school's website. This is the only element that we haven't retrieved directly through an API."}
seamless_image_filenames <- c(
'Haskell_logo.png'
)
```
![](Haskell_logo.png){width=50%}
2. Sample the colors on that logo to make a custom color palette for our basemap
```{r, cache=TRUE, warning=FALSE, message=FALSE}
our_blue <- "#3A4E8B"
our_yellow <- "#FFD60F"
our_beige <- "#EDEDF0"
our_purple <- "#3E1471"
```
## Finding basic layers in OpenStreetMap
An API (application programming interface) is a structured way to request data directly from a remote server. OpenStreetMap (OSM) exposes its data through such an API, so instead of downloading an entire regional extract we request only the keys, values, and bounding box we need. That is what keeps these downloads small and fast.
### Use plain language to request a bounding box
1. Find the general area on Open Street Map
We use a function from the osmdata package to find a bounding box for our area of interest. This is a nice function for this purpose because it accepts plain-language declarations (e.g. "Lawrence, Kansas" or "Boulder, Colorado") for the location.
You do not need to use this function to define a bounding box. You can define your bounding box from any source; a manual alternative is sketched after the next chunk. The benefit of this method is that it is rather easy and reliable.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
bb <- getbb("Lawrence, Kansas")
bb
```
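You do not have to rely on getbb() at all; any 2 x 2 matrix in the same layout works downstream. Here is a minimal sketch (not evaluated), using approximate coordinates for Lawrence, Kansas that I supply purely for illustration.
```{r, eval=FALSE}
# Hand-build a bounding box in the same matrix layout that getbb() returns:
# rows "x" (longitude) and "y" (latitude), columns "min" and "max".
# The coordinate values are approximate and for illustration only.
bb_manual <- matrix(
  c(-95.34, -95.16,   # x: longitude min, max
     38.90,  39.03),  # y: latitude min, max
  nrow = 2, byrow = TRUE,
  dimnames = list(c("x", "y"), c("min", "max"))
)
bb_manual
```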
If you format your getbb() request a little differently, it will return the more complex polygon for the bounding box.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
bb_sf <- getbb("Lawrence, Kansas", format_out = "sf_polygon")
ggplot(data=bb_sf$multipolygon) + geom_sf(fill=our_beige, color=our_purple) + theme_tufte()
```
2. Find any buildings associated with any University in our "Lawrence, Kansas" bounding box.
#### Using the package osmdata
This request first calls the opq() function, which mounts to the OpenStreetMap database and then queries the "building" key (i.e. all the building footprints) for any building types matching the value "university". This is a representation of the "key" and "value" system that OSM uses to query data. The final step is to convert the OSM output into a spatial format that works well in R, called sf.
```{r, eval=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
library(osmdata)
University_buildings <-
opq(bb) %>%
add_osm_feature(key = "building", value = "university") %>%
osmdata_sf()
```
The output from this request shows a list of multipolygons, polygons, linestrings, and points. Each of these data types has a different storage structure, so we can't look at them all at the same time. Instead, let's start with polygons, which likely represent single building footprints. Printing 'University_buildings$osm_polygons' shows that there are two universities in Lawrence, so we need to filter those results down to only include Haskell buildings.
```{r, eval = FALSE, cache=TRUE, warning=FALSE, message=FALSE}
University_buildings <- University_buildings$osm_polygons
```
#### Using the package osmextract
The package osmextract calls the same OSM API, but it does so in a slightly different way that can make it faster while also requiring a slightly better understanding of what you're looking for. For example, by default osmextract only downloads the first 25 tags as their own columns and clumps the rest into a list that is difficult to read. You can add columns to the API call as I did here (e.g. extra_tags = c("operator")) and it will return that tag as its own column instead of in the list. You can also parse the list yourself, or use osmdata to download a sample and then use osmextract to execute larger downloads. This package usually clumps polygons and multipolygons by default, so you just ask for multipolygons and get both back.
```{r, fig.cap="Buildings labeled 'Haskell' in OSM", cache=TRUE, warning=FALSE, message=FALSE}
library(osmextract)
University_buildings <- oe_get(
place = "Lawrence, Kansas",
layer = "multipolygons",
query = "SELECT * FROM multipolygons WHERE building IN ('university')",
quiet = TRUE,
extra_tags = c("operator")
)
colnames(University_buildings)
```
3. Use the 'operator' column to identify the owner of those buildings and filter down to buildings operated by Haskell Indian Nations University.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
Haskell_university_buildings <- University_buildings %>%
filter(operator == "Haskell Indian Nations University")
Haskell_university_buildings1 <- Haskell_university_buildings[1,] #take the first building (e.g. first row) of the returns
head(Haskell_university_buildings1)
```
The janitor package is useful for performing automated cleaning tasks on your data. Here we remove all of the columns that contain no data to make our dataframe much smaller and easier to read.
```{r, echo=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
boundaries1 <- janitor::remove_empty(as.data.frame(Haskell_university_buildings1), which = "cols")
boundaries1
```
4. Plot our discovered footprint to visually confirm
It looks like we found [Winona Hall](https://www.kansasmemory.org/item/449914) in the OpenStreetMap database. This is how we plot the perimeters associated with it. ![Winona Hall](winona hall haskell.png)
```{r, fig.width=12, fig.height=3, fig.cap="This seems to match.", cache=TRUE, warning=FALSE, message=FALSE}
basemap <- ggplot(data = st_as_sf(boundaries1)) +
geom_sf(fill = our_purple, color=our_yellow) +
geom_sf_text(aes(label = name), size=10, color=our_yellow) +
theme_tufte()
basemap
```
#### OSM in python using osmnx
The python interface for OSM uses a slightly different syntax to write the requests, but it's calling the same OSM api that the R packages call. You still submit a plain-language area-of-interest, a value, and a key. Those three inputs will return a list of points, lines, and polygons for you to use and manipulate.
```{python, eval=TRUE, cache=TRUE, warning=FALSE, message=FALSE}
place_name = "Lawrence, Kansas"
# import osmnx
import osmnx as ox
import geopandas as gpd
import matplotlib.pyplot as plt
# Get place boundary related to the place name as a geodataframe
area = ox.geocode_to_gdf(place_name)
type(area)
# List key-value pairs for tags
tags = {'building': 'university'}
buildings = ox.geometries_from_place(place_name, tags)
buildings.plot()
plt.show()
```
## Build a basemap from OpenStreetMap
### Download all the layers you want to include in your basemap
1. Download the Haskell University footprint
```{r, warning=FALSE, message=FALSE, fig.cap="Buildings labeled 'Haskell' in OSM", cache=TRUE}
amenity_poly <- oe_get(
place = "Lawrence, Kansas",
layer = "multipolygons",
query = "SELECT * FROM multipolygons WHERE amenity IN ('university')",
quiet = TRUE,
extra_tags = c("operator")
)
haskell_poly <- amenity_poly %>%
filter(name =='Haskell Indian Nations University') %>%
st_as_sf()
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=haskell_poly, color=our_yellow, size=0.5) +
geom_sf_pattern(
data = haskell_poly,
size=0,
pattern = 'image',
pattern_type = 'tile',
pattern_scale =0.1,
pattern_filename = seamless_image_filenames
) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
2. Download street vector layers
The street vector is divided into two different downloads in order to create two different objects for coloring in the final figure. This first download will be in the foreground. It includes the larger and faster roadways.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
# the big streets
big_streets_lines <- oe_get(
place = "Lawrence, Kansas",
layer = "lines",
query = "SELECT * FROM lines WHERE highway IN ('motorway', 'trunk', 'primary', 'secondary', 'tertiary')",
quiet = TRUE
)
streets_crop <- big_streets_lines %>%
st_crop(y = c(ymin = bb[2,1], ymax = bb[2,2], xmin = bb[1,1], xmax = bb[1,2]))
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=streets_crop, color=our_yellow, size=0.5) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
The second street download is for the small side streets and footpaths. These lines will be more faint and in the background.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
small_streets <- oe_get(
place = "Lawrence, Kansas",
layer = "lines",
query = "SELECT * FROM lines WHERE highway IN ('residential', 'living', 'unclassified', 'service', 'footway')",
quiet = TRUE
)
small_streets_crop <- small_streets %>%
st_crop(y = c(ymin = bb[2,1], ymax = bb[2,2], xmin = bb[1,1], xmax = bb[1,2]))
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=small_streets_crop, color=our_beige, size=0.5, alpha=0.4) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
3. Download water features.
The water features are first divided into moving and stationary water. We will download the river layer from the waterway key.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
water <- oe_get(
place = "Lawrence, Kansas",
layer = "lines",
query = "SELECT * FROM lines WHERE waterway IN ('river')",
quiet = TRUE
)
```
We divide the water into large and small waterways in the same way we did with the roads. We are interested in making the main river much larger and the remaining waterways collectively smaller. The Kansas River is the largest feature in this map, so we pull it out first.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
Kansas_river_multi <- water %>%
filter(name == "Kansas River") %>%
st_as_sf() %>%
st_crop(y = c(ymin = bb[2,1], ymax = bb[2,2], xmin = bb[1,1], xmax = bb[1,2]))
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=Kansas_river_multi, color=our_blue, size=3, alpha=1) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
After removing the Kansas River, we are left with a number of remaining waterways that are stored as both linestrings and multilinestrings. We need to handle each of those data types individually.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
small_water_lines <- water %>%
filter(name != "Kansas River")%>%
st_as_sf() %>%
st_crop(y = c(ymin = bb[2,1], ymax = bb[2,2], xmin = bb[1,1], xmax = bb[1,2]))
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=small_water_lines, color=our_blue, size=3, alpha=1) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
The stationary water bodies are a subcategory under the key 'natural' and the value 'water'. We ask for the extra column named 'water' to be included in our returned sf table. We can use that column to filter out the lakes and reservoirs as local water bodies.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
# Request all water features using natural:water but also request the water tag be given its own column.
water_body <- oe_get(
place = "Lawrence, Kansas",
layer = "multipolygons",
query = "SELECT * FROM multipolygons WHERE natural IN ('water')",
quiet = TRUE,
extra_tags = c("water") #give water it's own column instead of clumping in supplimentary list
)
water_body_crop <- water_body %>%
filter(water == 'lake' | water == "reservoir") %>%
st_as_sf()
```
```{r ,warning=FALSE, message=FALSE, echo=FALSE}
ggplot() +
geom_sf( data=water_body_crop, color=our_blue, fill=our_blue, size=1, alpha=1) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
### Stack downloaded OSM layers into a final basemap.
This is a special edit to manually shift the bounding box so that it better centers Haskell University in the basemap. Most people will not need this adjustment but may enjoy the ability to micro-adjust their basemap.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
bbb <- bb
bbb[1,1] <- bbb[1,1] - 0.001
bbb[1,2] <- bbb[1,2] + 0.001
bbb[2,1] <- bbb[2,1] - 0.03
bbb[2,2] <- bbb[2,2] + 0.001
xlimit <- bbb[1,]
ylimit <- bbb[2,]
xmid <- xlimit[1] + diff(xlimit) / 2
ratio <- diff(xlimit) / diff(ylimit)
```
This is a long plot call that layers each of the plot elements in order from back to front. There is a section at the end that crops, formats, and annotates the basemap.
```{r, warning=FALSE, message=FALSE, fig.width=12, fig.height=12, fig.cap="This seems to match.", cache=TRUE}
haskel_basemap <- ggplot() +
# plot moving water layers first
geom_sf(data = Kansas_river_multi, alpha = .8,
size = 3, colour = our_blue) +
geom_sf(data = small_water_lines, alpha = .8,
size = 0.5, colour = our_blue) +
# Layer bodies of water over the moving water layers
geom_sf(data = water_body_crop, alpha = 1, fill = our_blue, size=0) +
# Plot small streets in the background with a light fade
geom_sf(data = small_streets_crop, alpha = .6,
size = .1, colour = our_beige) +
# Layer large streets over the top of the small streets with a bold color.
geom_sf(data = streets_crop, alpha = .8,
size = .4, colour = our_yellow ) +
# Layer Haskell university property polygon in the foreground
geom_sf( data=haskell_poly, color=our_yellow, size=1) +
# Fill Haskell property polygon with Haskell logo
geom_sf_pattern(
data = haskell_poly,
size=0,
pattern = 'image',
pattern_type = 'tile',
pattern_scale = 0.06,
pattern_filename = seamless_image_filenames
) +
# set limits on final figure
coord_sf(ylim = ylimit, xlim = xlimit, expand = TRUE) +
# adding labels
annotate(geom = "text", y = bbb[2,1]+ 0.013, x = xmid,
label = "Haskell Indian Nations University", size = 12, colour = our_beige
) +
annotate(geom = "errorbarh", xmin = xlimit[1], xmax = xlimit[2], y = bbb[2,1]+ 0.005,
height = 0, size = 0.5, colour = our_beige) +
annotate(geom = "text", y = bbb[2,1]+ 0.0001, x = xmid,
label = "38°93'88\"N 95°23'29\"W", size = 6,
colour = our_beige) +
# clean out unused elements and set background color
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA))
```
![](All_roads_lead_to_Haskell.png)
### Save the basemap at high resolution for print
```{r, warning=FALSE, message=FALSE, cache=TRUE}
ggsave(haskel_basemap, filename = "All_roads_lead_to_Haskell.png", height = 11, width=8.5,
units="in", dpi=600)
```
## Mount the climate dataset
This dataset is way too big to download to a particular machine. Instead, you mount the analysis-ready data cube (i.e. netCDF) and download only the subset of data that you want to pull.
```{r, warning=FALSE, message=FALSE, echo=FALSE, cache=TRUE}
inputs <- cft::available_data()
times <- inputs$available_times
all_pts <- inputs$src %>%
hyper_filter(lat = lat <= c(bbb[4]+0.05) & lat >= c(bbb[2]-0.05)) %>%
hyper_filter(lon = lon <= c(bbb[3]+0.05) & lon >= c(bbb[1]-0.05)) %>%
hyper_filter(time = times$`Available times` == 44558) %>%
hyper_tibble(select_var = "pr_MRI-CGCM3_r1i1p1_rcp85"
) %>%
st_as_sf(coords = c("lon", "lat"), crs = 4326, agr = "constant")
```
```{r, warning=FALSE, message=FALSE, echo=FALSE, cache=TRUE, fig.cap="Climate data are gridded. This figure shows the distribution of data points stacked at particular spatial locations. You can see that one of the data source points is within the Haskell polygon. To capture that point, we can search for which point is within the polygon or which point is closest to the centroid of the polygon. Distance to centroid is more generalizable."}
climate_grid <- ggplot() +
geom_sf( data=haskell_poly, color=our_yellow, size=0.5) +
geom_sf_pattern(
data = haskell_poly,
size=0,
pattern = 'image',
pattern_type = 'tile',
pattern_scale = 1.1,
pattern_filename = seamless_image_filenames
) +
geom_sf(data = all_pts, color=our_yellow) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA)) +
coord_sf(ylim = ylimit, xlim = xlimit, expand = TRUE)
ggsave(climate_grid, filename = "climate_grid.png", height = 11, width=8.5,
units="in", dpi=600)
```
![](climate_grid.png)
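The figure caption above mentions two ways to pick a climate point: take the gridded point that falls inside the Haskell polygon, or take the point nearest the polygon's centroid. We use the centroid approach below because it generalizes to polygons that do not contain a grid point. For completeness, here is a minimal sketch (not evaluated) of the within-polygon alternative, where all_pts is the gridded climate point layer assembled for the figure above.
```{r, eval=FALSE}
# Keep only the gridded climate points that intersect the Haskell polygon.
# st_filter() uses st_intersects() as its default predicate.
pts_within <- st_filter(all_pts, haskell_poly)
pts_within
```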
1. We calculate the centroid of the Haskell polygon, which we will use to measure to the nearest climate data point.
```{r, warning=FALSE, message=FALSE, cache=TRUE}
haskel_centroid <- st_coordinates(st_centroid(haskell_poly))
lat_pt <- haskel_centroid[1,2]
lon_pt <- haskel_centroid[1,1]
```
2. Connect to the web server and activate the proper data dimensions.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
web_link = "https://cida.usgs.gov/thredds/dodsC/macav2metdata_daily_future"
# Change to "https://cida.usgs.gov/thredds/catalog.html?dataset=cida.usgs.gov/macav2metdata_daily_historical" for historical data.
src <- tidync::tidync(web_link)
lons <- src %>% activate("D2") %>% hyper_tibble()
lats <- src %>% activate("D1") %>% hyper_tibble()
```
3. Search through the database of climate prediction points to find which one is closest to our centroid. We then spatially project that chosen point into an sf object.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
known_lon <- lons[which(abs(lons-lon_pt)==min(abs(lons-lon_pt))),]
known_lat <- lats[which(abs(lats-lat_pt)==min(abs(lats-lat_pt))),]
chosen_pt <- st_as_sf(cbind(known_lon,known_lat), coords = c("lon", "lat"), crs = "WGS84", agr = "constant")
```
```{r, cache=TRUE, echo=FALSE,warning=FALSE, message=FALSE}
l <- st_union(st_centroid(haskell_poly),st_as_sf(chosen_pt) ) %>%
st_cast("LINESTRING")
p <- st_union(st_centroid(haskell_poly),st_as_sf(chosen_pt) ) %>%
st_cast("POINT") %>% mutate(label = c("Nearest point in \n climate database","Center of Haskell polygon"))
box <- st_bbox(haskell_poly)
```
```{r, cache=TRUE, warning=FALSE, message=FALSE, echo=FALSE, fig.cap="One of the downscaled climate data points is within Haskell's property. From the centroid of the Haskell polygon, we measure the distance to each gridded climate data source and choose the nearest one."}
grid_match <- ggplot() +
geom_sf( data=haskell_poly, color=our_yellow, size=0.5, fill=our_blue) +
geom_sf(data= l, color=our_yellow, size=3) +
geom_sf(data = chosen_pt, color=our_yellow, size=5) +
geom_sf(data = st_centroid(haskell_poly), color=our_yellow, size=5) +
theme_void() +
theme(panel.background = element_rect(fill = our_purple),
plot.background = element_rect(fill = NA)) +
geom_sf_text(data = st_as_sf(p), aes(label = label), colour = "white", nudge_y=c(0.0013,-0.001), nudge_x = -0.001, size=4)
ggsave(grid_match, filename = "grid_match.png", height = 5, width=4,
units="in", dpi=600)
```
![](grid_match.png)
## Find and transfer climate data from an API
### Mount to the downscaled dataset and transfer metadata.
We cannot download the entire dataset. Instead, we mount to that dataset by connecting to an API that routes us to a particular part of the dataset based on the bounding box we specify.
Mount to the USGS downscaled dataset. The resultant object, called 'inputs', includes three elements. The first is the full list of available data at each timestep. The second is a list of possible time steps. The third is a verbatim copy of the raw return from the server. This raw return shows the dimensions of the data and how many elements are available in each of those dimensions.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Mount to the USGS downscaled dataset.
inputs <- cft::available_data()
```
Element 1. Dataframe of available data and descriptions of each of those variables.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
head(inputs[[1]])
```
A short list of the unique variables you can request (derived from element 1).
```{r, cache=TRUE, warning=FALSE, message=FALSE}
head(unique(inputs[[1]]$Variable))
```
Element 2. Dataframe of available times.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
head(inputs[[2]])
tail(inputs[[2]])
```
### Decide which Variables, Scenarios, and Models you want to request
Your data order needs to include three things: Variable, Scenario, and Model. Your options can be found in the input elements we explored in the previous section.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
input_variables <- inputs$variable_names %>%
filter(Variable %in% c("Maximum Relative Humidity",
"Minimum Relative Humidity",
"Maximum Temperature",
"Minimum Temperature",
"Precipitation",
"Eastward Wind",
"Northward Wind")) %>%
filter(Scenario %in% c( "RCP 8.5")) %>%
filter(Model %in% c(
"Beijing Climate Center - Climate System Model 1.1",
"Beijing Normal University - Earth System Model",
"Canadian Earth System Model 2",
"Centre National de Recherches Météorologiques - Climate Model 5",
"Commonwealth Scientific and Industrial Research Organisation - Mk3.6.0",
"Community Climate System Model 4",
"Geophysical Fluid Dynamics Laboratory - Earth System Model 2 Generalized Ocean Layer Dynamics",
"Geophysical Fluid Dynamics Laboratory - Earth System Model 2 Modular Ocean",
"Hadley Global Environment Model 2 - Climate Chemistry 365 (day) ",
"Hadley Global Environment Model 2 - Earth System 365 (day)",
"Institut Pierre Simon Laplace (IPSL) - Climate Model 5A - Low Resolution",
"Institut Pierre Simon Laplace (IPSL) - Climate Model 5A - Medium Resolution",
"Institut Pierre Simon Laplace (IPSL) - Climate Model 5B - Low Resolution",
"Institute of Numerical Mathematics Climate Model 4",
"Meteorological Research Institute - Coupled Global Climate Model 3",
"Model for Interdisciplinary Research On Climate - Earth System Model",
"Model for Interdisciplinary Research On Climate - Earth System Model - Chemistry",
"Model for Interdisciplinary Research On Climate 5",
"Norwegian Earth System Model 1 - Medium Resolution" )) %>%
pull("Available variable")
head(as.data.frame(input_variables))
```
### Prepare for parallelization
We have requested enough information to exceed our download limit, so we need to implement a 'parallel' approach to get all the data we want. This strategy has each of the cores in your computer act as its own computer, individually making small requests to the server, and then assembles all the little chunks into the final dataset you requested. Here we tell our computer that we are about to send a parallel request and how we want to distribute the tasks we send.
```{r, eval=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
# ask how many cores are available to be farmed out. I subtract one so I still have a core to use for controlling the whole process.
n_cores <- availableCores() - 1
# set plan to take all cores except one.
plan(multisession, workers = n_cores)
```
### Make parallel call to USGS server.
This is the parallel function from the CFT package. It will shuttle all the data you requested, through all the cores you specified, for the latitude and longitude you requested. This took about 45 minutes on my home laptop with fiber internet. Virtual machines usually only have one core. If you're running this in the cloud, you may need to do some special configuration to get this to actually parallelize.
```{r, eval=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
out <- single_point_firehose(input_variables, known_lat, known_lon )
head(out)
```
### Save output
To save time, I have run the API request above and saved the results for future use. There is no reason to run the API request over and over again. It's easy to save the data once they're downloaded and spare yourself the download in the future.
```{r, eval=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
haskell <- out
save(haskell, file = "haskell.RData")
```
### Load saved output
If I have run my API request and saved it to my working directory, then I can load it from there anytime I need. This will save you time as you experiment with different downstream analyses.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
load(file = "haskell.RData")
```
## Organize climate data
Our requested climate data are returned from the API server as two data frames. The first is the columns of data indexed by a time reference number, and the second is a lookup table that translates those time reference numbers into actual dates. We join those tables here to make the data easier to work with. Once joined, we convert the time labels from characters to POSIX. POSIX is a special way of handling time and date data in R. We reorder the columns so that the POSIX data is in the first column. This will make it easy to later create a time series object (ts) that can go into our statistical and forecasting functions. Finally, we print the column names of the final transformed data frame to verify that we have time data in the first column and all the requested data as columns after that.
### Join data with dates
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# make the time output into its own dataframe and change the column name
available_times <- inputs[[2]]
colnames(available_times)[1] <- "time"
# left join the time data into the spatial data
haskell_posix <- haskell %>%
left_join(available_times, by="time")
# convert time format into POSIX, which is a format that deals with all the confusion of time and date formats (e.g. time zones and translation between numbers and word descriptions for time)
haskell_posix$dates <- as.POSIXct(haskell_posix$dates)
class(haskell_posix$dates)
#reorder so that dates are the first column
haskell_posix <- haskell_posix[,c(93,2, 1,3:92)]
colnames(haskell_posix)
```
We imported temperature data, humidity data, and precipitation data. Each of these is handled and modeled in a slightly different way. Here, we'll work through those three data types in sequence and try to draw inference from synthesizing those three pieces of information.
### Temperature
Temperature is the most talked-about component of climate. It's a great indicator of the weather overall, it's a direct output of climate models, and it's a defining characteristic of ecological niches.
#### Minimum temperature
1. Organize data
The climate data we are importing are 'downscaled' from much larger and more complex models. This process of downscaling summarizes those more complicated models. This means that we don't need to calculate our own summary statistics (e.g. mean, minimum, or maximum) because we download those summary statistics instead of the raw data.
We start by filtering our full downloaded data set to create a new data set with only minimum temperatures in it. We ordered one scenario from 18 different climate modeling agencies, so our filtered data set should be 18 columns of data plus geography and time tags. We then use the gather() function to reorganize that table so that we have three columns: dates, variable, and value. This reorganization makes the data easy to convert into a time series (ts) object for later analysis and visualization.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
df_min_temp <- haskell_posix %>%
st_drop_geometry() %>%
select(dates, dates, which(startsWith(colnames(haskell), "tasmin"))) %>%
gather(key = "variable", value = "value", -dates)
df_min_temp <- df_min_temp[which(df_min_temp$value > 200), ]
df_min_temp$variable <- as.character(as.numeric(as.factor(df_min_temp$variable)))
colnames(df_min_temp)
```
2. Plot data
If we plot our filtered minimum temperature data, we see a chaotic mess because all 18 models are represented on the same graph. We probably want to consider all of the models together as a single ensemble instead.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
all_climate_projections <- ggplot(data= df_min_temp, aes(x = dates, y = value, color = variable)) +
geom_line()+
geom_smooth() +
scale_colour_viridis_d(option="A")
all_climate_projections
```
3. Plot as ensemble and fit a generalized additive model (GAM) to those data
If we plot those same data again without the model distinction, we see the ensemble of all 18 climate models. These data were standardized during downscaling, so they are directly comparable now without any more fuss. Notice that these data are in Kelvin. You should always complete all of your analyses in Kelvin and only translate to Celsius or Fahrenheit for final figures for a general audience.
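As a quick check on those units, here is a minimal sketch (not evaluated) that translates a pair of Kelvin values into Fahrenheit with the same weathermetrics helper used in the plots below; the rounded values in the comment are approximate.
```{r, eval=FALSE}
# Translate example Kelvin values into Fahrenheit with the
# weathermetrics helper loaded earlier.
kelvin.to.fahrenheit(c(281, 289))
# roughly 46 F and 61 F
```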
We apply a generalized additive model to the data to see what the basic trend looks like. It looks like we should expect a steady increase in temperature from 281K to 289K between now and 2099. This line doesn't offer much refinement to our existing expectation. Visually, it sits below an area of high data density and it doesn't help us explain any of the seasonal variation we see in the data. This fit makes us want something better.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
ensemble_climate_projections <- ggplot(data= df_min_temp, aes(x = dates, y = kelvin.to.fahrenheit(value))) +
geom_line(color=our_purple)+
geom_smooth(color=our_yellow) + #This applies a GAM (general additive model) to the data to find a trend line.
theme_tufte() +
geom_hline(yintercept=32)+
geom_hline(yintercept=87)+
geom_hline(yintercept=105)
ensemble_climate_projections
```
The horizontal reference lines in the previous plot mark thresholds of interest. Number of days below freezing: each generation will have one less month of freeze. Highest daily lows: each generation experiences an additional 10 days per year with a daily low above 87 degrees F. Three generations see a lot of change; for those trying to accomplish eight-generation planning, modern science can only project less than halfway there.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
df_min_temp_F <- df_min_temp
df_min_temp_F$value <- kelvin.to.fahrenheit(df_min_temp_F$value)
df_min_temp_F$dates <- format(df_min_temp_F$dates, format = "%Y")
below_freezing <- df_min_temp_F %>%
filter(value <=32) %>%
count(dates) %>%
mutate(cold_counts = n/18)
high_lows <- df_min_temp_F %>%
filter(value >=87) %>%
count(dates) %>%
mutate(high_lows_counts = n/18)
scorching_nights <- df_min_temp_F %>%
filter(value >=105) %>%
count(dates) %>%
mutate(scorching_nights_counts = n/18)
ggplot(data = below_freezing, aes(x = dates, y=cold_counts)) +
geom_point(color=our_purple)+
geom_point(data = high_lows, aes(x = dates, y=high_lows_counts),color=our_yellow)+
geom_point(data = scorching_nights, aes(x = dates, y=scorching_nights_counts),color=our_blue)+
theme_tufte() +
ylim(0,100) +
ylab("number of days") +
geom_hline(yintercept=0)+
geom_hline(yintercept=35)
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
min_temp <- ts(df_min_temp[,3], frequency = 365, start = c(2006, 1), end = c(2099,365))
```
4. Decompose the time series using Fourier analysis.
Fourier analyses are a convenient way to decompose repeating waves and are a mainstay of time series analysis. The analysis presented here finds the seasonal harmonic in the data and subtracts that harmonic from the data to show the difference between trend and noise. We start by converting our data into a ts object and pass that ts object to the decompose() function. When we plot that decomposition, we see that the resultant trend line is much more nuanced than our previous fit.
Here is the fit from the classical decompose() model:
```{r, cache=TRUE, warning=FALSE, message=FALSE}
min_temp %>%
decompose() %>%
autoplot() + theme_tufte()
```
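To make the "subtract the harmonic" step concrete, here is a minimal sketch (not evaluated) showing that a seasonally adjusted series is just the original series minus the seasonal component estimated by decompose(); the object names dec and manual_seasadj are my own.
```{r, eval=FALSE}
# Subtract the estimated seasonal harmonic from the original series
# to produce a seasonally adjusted series by hand.
dec <- decompose(min_temp)
manual_seasadj <- min_temp - dec$seasonal
autoplot(manual_seasadj) + theme_tufte()
```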
We can also use a different decomposition model, here STL (seasonal and trend decomposition using loess), which fits the seasonal and trend components with loess smoothers.
Here is the loess (STL) fit:
```{r, cache=TRUE, warning=FALSE, message=FALSE}
min_temp %>%
stl(s.window = "periodic") %>%
autoplot() + theme_tufte()
```
#### Estimate Spectral Density of a Time Series
This is the Fourier spectral breakdown for the decomposition. You can see the strong first harmonic that is easy to pull out, which makes the decomposition of these data go relatively fast for a dataset of this size.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
autoplot(spec.ar(min_temp, plot = FALSE))+ theme_tufte()
```
#### Verify that our use of an additive model was appropriate.
Time series can come in a couple of different flavors. The two that are common in decomposition analyses are additive and multiplicative harmonics. We made the assumption that the temperature increase was additive, and we can validate that assumption here with an autocorrelation function (ACF), which measures the correlation between values of the series separated by a given lag. The results show a gradual decline of the ACF along the time series. This is the result of our one-step-ahead climate predictions that use the previous year to predict the next year. This means that there is strong correlation between adjoining years, but that the correlation degrades in one direction from now into the future. You see that years in the near future are tightly coupled with each other, but that covariance degrades over time, so we have less confidence in the end of the time series than we do about the beginning. A multiplicative relationship would inflate rapidly over time and show an exponential relationship here. You might expect multiplicative relationships in time series of demographic growth, stock trends, or social media 'likes' because those all involve mechanisms that can multiply rather than add. A sketch after the next chunk shows how you could check the multiplicative alternative directly.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
autoplot(acf(min_temp, plot = FALSE))+ theme_tufte()
```
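If you wanted to check the multiplicative alternative directly rather than argue from the ACF, one option is to rerun the classical decomposition under a multiplicative assumption and compare its remainder to the additive version plotted earlier. A minimal sketch (not evaluated):
```{r, eval=FALSE}
# Decompose the same series under a multiplicative assumption and
# compare the remainder component to the additive decomposition above.
min_temp %>%
  decompose(type = "multiplicative") %>%
  autoplot() + theme_tufte()
```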
#### Forecast models
We need to make an important distinction between the different types of forecasting happening in this analysis. Our climate data are forecast into the future using global mechanistic models that simulate the collision of air molecules and the accumulation of gases that change climate and weather patterns over decades. Those models produce the data we download and use as raw data to describe our local future. In the forecasting we're about to do, we're looking for statistical trends contained within those data. There is no natural or environmental mechanism in this forecasting.
The ARIMA forecast is a statistical forecasting method: it fits a model to the seasonally adjusted series produced by the decomposition described above and then calculates an acceptable forecast window based on the predictability of the trend relative to the data.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
seasonally_adjusted_min_temp <- min_temp %>% stl(s.window='periodic') %>% seasadj()
min_plot <- autoplot(seasonally_adjusted_min_temp)
#+
# theme_tufte() +
#geom_smooth(col=our_yellow)
min_plot
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
arima_min_temp <- auto.arima(seasonally_adjusted_min_temp)
arima_min_temp
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Plot the inverse AR/MA roots of the fitted ARIMA model (the "bullseye" unit-circle plot).
bullseye <- autoplot(arima_min_temp)
bullseye
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Diagnostic check: plot the ARIMA residuals, their ACF, and run a Ljung-Box test.
checkresiduals(arima_min_temp)
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Difference the seasonally adjusted series and display it alongside its ACF and PACF.
seasonally_adjusted_min_temp %>% diff() %>% ggtsdisplay(main="") + theme_tufte()
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Changepoint analysis: detect shifts in mean and variance along the seasonally adjusted series.
autoplot(cpt.meanvar(seasonally_adjusted_min_temp), cpt.colour = 'blue', cpt.linetype = 'solid')
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
# Forecast h = 2400 steps ahead from the fitted ARIMA model and plot the result.
arima_min_temp %>% forecast(h=2400) %>% autoplot()
```
#### Maximum temperature
We repeat the same workflow for the maximum temperature data.
```{r, cache=TRUE, warning=FALSE, message=FALSE}
df_max_temp <- haskell_posix %>%
st_drop_geometry() %>%
select(dates, dates, which(startsWith(colnames(haskell), "tasmax"))) %>%
gather(key = "variable", value = "value", -dates)
df_max_temp <- df_max_temp[which(df_max_temp$value > 200), ]
df_max_temp$variable <- as.character(as.numeric(as.factor(df_max_temp$variable)))
#df_min_temp_TS <- as.xts(df_min_temp)
colnames(df_max_temp)
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
ensemble_climate_projections <- ggplot(data= df_max_temp, aes(x = dates, y = kelvin.to.fahrenheit(value))) +
geom_line(color=our_purple)+
geom_smooth(color=our_yellow) + #This applies a GAM (general additive model) to the data to find a trend line.
theme_tufte() +
geom_hline(yintercept=114)+
geom_hline(yintercept=32) +
ylab("temperature (F)")
ensemble_climate_projections
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
df_max_temp_F <- df_max_temp
df_max_temp_F$value <- kelvin.to.fahrenheit(df_max_temp_F$value)
df_max_temp_F$dates <- format(df_max_temp_F$dates, format = "%Y")
beyond_max_temp <- df_max_temp_F %>%
filter(value >=114) %>%
count(dates) %>%
mutate(beyond_max = n/18)
max_below_freezing <- df_max_temp_F %>%
filter(value <=32) %>%
count(dates) %>%
mutate(below_freezing = n/18)
ggplot(data = beyond_max_temp, aes(x = dates, y=beyond_max)) +
geom_point(color=our_purple)+
geom_point(data = max_below_freezing, aes(x = dates, y=below_freezing), color=our_yellow)+
theme_tufte() +
ylim(0,15) +
ylab("number of days")
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
max_temp <- ts(df_max_temp[,3], frequency = 365, start = c(2006, 1), end = c(2099,365))
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
seasonally_adjusted_max_temp <- max_temp %>% stl(s.window='periodic') %>% seasadj()
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
max_temp %>%
stl(s.window = "periodic") %>%
autoplot() + theme_tufte()
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
arima_max_temp <- auto.arima(seasonally_adjusted_max_temp)
arima_max_temp
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
autoplot(cpt.meanvar(seasonally_adjusted_max_temp), cpt.colour = 'blue', cpt.linetype = 'solid')
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
arima_max_temp %>% forecast(h=2400) %>% autoplot()
```
```{r, cache=TRUE, warning=FALSE, message=FALSE}
min_temp_df <- kelvin.to.fahrenheit(as.data.frame(seasonally_adjusted_min_temp))
max_temp_df <- kelvin.to.fahrenheit(as.data.frame(seasonally_adjusted_max_temp))
ggplot(data=min_temp_df, aes(y=x, x=seq(1, length(x)))) +
geom_line(data=min_temp_df, color=our_purple) +
geom_line(data=max_temp_df, color=our_purple) +
geom_smooth(data=min_temp_df,color=our_yellow, alpha=1) +
geom_smooth(data=max_temp_df,color=our_yellow, alpha=1) +
theme_tufte() +
geom_hline(yintercept=114)+
geom_hline(yintercept=32) +
ylab("temperature (F)")
```
### Relative Humidity
```{r, cache=TRUE, warning=FALSE, message=FALSE}
cft_time_series <- function(data, variable){
require(dplyr)
require(ggplot2)
require(ggfortify)
require(changepoint)
require(weathermetrics)
inputs <- cft::available_data()
available_times <- inputs[[2]]
colnames(available_times)[1] <- "time"
# left join the time data into the spatial data
data_posix <- data %>%
left_join(available_times, by="time")
# convert time format into POSIX, which is a format that deals with all the confusion of time and date formats (e.g. time zones and translation between numbers and word descriptions for time)
data_posix$dates <- as.POSIXct(data_posix$dates)
#reorder so that dates are the first column
data_posix <- data_posix[,c(93,2, 1,3:92)]
print("Combined data with verbose dates.")
df <- data_posix %>%
st_drop_geometry() %>%
select(dates,which(startsWith(colnames(data), variable)) ) %>%
gather(key = "variable", value = "value", -dates)
print("regroup data")
plot1 <- ggplot(data= df, aes(x = dates, y = value, color = variable)) +
scale_colour_viridis_d(option="A", alpha = 0.8) +
geom_smooth() +
theme_tufte() +
theme(legend.position = "none")
df_min <- data_posix %>%
st_drop_geometry() %>%
select(dates,which(startsWith(colnames(data_posix), paste0(variable,"min")))) %>%
gather(key = "variable", value = "value", -dates)
df_max <- data_posix %>%
st_drop_geometry() %>%
select(dates,which(startsWith(colnames(data_posix), paste0(variable,"max")))) %>%
gather(key = "variable", value = "value", -dates)
plot2 <- ggplot(data= data_posix, aes(x = dates, y = value) )+
geom_smooth(data= df_min, color=our_purple) +
geom_smooth(data= df_max, color=our_purple) +
theme_tufte()
print("Filtered to seperate out min and max values")
min_ts <- ts(df_min[,3], frequency = 365, start = c(2006, 1), end = c(2099,365))
max_ts <- ts(df_max[,3], frequency = 365, start = c(2006, 1), end = c(2099,365))
print("Converted to time series object")
seasonally_adjusted_min_ts <- min_ts %>% stl(s.window='periodic') %>% seasadj()
seasonally_adjusted_max_ts <- max_ts %>% stl(s.window='periodic') %>% seasadj()
plot3 <- min_ts %>%
stl(s.window = "periodic") %>%
autoplot() + theme_tufte()
plot4 <- max_ts %>%
stl(s.window = "periodic") %>%
autoplot() + theme_tufte()
print("Fit moving window model to max and min")
arima_min_ts <- auto.arima(seasonally_adjusted_min_ts)
arima_min_ts
arima_max_ts <- auto.arima(seasonally_adjusted_max_ts)
arima_max_ts
plot5 <- autoplot(cpt.meanvar(seasonally_adjusted_min_ts), cpt.colour = 'purple', cpt.linetype = 'solid')
plot6 <- autoplot(cpt.meanvar(seasonally_adjusted_max_ts), cpt.colour = 'purple', cpt.linetype = 'solid')
print("Fit arima model to min and max.")
plot7 <- arima_min_ts %>% forecast(h=2400) %>% autoplot()
plot8 <- arima_max_ts %>% forecast(h=2400) %>% autoplot()
print("Statistical Forecast for 20 years.")
min_ts_df <- as.data.frame(seasonally_adjusted_min_ts)
max_ts_df <- as.data.frame(seasonally_adjusted_max_ts)
plot9 <- ggplot(data=min_ts_df, aes(y=x, x=seq(1, length(x)))) +
geom_line(data=min_ts_df, color=our_purple) +
geom_line(data=max_ts_df, color=our_purple) +
geom_smooth(data=min_ts_df,color=our_yellow, alpha=1) +
geom_smooth(data=max_ts_df,color=our_yellow, alpha=1) +
theme_tufte() +
geom_hline(yintercept=100)+
geom_hline(yintercept=0) +
ylab("humidity")