"""Operations on data"""
import math as _math
import string as _string
from abc import ABCMeta as _ABCMeta
from warnings import warn as _warn
import os.path as _path
import inspect as _inspect
import more_itertools as _more_itertools
import fuckit as _fuckit
import pandas as _pd
import numpy as _np
import arcpy as _arcpy
from arcpy.conversion import TableToDBASE, TableToExcel, TableToGeodatabase, TableToSAS, TableToTable, ExcelToTable # noqa
from arcpy.management import MakeAggregationQueryLayer, MakeQueryLayer, MakeQueryTable, CalculateField, CalculateStatistics, DeleteIdentical  # noqa Exposed here as useful inbuilt tools
from arcpy.analysis import CountOverlappingFeatures, SummarizeNearby, SummarizeWithin # noqa
from arcpy.da import Describe
with _fuckit:
from arcproapi.common import release
if int(release()[0]) > 2 and _arcpy.GetInstallInfo()['ProductName'] == 'ArcGISPro':
from arcpy.conversion import ExportTable, ExportFeatures # noqa
import xlwings as _xlwings # never will be implemented on linux
import great_expectations as _gx
import funclite.iolib as _iolib
import funclite.baselib as _baselib
import funclite.stringslib as _stringslib
from funclite.mixins import MixinNameSpace as _MixinNameSpace
import funclite.pandaslib as _pandaslib # noqa
from funclite.pandaslib import df_to_dict_as_records_flatten1 as df_to_dict # noqa Used to convert a standard dataframe into one accepted by field_update_from_dict and field_update_from_dict_addnew
import funclite.pandaslib as pandaslib # noqa no _ as want to expose it to pass agg funcs to ResultsAsPandas instances
import arcproapi.structure as _struct
import arcproapi.mixins as _mixins
import arcproapi.errors as _errors
import arcproapi.crud as _crud
import arcproapi.orm as _orm
import arcproapi.sql as _sql
import arcproapi.decs as _decs
import arcproapi.common as _common
import arcproapi.geom as _geom
# More data-like functions, imported for convenience
from arcproapi.common import get_row_count2 as get_row_count2
from arcproapi.common import get_row_count as get_row_count # noqa
from arcproapi.export import excel_sheets_to_gdb as excel_import_sheets, csv_to_gdb as csv_to_table # noqa Rearranged, import here so as not to break scripts/compatibility
# TODO: All functions that write/delete/update spatial layers need to support transactions using the Editor object - otherwise an error is raised when layers are involved in a topology (and other similar conditions)
# See https://pro.arcgis.com/en/pro-app/latest/arcpy/data-access/editor.htm and https://pro.arcgis.com/en/pro-app/latest/tool-reference/tool-errors-and-warnings/160001-170000/tool-errors-and-warnings-160226-160250-160250.htm
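# A minimal sketch of the Editor pattern referred to above ("gdb" and "fname" are placeholder names):
#   with _arcpy.da.Editor(gdb) as edit:  # open an edit session so topology-participating layers can be written
#       with _arcpy.da.UpdateCursor(fname, ['myfield']) as uc:
#           for row in uc:
#               row[0] = 0
#               uc.updateRow(row)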
_sort_list_set = lambda lst: sorted(list(set(lst)))
class Funcs(_MixinNameSpace, metaclass=_ABCMeta):
"""
Defines some commonly used functions to use in the various field_apply/recalculate methods in this module.
Methods:
    LongToOneZero: After a spatial operation, e.g. union, we get FID_<table_name> cols. This reduces those scenarios to a 1 or a 0. 0 = 0; <Null> = 0; -1 = 0; > 0 = 1
"""
@staticmethod
def ShapeToHash(shape) -> str:
"""
Takes the value in an arcpy Shape@ field (Shape@ should be used to specify the shape col),
and returns a unique hash value as a string.
Args:
shape: the shape returned by asking for Shape@, for example in a searchcursor loop
Returns:
The hashed value
Notes:
This method is defined in geom.py
"""
return _geom.shape_hash(shape)
@staticmethod
def FloatToLong(v: float) -> int:
"""
Scientific round on float, to integer (long datatype in ESRI).
Args:
v (float): the value
Returns:
int: The rounded float as an int
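        Examples:
            Simple rounding; note that Python 3 round() uses banker's rounding on .5 values
            >>> Funcs.FloatToLong(3.7)
            4
            >>> Funcs.FloatToLong(2.5)
            2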
"""
return int(round(v, 0))
@staticmethod
def LongToOneZero(v: int) -> int:
"""
        After a spatial operation, e.g. union, we get FID_<table_name> cols. This reduces those scenarios to a 1 or a 0. 0 = 0; <Null> = 0; -1 = 0; > 0 = 1
Returns: int: 1 or 0
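        Examples:
            Nulls, -1 and negatives all map to 0
            >>> Funcs.LongToOneZero(None)  # noqa
            0
            >>> Funcs.LongToOneZero(-1)
            0
            >>> Funcs.LongToOneZero(12)
            1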
"""
if not v: return 0
if v == -1: return 0
if v < -1: return 0
return 1
@staticmethod
def LongsToOneZeroOr(*args) -> int:
"""
        Pick 1 if any int arg > 0 and not null/None, else 0
Returns:
int: 1 or 0
Examples:
>>> Funcs.LongsToOneZeroOr(12, 0, None)
1
>>> Funcs.LongsToOneZeroOr(-1, 0, None)
0
"""
return 1 if any([Funcs.LongToOneZero(v) for v in args]) else 0
@staticmethod
def LongsToOneZeroAll(*args) -> int:
"""
        Pick 1 if all int args > 0 and not null/None, else 0
Returns:
int: 1 or 0
Examples:
>>> Funcs.LongsToOneZeroAll(12, 1, 2)
1
            >>> Funcs.LongsToOneZeroAll(12, 1, 0)
0
"""
return 1 if all([Funcs.LongToOneZero(v) for v in args]) else 0
@staticmethod
def TextToOneZero(v: str) -> int:
"""
        Reduce text to 1 or 0. Returns 0 if v is not a str instance or is empty, else 1
Returns: int: 1 or 0
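        Examples:
            >>> Funcs.TextToOneZero('hello')
            1
            >>> Funcs.TextToOneZero('')
            0
            >>> Funcs.TextToOneZero(None)  # noqa
            0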
"""
if not isinstance(v, str): return 0
if not v: return 0
return 1
@staticmethod
def PickFirst(*args, default: (str, int, float) = 'Unspecified'):
"""
        Pick the first value that evaluates to True.
Args:
*args: values
default: What to return if no field values evaluate to True
Returns:
The value of the first field to evaluate to true
Examples:
>>> Funcs.PickFirst('', '', 'hello', False)
'hello'
>>> Funcs.PickFirst(False, False, '', None, default=0) # noqa
0
"""
for s in args:
if s: return s
return default
@staticmethod
def PickLong(*args):
"""
Picks the value which is not -1, <null> or 0. Picks the first one if multiple are not -1/null/0
If all args evaluate 0, return 0
Returns: int
Examples:
Get first
>>> Funcs.PickLong(-1, 0, 0, 34, 54)
34
All evaluate to 0 with LongToOneZero
>>> Funcs.PickLong(-1, None, 0)
0
"""
for n in args:
if Funcs.LongToOneZero(n): return n
return 0
@staticmethod
def Area_m2_to_km2(v: (float, None)):
"""
Conversion from m2 to km2. If v evaluates to false, returns 0
Args:
v: value
Returns:
float: area as km2.
Examples:
Behaviour on none/null
>>> Funcs.Area_m2_to_km2(None) # noqa
0
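            A 2,500,000 m2 area
            >>> Funcs.Area_m2_to_km2(2500000)
            2.5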
"""
return v / 1000000 if v else 0
@staticmethod
def ThinnessRationFromShapePlanar(poly: _arcpy.Polygon) -> float:
"""
Thinness ratio.
See https://tereshenkov.wordpress.com/2014/04/08/fighting-sliver-polygons-in-arcgis-thinness-ratio/
Args:
poly: instance of arcpy.Polygon
Returns:
float: the ratio
Notes:
Only supports projected coordinate systems
"""
        # thinness = 4 * pi * area / perimeter ** 2, consistent with ThinnessRatioValues below
        numerator = 4 * _math.pi * poly.getArea('PLANAR')
        denom = poly.length * poly.length
        return numerator / denom
@staticmethod
def ThinnessRatioValues(area: float, perimeter: float, ret_on_none=0) -> float:
"""
Thinness ratio.
See https://tereshenkov.wordpress.com/2014/04/08/fighting-sliver-polygons-in-arcgis-thinness-ratio/
Args:
area: area
            perimeter: perimeter
ret_on_none:
Value to return if area or perimeter are None.
I've seen tools (e.g. Eliminate) return null shapes with corresponding null lengths and areas, this gets over the issue
Returns:
float: the ratio
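        Examples:
            A circle is maximally compact, so a unit circle (area pi, perimeter 2 * pi) scores 1.0
            >>> Funcs.ThinnessRatioValues(_math.pi, 2 * _math.pi)  # noqa
            1.0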
"""
if area is None or perimeter is None: return ret_on_none
numerator = 4 * _math.pi * area
denom = perimeter * perimeter
return numerator / denom
class Excel(_MixinNameSpace): # noqa
"""
Work with excel workbooks.
"""
@staticmethod
def excel_worksheets_get(workbook: str) -> list[str]:
"""
List of worksheets in workbook
Args:
workbook (str): The workbook (xlsx)
Returns:
            list[str]: list of worksheet names
Examples:
>>> Excel.excel_worksheets_get('C:/temp/my.xlsx')
['Sheet1', 'Sheet2']
"""
workbook = _path.normpath(workbook)
fld, fname = _iolib.get_file_parts2(workbook)[0:2]
with _xlwings.App(visible=False) as App:
_ = App.books.open(workbook)
out = [sht.name for sht in App.books[fname].sheets]
return out
@staticmethod
def excel_listobjects_get(workbook: str) -> list[str]:
"""
Get list of listobjects in an excel workbook
Args:
workbook (str): The workbook
Returns:
list[str]: List of listobjects
Examples:
>>> Excel.excel_listobjects_get('C:/temp/my.xlsx')
['Table1', 'Table2']
"""
workbook = _path.normpath(workbook)
fld, fname = _iolib.get_file_parts2(workbook)[0:2]
with _xlwings.App(visible=False) as App:
_ = App.books.open(workbook)
out = [lo.name for lo in _baselib.list_flatten([sheet.tables for sheet in App.books[fname].sheets])]
return out
class ResultAsPandas(_mixins.MixinPandasHelper):
"""
Get the results of any arcpy operation which returns a table or layer, as a pandas dataframe.
Also exposes pandas aggregate functions to summarise the data and some other "helper" methods.
Also has a mechanism to load secondary results sets. These are exposed as a dictionary collection of _LayerDataFrame objects. See the examples.
Args:
        tool: An arcpy tool which supports the in_features argument and a single output feature class (support is being extended).
columns: columns to retain in the resulting dataframe
additional_layer_args (tuple[str]): List of the kwargs that point to additional output tables. This allows additional dataframes to be exposed. As an example, see the CountOverlappingFeatures which accepts an argument to define output_overlap_table.
where: where query to apply to the underlying spatial results table/feature class, passed to data.table_as_pandas2
as_int (tuple[str], list[str]): List of cols forced to an int in the resulting dataframe
as_float (tuple[str], list[str]): List of cols forced to a float in the resulting dataframe
        memory_workspace (str):
            Results are created in a memory workspace.
            ArcGIS Pro supports two memory workspace directives - in_memory and memory.
            in_memory supports more functionality than memory, but it is likely that ESRI will extend "memory" functionality and deprecate "in_memory".
Members:
df (_pd.DataFrame): Dataframe of the output layer
df_lower (_pd.DataFrame): Dataframe of the output layer, all col names forced to lower case
_fname_output (str): Name of the in-memory layer/table output created by execution of "tool"
execution_result (_arcpy.Result): The result object returned from execution of "tool"
Results: A dictionary of all Results, the main result is keyed as "main", with any additional results keyed with the values in the additional_layer_args member.
Raises:
errors.DataUnknownKeywordsForTool: If the arcpy tool does not support in_features or in_dataset keyword arguments
Notes:
Currently assumes that the in_features list is always called in_features in the tool
The arguments "columns", "where", "as_int", "as_float" and "exclude_cols" are all passed to data.table_as_pandas2
Examples:
        Get shape area sum from main results from an arcpy tool (assumes Shape is returned)
>>> RaP = ResultAsPandas(_arcpy.analysis.CountOverlappingFeatures, ['./my.gdb/england', './my.gdb/uk']) # noqa
>>> RaP.shape_area()
334523.1332
Get the additional results table from CountOverlappingFeatures
>>> RaP = ResultAsPandas(_arcpy.analysis.CountOverlappingFeatures, ['./my.gdb/england', './my.gdb/uk'], additional_layer_args=('out_overlap_table',)) # noqa
>>> RaP.Results['out_overlap_table'].df
id cola colb colc
1 'a' 'cc' 5
2 'b' 'dd' 10
2 'b' 'dd' 10
...
Now aggregate on the main result
>>> import numpy as np
>>> RaP.aggregate(['cola'], ['colc'], np.sum) # noqa
cola sum_colc
'a' 5
'b' 20
Export the results to a feature class in a geodatabase, single line call.
>>> from arcpy.conversion import ExportFeatures
>>> ExportFeatures(ResultAsPandas(arcpy.analysis.Clip, 'C:/my.shp', 'C:/clip.shp').result_memory_layer, 'C:/the.gdb/clip_result') # noqa
View the result in excel from CountOverlappingFeatures
>>> arcdata.ResultAsPandas(arcpy.analysis.CountOverlappingFeatures, ['./my.gdb/england', './my.gdb/uk']).view() # noqa
"""
class _LayerDataFrame(_mixins.MixinPandasHelper):
def __init__(self, arg_name: str, fname_output: str, df: _pd.DataFrame = None): # noqa
self.arg_name = arg_name
self.fname_output = fname_output
self._df = None
self.df_lower = None
self.df = df
@property
def df(self):
return self._df
@df.setter
def df(self, df: _pd.DataFrame):
if self._df is None:
self._df = df
if self.df_lower is None:
self.df_lower = df.copy()
self.df_lower.columns = self.df_lower.columns.str.lower()
def __init__(self, # noqa
tool,
in_features: (list[str], str),
columns: list[str] = None,
additional_layer_args: (tuple[str], str) = (),
where: str = None,
exclude_cols: tuple[str] = ('Shape',),
as_int: (tuple[str], list[str]) = (),
as_float: (tuple[str], list[str]) = (),
memory_workspace: str = 'in_memory',
**kwargs):
self._kwargs = kwargs
self._tool = tool
self._in_features = in_features
self.result_memory_layer = memory_lyr_get(memory_workspace)
self._memory_workspace = memory_workspace
self.Results = {}
if additional_layer_args:
if isinstance(additional_layer_args, str): additional_layer_args = (additional_layer_args,)
for s in additional_layer_args:
lyr_tmp = r'%s\%s' % ('in_memory', _stringslib.rndstr(from_=_string.ascii_lowercase))
self.Results[s] = ResultAsPandas._LayerDataFrame(s, lyr_tmp)
kwargs[s] = lyr_tmp
errstr = 'The tool "%s" had unknown keywords. ' % str(tool)
errstr += 'Currently code only accepts tools which support arguments "in_features", "in_dataset" and the "in_polygons in_sum_features" pairing. '
errstr += 'Along with analysis.Near.\n\nThis will need fixing in code.'
keys = dict(_inspect.signature(tool).parameters).keys()
if tool == _arcpy.analysis.Near:
self.execution_result = tool(in_features=in_features, **kwargs)
_arcpy.conversion.ExportFeatures(self.execution_result, self.result_memory_layer)
elif 'in_dataset' in keys:
self.execution_result = tool(in_dataset=in_features, out_dataset=self.result_memory_layer, **kwargs)
elif 'in_features' in keys:
if 'out_table' in keys: # CheckGeometry
self.execution_result = tool(in_features=in_features, out_table=self.result_memory_layer, **kwargs)
elif 'out_feature_class' in keys:
self.execution_result = tool(in_features=in_features, out_feature_class=self.result_memory_layer, **kwargs)
else:
                # Fall back: use the first "out_" keyword for tools not explicitly handled yet
                no_out = True
                for out in keys:
                    if out.startswith('out_'):
                        no_out = False
                        self.execution_result = tool(in_features=in_features, **{out: self.result_memory_layer}, **kwargs)
                        break
if no_out:
raise _errors.DataUnknownKeywordsForTool(errstr)
elif 'in_polygons' in keys and 'in_sum_features' in keys: # arcpy.analysis.SummarizeWithin & SummarizeNearBy
self.execution_result = tool(*in_features, out_feature_class=self.result_memory_layer, **kwargs)
else:
raise _errors.DataUnknownKeywordsForTool(errstr)
self.df = table_as_pandas2(self.result_memory_layer, cols=columns, where=where, exclude_cols=exclude_cols, as_int=as_int, as_float=as_float)
self.df_lower = self.df.copy()
self.df_lower.columns = self.df_lower.columns.str.lower()
        # _LayerDataFrame instances are mutated in place (by reference), so the Results entries pick up their dataframes here
for LDF in self.Results.values():
LDF.df = table_as_pandas2(LDF.fname_output, exclude_cols=('Shape',))
self.Results['main'] = ResultAsPandas._LayerDataFrame('main', self.result_memory_layer, self.df)
def tuple_list_to_table(x, out_tbl, cols, null_number=None, null_text=None):
"""Save a list of tuples as table out_tbl and return catalog path to it.
Args:
x (tuple[tuple], list[list]): list of tuples (no nesting!), can be list of lists or tuple of tuples
out_tbl (str): path to the output table
        cols (list[tuple]): list of tuples defining columns of x. Can be defined as:
[('colname1', 'type1'), ('colname2', 'type2'), ...]
['colname1:type1:lgt1', 'colname2:type2', ('colname3', 'type3')]
[('colname1', 'type1'), 'colname2:type2:lgt2, ...]
where types are case insensitive members of ('SHORT', 'SMALLINTEGER', 'LONG', 'INTEGER', 'TEXT', 'STRING', 'DOUBLE', 'FLOAT')
Each column definition can have third element for length of the field,
e.g.: ('ATextColumn', 'TEXT', 250).
To leave out length, simply leave it out or set to '#'
        null_number (int, None): A value to replace null (None) values in numeric columns, default is None and does no replacement
        null_text (str, None): A value to replace null (None) values in text columns, default is None and does no replacement
Examples:
>>> x_ = [(...),(...),(...),(...),(...), ...]
        >>> tuple_list_to_table(x_, 'c:\\temp\\foo.dbf', [('IDO', 'SHORT'), ('NAME', 'TEXT', 200)])
        >>> tuple_list_to_table(x_, 'c:\\temp\\foo.dbf', ['IDO:SHORT', 'NAME:TEXT:200'])
"""
# decode column names, types, and lengths
cols = [tuple(c.split(":")) if type(c) not in (tuple, list) else c for c in cols]
# remember what indexes to replace if values are null
replace_numbers, replaces_text = [], []
for i in range(len(cols)):
if cols[i][1].upper() in ('TEXT', 'STRING'):
replaces_text.append(i)
else:
replace_numbers.append(i)
do_replace_number = False if null_number is None else True
do_replace_text = False if null_text is None else True
do_replace = do_replace_number or do_replace_text
dname = _path.dirname(out_tbl)
if dname in ('', u''): dname = _arcpy.env.workspace
r = _arcpy.CreateTable_management(dname, _path.basename(out_tbl))
out_tbl = r.getOutput(0)
# add the specified fields
for f in cols:
fname = f[0]
ftype = f[1].upper()
flength = '#'
if len(f) > 2:
flength = int(f[2]) if str(f[2]).isdigit() else '#'
_arcpy.AddField_management(out_tbl, fname, ftype, '#', '#', flength)
# rewrite all tuples
fields = [c[0] for c in cols]
with _arcpy.da.InsertCursor(out_tbl, fields) as ic:
for rw in x:
            if do_replace:
                rw = list(rw)
                # check each column index, replacing nulls with the requested defaults
                for i in range(len(rw)):
                    if rw[i] is not None:
                        continue
                    if do_replace_number and i in replace_numbers:
                        rw[i] = null_number
                    elif do_replace_text and i in replaces_text:
                        rw[i] = null_text
                rw = tuple(rw)
            ic.insertRow(rw)
return out_tbl
def field_values(fname: str, col: (str, list), where: (str, None) = None, order_by: (str, None) = None, distinct: bool = False) -> any:
"""
Return a list of all values in column col in table tbl.
If col is a single column, returns a list of values, otherwise returns
a list of tuples of values where each tuple is one row.
    Columns included in the order_by parameter must be included in the col parameter!
Args:
fname (str): input table or table view
        col (str, list): column name(s) as a string, a csv or semicolon-separated string, or a list
where (str): where clause
order_by (str): order by clause like '"OBJECTID" ASC, "Shape_Area" DESC', default is None, which means order by object id if exists
distinct (bool): unique values only
Returns:
        list(tuple): list of tuples if multiple columns requested
        list: list of values if a single col defined
Raises:
RuntimeError: Columns defined in order_by must be in col, otherwise a RuntimeError is raised
errors.FieldNotFound: If the field or fields do not exist in fname
Examples:
>>> field_values('c:\\foo\\bar.shp', col='Shape_Length')
[1.23,2.34, ..]
        Note: check the values returned when requesting SHAPE@XY
>>> field_values('c:\\foo\\bar.shp', col='SHAPE@XY')
Multiple columns with where and order by clauses
>>> field_values('c:\\foo\\bar.shp', col='OID@;Shape_Length', where='"OBJECTID" < 3', order_by='Shape_Length ASC')
[(1, 2.34), (2, 1.55), ..]
RuntimeError raised, order_by col not in cols
>>> field_values('c:\\foo\\bar.shp', col='SHAPE@XY', order_by='Shape_Length DESC')
Traceback (most recent call last): ....
"""
# unpack column names
if isinstance(col, (list, tuple)):
cols = col
else:
col = str(col)
separ = ';' if ';' in col else ','
cols = [c.strip() for c in col.split(separ)]
# indicate whether one or more than one columns were specified
multicols = False
if len(cols) > 1:
multicols = True
# construct order by clause
if order_by is not None:
order_by = 'ORDER BY ' + str(order_by)
    # This check was added as we get a weird error if a field doesn't exist in an enterprise geodatabase - make the warning more explicit
bad_fields = _baselib.list_not(list(map(str.lower, cols)), list(map(str.lower, _struct.field_list(fname))))
if bad_fields:
raise _errors.FieldNotFound('Field(s) "%s" not found in "%s"' % (bad_fields, fname))
# retrieve values with search cursor
ret = []
with _arcpy.da.SearchCursor(fname, cols, where_clause=where, sql_clause=(None, order_by)) as sc:
for row in sc:
if multicols:
ret.append(row)
else:
ret.append(row[0])
if distinct:
ret = list(set(ret))
return ret
def field_values_multi_table(fnames: list, fields: (str, list), distinct: bool = True):
"""
Get field values from multiple layers/tables.
If distinct is used, a sorted distinct flattened list is returned.
Otherwise, the values are returned as is from the field_values function.
Args:
fnames: list of layers/fnames (is normpathed)
fields:
If fields is a string, then the same field is used for all fnames.
Otherwise the list of fields is passed as is based on index. Note that fields may be nested tuples or list. See the underlying function field_values
distinct: Flattens and returns distinct values across all "cells" read from fnames and fields
Raises:
        ValueError: If the length of "fnames" and "fields" doesn't match, and "fields" was a tuple or list
Returns:
list: Unique list of values
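    Examples:
        Distinct values of the same field across two layers (paths and values are illustrative)
        >>> field_values_multi_table(['C:/my.gdb/england', 'C:/my.gdb/wales'], 'country')  # noqa
        ['England', 'Wales']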
"""
paths = list(map(_path.normpath, fnames))
    if isinstance(fields, str):
fields = [fields] * len(fnames)
if len(fnames) != len(fields):
raise ValueError('The length of "fnames" and "fields" lists must match.')
out = []
    for i, fname in enumerate(paths):
out.extend(field_values(fname, fields[i], distinct=distinct))
if distinct:
return _sort_list_set(_baselib.list_flatten(out))
return out
def field_has_duplicates(fname: str, col: str, f: any = lambda v: v) -> bool:
"""
Check a col for duplicates
Args:
fname (str): table or feature class
col (str): col to check
f: function passed to map, typical use would be f=str.lower to make duplicate checks case insensitive
Returns:
bool: True if duplicates found in field col.
Examples:
        >>> field_has_duplicates('c:/my.gdb/everyones_fave_colour', 'colour', f=str.lower)
True
"""
s = field_get_dup_values(fname, col, value_list_only=True, f=f)
if s:
return True
return False
def field_get_dup_values(fname: str, col: str, value_list_only: bool = False, f: any = lambda v: v):
"""
Check a col for duplicates
Args:
fname (str): Feature class or table
col (str): col to check
value_list_only (bool): Get as simple list rather than dict
f (any): function passed to map, typical use would be f=str.lower to make duplicate checks case insensitive
Returns:
list: Simple list of values which have duplicates
        dict: A dictionary of duplicates. Keys are the duplicate values and values are the counts of duplicate values
Examples:
sum_of_stuff col n, has duplicates for 2 and 10. Asked for simple list
>>> field_get_dup_values('c:/my.gdb/sum_of_stuff', 'n', value_list_only=True)
[2, 10]
        In fave_colour, col colour has 2 occurrences of blue and 5 occurrences of black; ignore case
>>> field_get_dup_values('c:/my.gdb/fave_colour', 'colour', False, f=str.lower)
{'blue':2,'black':5}
"""
fname = _path.normpath(fname)
vals = list(map(f, field_values(fname, col)))
res = _baselib.list_get_dups(vals, value_list_only=value_list_only)
return res
def fields_get_dup_values(fname: str, cols: list, value_list_only: bool = False, sep: str = _stringslib.Characters.Language.emdash, f: any = lambda v: v):
"""
Check a set of columns for duplicate values across those columns.
i.e. Treat the columns as being a compound key
Args:
fname (str): Feature class or table
cols (list, tuple): cols to check
value_list_only (bool): Get as simple list rather than dict
sep: Seperator for the returned field values (defaults to emdash)
f (any): function passed to map, typical use would be f=str.lower to make duplicate checks case insensitive
Returns:
list: Simple list of values which have duplicates
        dict: A dictionary of duplicates. Keys are the duplicate values and values are the counts of duplicate values
Examples:
Error in data, duplicate for Wales and England
>>> fields_get_dup_values('c:/my.gdb/country', ['name', 'continent'], value_list_only=True)
['Wales - Europe', 'England - Europe']
        Ask for dup counts, Wales has two dups, England has 5
        >>> fields_get_dup_values('c:/my.gdb/country', ['name', 'continent'], value_list_only=False)
        {'Wales - Europe': 2, 'England - Europe': 5}
"""
vals = []
splitter = '+_)(*&^]'
fname = _path.normpath(fname)
for col in cols:
        vals.append([str(f(s)) for s in field_values(fname, col)])  # force to a string per column, required for the zip and join on the next line
    concat = [splitter.join(v) for v in zip(*vals)]
def _rep(s):
return s.replace(splitter, sep)
dups: dict = _baselib.list_get_dups(concat) # noqa
if value_list_only:
if not dups: return []
return list(map(_rep, dups.keys()))
if not dups: return {}
return {_rep(k): v for k, v in dups.items()}
def key_info(parent: str, parent_field: str, child: str, child_field: str, as_oids: bool = False) -> dict:
"""
Get a dictionary listing the "primary key" values not in child foreign key values.
This is a quick validation tool. A set of more advanced validation tools is planned
in a dedicated module, integrating arcproapi connection objects with the
great expectations package.
TODO: Support compound keys
Args:
parent (str): Parent entity
parent_field (str): The key field in the parent
child (str): Child entity
child_field (str): key field in the child
as_oids (bool): Get unique oids instead of distinct values
Returns:
dict: dict{'parent_only': [..], 'both': [..], 'child_only': [..]
Notes:
        The OIDs can be used for further processing ...
Examples:
        >>> key_info('C:/my.gdb/countries', 'cname', 'C:/my.gdb/towns', 'cname')
{'parent_only': ['NoTownCountry',...], 'both': ['England',...,], 'child_only': ['NoCountryTown',...]}
Now with oids
        >>> key_info('C:/my.gdb/countries', 'cname', 'C:/my.gdb/towns', 'cname', as_oids=True)
{'parent_only': [232, 343], 'both': [1,2,...], 'child_only': [56,77,...]}
Notes:
This is also exposed in arcapipro.info
"""
parent_values = field_values(parent, parent_field, distinct=True)
child_values = field_values(child, child_field, distinct=True)
if not as_oids:
d = _baselib.list_sym_diff(parent_values, child_values, rename_keys=('parent_only', 'both', 'child_only'))
return d
where = _sql.query_where_in(parent_field, parent_values)
parent_oids = field_values(parent, 'OID@', where=where)
where = _sql.query_where_in(child_field, child_values)
child_oids = field_values(child, 'OID@', where=where)
d = _baselib.list_sym_diff(parent_oids, child_oids, rename_keys=('parent_only', 'both', 'child_only'))
return d
def table_as_pandas(fname, cols=(), where='', null_value=_np.NaN, **kwargs):
"""(str, iter:str, iter:str, str, dict) -> pandas.dataframe
Get a feature layer or table as a pandas dataframe
See https://pro.arcgis.com/en/pro-app/latest/arcpy/data-access/featureclasstonumpyarray.htm
for list of named arguments and special column names
Note:
The Arcpy FeatureClassToNumpyArray is bugged, and will fail on dates with null values.
If this happens, use table_as_pandas2
Parameters
fname (str): path to data
cols (list, tuple): tuple of the columns to retrieve
out_headers (list, tuple): list of header names to use for the returned dataframe
where (str): a where statement to filter results (see https://pro.arcgis.com/en/pro-app/latest/help/mapping/navigation/sql-reference-for-elements-used-in-query-expressions.htm)
kwargs: additionl arguments to pass to arcpy.da.FeatureClassToNumPyArray
Example
>>> df = table_as_pandas('c:/lyr.shp', ('id','contact_name'), "contact_name='John'", skip_nulls=False)
"""
fname = _path.normpath(fname)
c = '*' if not cols else cols
return _pd.DataFrame(_arcpy.da.FeatureClassToNumPyArray(fname, field_names=c, where_clause=where, null_value=null_value, **kwargs))
def pandas_to_table(df: _pd.DataFrame, fname: str, overwrite=False, max_str_len=0, fix_ascii_errors=False):
"""
Import a dataframe into a geodatabase.
    *** This often fails due to codepage errors and similar. Use pandas_to_table2, which gets the job done much more reliably ***
Args:
        df (pandas.DataFrame): the pandas dataframe
        fname (str): table name to write
        overwrite (bool): Overwrite fname, otherwise an error will be raised if fname exists
        max_str_len (int): pandas dataframes usually store strings as objects.
            If max_str_len is 0, then the length of the longest string will be used to convert the object type to a byte field,
            else max_str_len will be used, allowing longer strings to be stored in the created table
        fix_ascii_errors (bool):
            remove non-ascii characters if they exist in object cols
            (work around for bugged arcpy.da.NumPyArrayToTable)
Raises:
RunTimeError: If fname already exists and overwrite is False
Returns: None
Notes:
ArcPy.da.NumPyArrayToTable silently fixes invalid characters in record names,
replacing them with an underscore. e.g. My% will become 'my_'
So advise manually checking column names created in the final export.
Examples:
        >>> df = _pd.DataFrame({'A': ['a', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa']})  # noqa
>>> pandas_to_table(df, 'C:/my.gdb/test') # noqa
"""
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_records.html
# https://numpy.org/doc/stable/reference/arrays.scalars.html#built-in-scalar-types
# https://pro.arcgis.com/en/pro-app/2.8/arcpy/data-access/numpyarraytotable.htm
# https://numpy.org/doc/stable/user/basics.rec.html
fname = _path.normpath(fname)
df_copy = df.copy(deep=True)
if overwrite:
_struct.fc_delete2(fname)
if max_str_len:
s = '|S%s' % max_str_len
else:
s = 'bytes'
for i, t in enumerate(df_copy.dtypes):
if str(t) == 'object': # object, assume string .. ok for now, extend on a case by case basis as expect other stuff will error like dates
if fix_ascii_errors:
df_copy[df_copy.columns[i]] = df_copy[df_copy.columns[i]].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
df_copy[df_copy.columns[i]] = df_copy[df_copy.columns[i]].astype(s)
recarray = df_copy.to_records(index=False)
_arcpy.da.NumPyArrayToTable(recarray, fname)
@_decs.environ_persist
def pandas_to_table2(df: _pd.DataFrame, workspace: str, tablename: str, overwrite=False, del_cols=(), **kwargs):
"""
    Uses a different kludge to get around bugs with pandas non-ascii strings
    and arcpy's bug on importing pandas cols of type object - the default
    col type class for pandas string representations.
Read Notes for caveat with empty string/null handling
Args:
df (pandas.DataFrame): the pandas dataframe
workspace (str): gdb
tablename (str): tablename to create in workspace. Should be the table name only, and not a fully specified path
overwrite (bool): Overwrite fname, otherwise an error will be raised if fname exists
        del_cols (tuple): Tuple of columns to discard, arcpy.conversion.TableToTable can create erroneous cols.
        **kwargs: passed to pandas.to_csv (used as an intermediary to get over bugs in ArcPy)
Raises:
RunTimeError: If fname already exists and overwrite is False
IOError: If the feature class to write exists and arcpy reports it as locked
Returns: None
Notes:
Because this method first exports to csv, then it doesn't differentiate between empty strings and missing data.
This becomes an issue where fields in the underlying data are not nullable.
Blanket solutions would result in numeric fields being interpreted as string data types.
An untested workaround would be to write single quotes to string fields only i.e. ''.
Examples:
>>> df = _pd.DataFrame({'A':['a','aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa']}) # noqa
>>> pandas_to_table2(df, 'C:/my.gdb', 'my_table_name') # noqa
"""
workspace = _path.normpath(workspace)
_arcpy.env.workspace = workspace
_arcpy.env.overwriteOutput = overwrite
tmp_file = _iolib.get_temp_fname(suffix='.csv')
    df = df.replace('"', "'", regex=True)  # regex=True replaces within strings; a plain replace only matches whole cell values (and inplace=True would mutate the caller's frame)
df.to_csv(tmp_file, sep=',', index=False, **kwargs) # noqa
fname = _iolib.fixp(workspace, tablename)
if _common.is_locked(fname):
        raise IOError('%s is locked. Make sure all projects are closed' % fname)
# if overwrite:
# _struct.fc_delete2(fname)
if '/' in tablename or '\\' in tablename:
tablename = _path.basename(tablename)
_arcpy.conversion.TableToTable(tmp_file, workspace, tablename) # noqa
if del_cols:
with _fuckit:
_arcpy.DeleteField_management(fname, del_cols)
_iolib.file_delete(tmp_file)
def table_as_pandas2(fname: str, cols: (str, list) = None, where: str = None, exclude_cols: (str, list) = ('Shape',), as_int: (list, tuple) = (),
as_float: (list, tuple) = (), cols_lower: bool = False, **kwargs):
"""Export data in feature class/table fname to a pandas DataFrame
Args:
fname (str): Path to feature class or table.
cols (str, list, tuple): column list to retrieve.
where (str): where clause, passed as-is to SearchCursor
        exclude_cols (str, list, tuple, None): list of cols to exclude. Excludes Shape by default for performance reasons. Pass None or an empty tuple to NOT exclude any cols
as_int (list, tuple, None): List of cols by name to force to int
as_float (list, tuple, None): list of cols by name to force to float64
cols_lower (bool): force dataframe columns to lowercase
kwargs: keyword args passed to pandas.DataFrame.from_records. nrows is a useful option
Returns:
pandas.DataFrame: The feature class or table converted to a pandas DataFrame
Warnings:
        Raises a warning if as_int and as_float fail to coerce to their respective types.
        A common cause of this is nan/na/None values in the coerced column.
Notes:
        The primary key of fname is used as the dataframe's index.
        For performance reasons and to avoid erroneous errors when viewing the data (e.g. in xlwings), exclude the shape column if not needed.
        The as_int and as_float args allow forcing of cols to int or float; by default this data is interpreted as objects by pandas (read as strings by SearchCursor).
        "where" is applied by the arcpy SearchCursor, hence ignores any type conversions forced by as_int and as_float.
        To avoid warnings and correctly coerce to int and float, a prior call to field_apply_func, using lambda x: 0 if not x or x is None else x,
        will clear up those null values (provided zeros are acceptable).
Examples: