-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathTypes.ecl
440 lines (418 loc) · 15.5 KB
/
Types.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
IMPORT ML_Core.Types AS mlcTypes;
NumericField := mlcTypes.NumericField;
/**
* Module provides all common record types for the Causality Bundle
*/
EXPORT Types := MODULE
  /**
   * AnyField record layout.
   *
   * AnyField extends NumericField to handle textual as well as numeric data.
   * The textVal field is added, which overrides the val field if the value is
   * textual.
   * @see ML_Core.Types.NumericField
   * @field textVal -- The string value of the field. Defaults to the empty
   *                   string.
   */
  EXPORT AnyField := RECORD(NumericField)
    STRING textVal := '';
  END;

  /**
   * Natural Language Query.
   *
   * Supports probability queries in a simple format.
   * @field id A unique id for this query.
   * @field query A string representing the probability query e.g. 'P(A=1)'.
   */
  EXPORT nlQuery := RECORD
    UNSIGNED id;
    STRING query;
  END;

  /**
   * Result of a causal query. Includes the original query in addition
   * to the AnyField result.
   *
   * @field query The original query that produced this result.
   */
  EXPORT nlQueryRslt := RECORD(AnyField)
    STRING query;
  END;

  /**
   * Record layout for Probability Query parameters.
   *
   * Various forms are supported:
   * 1) Variable Name alone (i.e. Unbound variable)
   * 2) Variable Name and One Argument (i.e. Val = arg1)
   * 3) Variable Name and Two Arguments (i.e. arg1 <= Val <= arg2)
   * 4) Variable Name and an Enumerated Set of Values (i.e. Val in Set(args))
   *    For discrete variables only.
   * 5) Variable Name and one or more string arguments
   *    (i.e. Val in Set(strArgs)). For text-based variables.
   *
   * @field VarName -- The variable name.
   * @field Args -- The arguments for the query spec. Any number of arguments
   *                may be provided depending on the context (for Forms 1-4
   *                above). Defaults to the empty set.
   * @field strArgs -- A set of string values (for Form 5 above only).
   *                Defaults to the empty set.
   * @field isEnum -- Boolean field. If true, then Args will be treated as
   *                Form 4 above. To avoid confusion between Form 3 and Form 4
   *                with two values enumerated. Defaults to FALSE.
   */
  EXPORT ProbSpec := RECORD
    STRING VarName;
    SET OF REAL Args := [];
    SET OF STRING strArgs := [];
    BOOLEAN isEnum := FALSE;
  END;

  /**
   * Record Layout for Probability Queries.
   *
   * Also used for Causality Queries (i.e. Interventional, Counterfactual)
   * General form for Probability Queries:
   * - P(target | conditions) -- e.g., P(Y=1 | X1=1.5, X2=-.3, .5 <= X3 <= 1.0)
   * - E(target | conditions)
   * - distr(target | conditions)
   *
   * @field id Unique id for each query, used to correlate results.
   * @field target The target of the query (e.g. 'Y', 'Y'=1, .5 <= 'Y' <= 1.0)
   * @field conditions The set of conditions to apply to the target query
   *           (e.g. ['X1', 'X2'=1, .5 <= 'X3' <= 1.0]). Defaults to empty set
   *           meaning no conditions.
   * @field interventions The set of interventions for causal (interventional)
   *           queries. These represent "do()" operations, and must be set
   *           to exact values (e.g. 'X1'=1.0). Defaults to empty set.
   * @field counterfacs A set of ProbSpec entries, defaulting to empty set.
   *           NOTE(review): undocumented in the original; presumably the
   *           counterfactual terms of a counterfactual query -- confirm
   *           against the query implementation.
   */
  EXPORT ProbQuery := RECORD
    UNSIGNED id;
    DATASET(ProbSpec) target;
    DATASET(ProbSpec) conditions := DATASET([], ProbSpec);
    DATASET(ProbSpec) interventions := DATASET([], ProbSpec);
    DATASET(ProbSpec) counterfacs := DATASET([], ProbSpec);
  END;

  /**
   * Histogram Entry
   *
   * Represents one bin of a discretized probability histogram.
   *
   * @field Min The minimum value for this bin.
   * @field Max The maximum value for this bin. Values
   *           within this bin fall into the interval [Min, Max).
   *           For discrete variables, Min and Max will both equal the
   *           discrete value.
   * @field P The probability that the random variable will take on a value
   *           within this bin.
   */
  EXPORT HistEntry := RECORD
    REAL Min;
    REAL Max;
    REAL P;
  END;

  /**
   * Child dataset of Distribution to hold the mapping between string values
   * and their numeric equivalent.
   *
   * @field numVal The numeric equivalent of the string value.
   * @field strVal The string value.
   */
  EXPORT StrValEntry := RECORD
    UNSIGNED numVal;
    STRING strVal;
  END;

  /**
   * Input to a causal metric query.
   *
   * @field id A unique id for this query.
   * @field cause The name of the causal variable for the query.
   * @field effect The name of the effect variable for the query.
   */
  EXPORT MetricQuery := RECORD
    UNSIGNED id;
    STRING cause;
    STRING effect;
  END;

  /**
   * Record to represent the Distribution of a single random variable.
   *
   * Values are discretized. For discrete variables, there will be as
   * many bins as the cardinality of the variable. For continuous
   * variables, the number of bins is determined automatically based
   * on the number of observations. Datasets with more observations
   * are discretized more finely than smaller datasets.
   *
   * @field id Identifier for the given requested distribution.
   *           Matches the id of the corresponding request.
   * @field query A representation of the query in (near) standard
   *           Pearl notation. Format:
   *           Distr<counterfactual>(target | conditions, do(interventions))
   *           The fields: counterfactual, conditions and
   *           do(interventions) may or may not appear in any given query.
   *           Angle brackets <> are used in place of subscripting as in
   *           Pearl notation.
   * @field nSamples The number of samples upon which the distribution
   *           is based.
   * @field isDiscrete Boolean is TRUE if this is a discrete variable,
   *           otherwise FALSE.
   * @field isCategorical Boolean. NOTE(review): undocumented in the
   *           original; presumably TRUE if this is a categorical
   *           variable -- confirm.
   * @field minVal The minimum observed value of the variable.
   * @field maxVal The maximum observed value of the variable.
   * @field Mean The sample mean of the variable.
   * @field StDev The sample standard deviation of the variable.
   * @field Skew The sample skew of the variable.
   * @field Kurtosis The sample excess kurtosis of the variable.
   * @field Median The median sample value of the variable.
   * @field Mode The most common value of the variable. For
   *           continuous variables, this is the midpoint of
   *           the bin containing the most samples.
   * @field isBounded Set of booleans. NOTE(review): undocumented in the
   *           original; semantics and element order to be confirmed.
   * @field bounds Set of reals. NOTE(review): undocumented in the
   *           original; presumably pairs with isBounded -- confirm.
   * @field Modality Unsigned value. NOTE(review): undocumented in the
   *           original; semantics to be confirmed.
   * @field Histogram The set of discretized bins representing
   *           the distribution's PDF.
   * @field Deciles The Deciles of the variable's distribution
   *           From 10 to 90.
   * @field StringVals The mapping between string values and their
   *           numeric equivalent (see StrValEntry).
   */
  EXPORT Distribution := RECORD
    UNSIGNED id;
    STRING query;
    UNSIGNED nSamples;
    BOOLEAN isDiscrete;
    BOOLEAN isCategorical;
    REAL minVal;
    REAL maxVal;
    REAL Mean;
    REAL StDev;
    REAL Skew;
    REAL Kurtosis;
    REAL Median;
    REAL Mode;
    SET OF BOOLEAN isBounded;
    SET OF REAL bounds;
    UNSIGNED Modality;
    DATASET(HistEntry) Histogram;
    DATASET(HistEntry) Deciles;
    DATASET(StrValEntry) StringVals;
  END;

  /**
   * Enumeration for Random Variable Data Type (see RV below).
   */
  EXPORT DatTypeEnum := ENUM(None=0, Numeric=1, Categorical=2);

  /**
   * Random Variable Record type for causal model representation.
   *
   * @field Name The name of the Random Variable.
   * @field Parents A set of RV Names representing the causal parents
   *           of this variable.
   * @field isObserved Boolean is TRUE if this variable has measurable
   *           data associated with it. Otherwise FALSE. Defaults to TRUE.
   * @field DataType Enumeration of the data type associated with this
   *           variable. Currently only Numeric and Categorical are
   *           supported. Defaults to DatTypeEnum.None.
   */
  EXPORT RV := RECORD
    STRING Name;
    SET OF STRING Parents;
    BOOLEAN isObserved := TRUE;
    DatTypeEnum DataType := DatTypeEnum.None;
  END;

  /**
   * Causal Model Definition Record.
   *
   * @field Name The name of the model.
   * @field Nodes The list of Random Variables that comprise the model.
   *           This must be in the order of the variables in the dataset.
   */
  EXPORT cModel := RECORD
    STRING Name;
    DATASET(RV) Nodes;
  END;

  /**
   * Record to represent a Structural Equation Model (SEM).
   *
   * See Synth/synthTest.ecl for details on use of fields.
   *
   * @field Init An ordered list of statements to be executed once
   *           to do any required variable initialization.
   * @field VarNames An ordered set of variables representing the
   *           output of the SEM. The produced data will follow the
   *           order of the variables in this set.
   * @field EQ An ordered set of equations that will be executed to
   *           generate each observation of the generated dataset.
   *           Equations may refer to variables initialized during Init
   *           processing, or variables set by previous equations.
   */
  EXPORT SEM := RECORD
    SET OF STRING Init;
    SET OF STRING VarNames;
    SET OF STRING EQ;
  END;

  /**
   * Model Validation Report.
   *
   * Shows result of a model validation test.
   * Four types of tests are conducted:
   * - Type 0: Verify all exogenous variables are independent
   *           of one another.
   * - Type 1: Verify expected independencies.
   * - Type 2: Verify expected dependencies.
   * - Type 3: Verify causal direction.
   *
   * @field Confidence The confidence in the model between 0 and 1.
   *           0 implies no confidence. 1 implies perfect confidence.
   * @field NumTotalTests The total number of tests conducted.
   * @field NumTestsPerType An array of four values indicating the
   *           number of tests of each type 0-3 conducted.
   * @field NumErrsPerType An array of four values indicating the
   *           number of errors detected for each test type 0-3.
   * @field NumWarnsPerType An array of four values indicating the
   *           number of warnings detected for each test type 0-3.
   * @field Errors Array of strings describing each error that
   *           occurred.
   * @field Warnings Array of strings describing each warning that
   *           occurred.
   */
  EXPORT ValidationReport := RECORD
    REAL Confidence;
    UNSIGNED NumTotalTests;
    SET OF UNSIGNED NumTestsPerType;
    SET OF UNSIGNED NumErrsPerType;
    SET OF UNSIGNED NumWarnsPerType;
    SET OF STRING Errors;
    SET OF STRING Warnings;
  END;

  /**
   * Record type for the results of a metrics query.
   *
   * @field id The id of the result corresponding to the original id
   *           in the query.
   * @field query A representation of the original query e.g.,
   *           Source -> Destination.
   * @field AveCausalEffect The average causal effect (ACE) of the
   *           source variable on the destination variable.
   * @field ContrDirEffect The controlled direct effect (CDE) of the
   *           source variable on the destination variable.
   * @field IndirEffect The indirect effect (via other variables)
   *           of the source variable on the destination variable.
   */
  EXPORT cMetrics := RECORD
    UNSIGNED id;
    STRING query;
    REAL AveCausalEffect;
    REAL ContrDirEffect;
    REAL IndirEffect;
  END;

  /**
   * Represents a named set along with its members.
   *
   * @field Name The identifier of the set.
   * @field Members A list of unique set member identifiers.
   */
  EXPORT SetMembers := RECORD
    STRING Name;
    SET OF STRING Members;
  END;

  /**
   * Results of the ScanModel function.
   *
   * Provides the information about what was discovered
   * from analyzing the dataset.
   *
   * @field Exos A list of exogenous variables.
   * @field Clusters A list of all of the discovered data cluster names.
   * @field ClustMembers A list of each cluster and its members.
   * @field ClustGraph A list of clusters and the set of parent clusters for
   *           each, representing a Directed Acyclic Graph (DAG) of
   *           cluster-to-cluster relationships.
   * @field VarGraph A list of variables and the set of parents for
   *           each, representing a Directed Acyclic Graph (DAG) of variable
   *           relationships.
   */
  EXPORT ScanReport := RECORD
    SET OF STRING Exos;
    SET OF STRING Clusters;
    DATASET(SetMembers) ClustMembers;
    DATASET(SetMembers) ClustGraph;
    DATASET(SetMembers) VarGraph;
  END;

  /**
   * Results of the DiscoveryModel function.
   *
   * Provides the discovered causal model as a list of edges [cause, effect],
   * and associated metrics.
   *
   * @field causeVar The name of the causal variable in the relationship.
   * @field effectVar The name of the effect variable in the relationship.
   * @field strength The strength of the dependence between the variables.
   * @field correlation The statistical correlation between the variables.
   * @field MDE The Maximum Direct Effect of the cause on the effect variable.
   */
  EXPORT DiscoveryResult := RECORD
    STRING causeVar;
    STRING effectVar;
    REAL strength;
    REAL correlation;
    REAL MDE;
  END;

  /**
   * Child data type for DatasetSummary below. Describes a single variable
   * in the dataset.
   *
   * @field name The name of the variable.
   * @field isDiscrete True if the variable is discrete, otherwise False.
   * @field isCategorical True if the variable is categorical, otherwise
   *           False.
   * @field isTextual True if the variable is a text-based categorical,
   *           otherwise False.
   * @field cardinality The number of unique values which the discrete
   *           variable takes in the dataset.
   * @field numValues The numeric values the discrete variable takes in the
   *           dataset.
   * @field textValues The textual values the textual categorical variable
   *           takes in the dataset.
   */
  EXPORT VarSummary := RECORD
    STRING name;
    BOOLEAN isDiscrete;
    BOOLEAN isCategorical;
    BOOLEAN isTextual;
    UNSIGNED cardinality;
    SET OF REAL numValues;
    SET OF STRING textValues;
  END;

  /**
   * Dataset Summary returned from Probability.Summary.
   *
   * Provides an overview of the dataset.
   * @field numRecords The number of records in the dataset.
   * @field varNames A set of the variable names in the dataset.
   * @field varDetails A set of VarSummary records describing each variable
   *           in the dataset.
   */
  EXPORT DatasetSummary := RECORD
    UNSIGNED numRecords;
    SET OF STRING varNames;
    DATASET(VarSummary) varDetails;
  END;

  /**
   * @internal
   * Internal data type used by visualization.
   */
  EXPORT ChartGrid := RECORD
    UNSIGNED id;
    DATASET(AnyField) gridItem;
  END;

  /**
   * @internal
   * Internal data type used by visualization.
   * The four range fields default to 0.0.
   */
  EXPORT ChartData := RECORD
    UNSIGNED id;
    STRING x_;
    STRING y_;
    STRING z_;
    REAL range1low := 0.0;
    REAL range1high := 0.0;
    REAL range2low := 0.0;
    REAL range2high := 0.0;
  END;

  /**
   * @internal
   * Internal data type used by visualization.
   */
  EXPORT ChartInfo := RECORD
    STRING dataname;
    STRING qtype;
    UNSIGNED dims;
    STRING title;
    STRING xlabel;
    STRING ylabel;
    STRING zlabel;
    REAL mean;
    REAL range1low;
    REAL range1high;
    REAL range2low;
    REAL range2high;
  END;
END;