-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathCausality.ecl
275 lines (265 loc) · 13.4 KB
/
Causality.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
IMPORT $ AS HPCC_Causality;
IMPORT HPCC_Causality.Types;
IMPORT ML_Core.Types AS cTypes;
IMPORT HPCC_Causality.internal.cModel;
// Value used as the default for the pwr argument of the methods below.
powerDefault     := 1;

// Short local names for definitions in HPCC_Causality.Types.
// Model and validation.
cModelTyp        := Types.cModel;
ValidationReport := Types.ValidationReport;
// Queries and their results.
ProbQuery        := Types.ProbQuery;
nlQuery          := Types.nlQuery;
nlQueryRslt      := Types.nlQueryRslt;
Distr            := Types.Distribution;
AnyField         := Types.AnyField;
// Causal metrics.
MetricQuery      := Types.MetricQuery;
cMetrics         := Types.cMetrics;
// Discovery and scanning.
DiscResult       := Types.DiscoveryResult;
ScanReport       := Types.ScanReport;

// Short local name for a definition in ML_Core.Types.
NumericField     := cTypes.NumericField;
/**
* Causal Model Module.
*
* Causal level methods require a combination of a causal model, and a dataset.
*
* Methods include:
* - ValidateModel -- Analyze the data against the provided causal model and
* evaluate the degree of correspondence between the two.
* - Causal (or Probabilistic) Query -- This is a superset of probability queries,
* adding support for causal interventions, which simulate the effect on a target
* variable of intervening on one or more variables. See Query for details.
* - Metrics -- Evaluate various causal metrics on designated pairs
* [source, destination] of variables.
* - DiscoverModel -- Utilize a range of causal discovery methods to discover causal relationships
* between variables.
*
* @param mod A causal model in DATASET(cModel) format. The dataset should
* contain only a single record, defining the model.
* @param PS The id of a Probability Space or Subspace containing the
* dataset. This is obtained by <probabilityInstance>.PS, or as returned from
* a probability.SubSpace() call.
*
* @see Types.cModel
* @see ML_Core.Types.NumericField
*
*/
EXPORT Causality(DATASET(cModelTyp) mod, UNSIGNED PS) := MODULE
  // Result of cModel.Init for the supplied model and probability space.
  // It is passed as the final argument to every cModel call in this module.
  SHARED CM := cModel.Init(mod, PS);

  /**
    * Turn a set of query strings into a dataset of nlQuery records.
    *
    * Each record's id is the 1-based position of the query within the set,
    * and the resulting dataset is distributed by id.
    * Shared by Query and QueryDistr so that both number and distribute their
    * queries in exactly the same way.
    *
    * @param queries The query strings to convert.
    * @return One nlQuery record per entry of queries, distributed by id.
    */
  SHARED DATASET(nlQuery) MakeQueryRecs(SET OF STRING queries) := FUNCTION
    // A single-row dataset gives NORMALIZE something to expand COUNT(queries) times.
    oneRow := DATASET([{1}], {UNSIGNED d});
    numbered := NORMALIZE(oneRow, COUNT(queries),
                          TRANSFORM(nlQuery,
                                    SELF.id := COUNTER,
                                    SELF.query := queries[COUNTER]));
    RETURN DISTRIBUTE(numbered, id);
  END;

  /**
    * Validate the causal model relative to the data.
    *
    * The per-test reports produced by cModel.TestModel are rolled up into a
    * single report, which is then scored by cModel.ScoreModel.
    *
    * @param order The largest number of variables to consider at a time.
    *     Higher values lead to exponentially increasing run times, and diminishing
    *     evaluation accuracy. Very large datasets are required in order to evaluate
    *     higher order evaluations. Default = 3 (recommended).
    * @param pwr Power. The thoroughness to be used in conditionalizing on variables.
    *     Allows a tradeoff between run-time and certainty of discrimination. Power
    *     = 1 is sufficient to distinguish linear relationships, where higher numbers
    *     are needed to distinguish subtle non-linear relationships. Range [1,100].
    *     For practical purposes, power > 5 should not be needed. Default = 1.
    * @param sensitivity Passed unchanged to cModel.TestModel. Default = 10.
    *     NOTE(review): presumably the sensitivity of dependence detection, as
    *     described for DiscoverModel -- confirm against cModel.TestModel.
    * @return A detailed validation report in Types.ValidationReport format.
    * @see Types.ValidationReport
    */
  EXPORT ValidationReport ValidateModel(UNSIGNED order=3, REAL pwr=powerDefault, REAL sensitivity=10) := FUNCTION
    // Fold two partial reports into one by summing counts and concatenating messages.
    ValidationReport mergeReports(ValidationReport acc, ValidationReport nxt) := TRANSFORM
      // If the incoming record carries empty per-type sets, treat them as four zeros.
      nxtTests := IF(COUNT(nxt.NumTestsPerType) > 0, nxt.NumTestsPerType, [0,0,0,0]);
      nxtErrs := IF(COUNT(nxt.NumErrsPerType) > 0, nxt.NumErrsPerType, [0,0,0,0]);
      nxtWarns := IF(COUNT(nxt.NumWarnsPerType) > 0, nxt.NumWarnsPerType, [0,0,0,0]);
      // Placeholder only: the real confidence is assigned after the rollup.
      SELF.confidence := 0.0;
      SELF.NumTotalTests := acc.NumTotalTests + nxt.NumTotalTests;
      SELF.NumTestsPerType := [acc.NumTestsPerType[1] + nxtTests[1],
                               acc.NumTestsPerType[2] + nxtTests[2],
                               acc.NumTestsPerType[3] + nxtTests[3],
                               acc.NumTestsPerType[4] + nxtTests[4]];
      SELF.NumErrsPerType := [acc.NumErrsPerType[1] + nxtErrs[1],
                              acc.NumErrsPerType[2] + nxtErrs[2],
                              acc.NumErrsPerType[3] + nxtErrs[3],
                              acc.NumErrsPerType[4] + nxtErrs[4]];
      SELF.NumWarnsPerType := [acc.NumWarnsPerType[1] + nxtWarns[1],
                               acc.NumWarnsPerType[2] + nxtWarns[2],
                               acc.NumWarnsPerType[3] + nxtWarns[3],
                               acc.NumWarnsPerType[4] + nxtWarns[4]];
      SELF.Errors := acc.Errors + nxt.Errors;
      SELF.Warnings := acc.Warnings + nxt.Warnings;
    END;

    partials := cModel.TestModel(order, pwr, sensitivity, CM);
    // The TRUE condition rolls every record into a single combined report.
    merged := ROLLUP(partials, TRUE, mergeReports(LEFT, RIGHT));
    mergedRec := merged[1];
    modelScore := cModel.ScoreModel(mergedRec.NumTestsPerType,
                                    mergedRec.NumErrsPerType,
                                    mergedRec.NumWarnsPerType, CM);
    // Replace the placeholder confidence with the computed score.
    scoredReports := PROJECT(merged, TRANSFORM(RECORDOF(LEFT),
                                               SELF.confidence := modelScore,
                                               SELF := LEFT));
    RETURN scoredReports[1];
  END;

  /**
    * Calculate the distributions resulting from a set of Causal Probability Queries.
    *
    * Causal Probability queries are a superset of probability queries that may
    * contain an intervention (i.e. do()) clause.
    *
    * If no do() clause is present, then the results will be the same as a normal
    * probability query.
    *
    * The target portion must specify a bare (unbound) variable, as we are looking
    * for a distribution as a result.
    *
    * Do() clauses are specified within the "given" portion of the query, and
    * can only use equality designation. For example:
    *     'P(A | do(B=1, C=2), D between [-1,3])'
    * This is the probability distribution of A given that D is between -1 and 3,
    * and that we intervened to force the value of B to 1 and the value of C to 2.
    *
    * Interventions simulate the effect of setting a variable or variables
    * to fixed values, while breaking the links from those variables' parents.
    * The distribution of a target variable given the interventions is returned
    * for each query. This is roughly equivalent to performing a randomized
    * study.
    *
    * @param queries A list of queries. Exactly 1 target per query must be specified,
    *     and the target must be unbound (i.e. with no comparator). One or more
    *     interventions can be provided for each variable. Interventions must be
    *     of an exact value (e.g. do(var = value)).
    * @param pwr Accepted for signature compatibility. NOTE(review): this value is
    *     currently not forwarded to cModel.QueryDistr and has no effect here.
    *
    * @return A dataset of Types.Distribution records, sorted by id, where id is
    *     the 1-based position of the corresponding query in queries.
    *
    */
  EXPORT DATASET(Distr) QueryDistr(SET OF STRING queries, REAL pwr=powerDefault) := FUNCTION
    queryRecs := MakeQueryRecs(queries);
    RETURN SORT(cModel.QueryDistr(queryRecs, CM), id);
  END;

  /**
    * Calculate the probabilities or expectations resulting from a set of
    * Causal Probability Queries.
    *
    * Causal Probability queries are a superset of probability queries that may
    * contain an intervention (i.e. do()) specification.
    *
    * If no do() clause is present, then the results will be the same as a normal
    * probability query.
    *
    * Probabilities or expectations may be requested by the query. For example:
    *     'P(A between [1,3] | B < 0)' # The probability that A is between 1 and 3 given
    *                                  # that B is less than zero.
    *     'E(A | B < 0)'               # The expected value of A given that B is less than
    *                                  # zero.
    *
    * Note that for probability queries the target must be "bound" (i.e. include a
    * comparison), while for expectation queries the target must be "unbound" (i.e. a bare
    * variable name).
    *
    * For details of the query syntax, see the README file.
    *
    * Do() clauses are specified within the "given" portion of the query, and
    * can only use equality designation. For example:
    *     'E(A | do(B=1, C=2), D between [-1,3])'
    * This is the expected value of A given that D is between -1 and 3,
    * and that we intervened to force the value of B to 1 and the value of C to 2.
    *
    * Interventions simulate the effect of setting a variable or variables
    * to fixed values, while breaking the links from those variables' parents.
    * This is roughly equivalent to performing a randomized study.
    *
    * @param queries A list of queries. One or more
    *     interventions can be provided for each variable. Interventions must be
    *     of an exact value (e.g. do(var = value)).
    * @param pwr Accepted for signature compatibility. NOTE(review): this value is
    *     currently not forwarded to cModel.Query and has no effect here.
    *
    * @return A dataset of Types.nlQueryRslt records, sorted by id, where id is
    *     the 1-based position of the corresponding query in queries.
    *
    */
  EXPORT DATASET(nlQueryRslt) Query(SET OF STRING queries, REAL pwr=powerDefault) := FUNCTION
    queryRecs := MakeQueryRecs(queries);
    RETURN SORT(cModel.Query(queryRecs, CM), id);
  END;

  /**
    * Calculate a set of causal metrics from a designated source variable to a designated
    * destination variable.
    *
    * The following metrics are produced for each source / destination pair:
    * - Average Causal Effect (ACE) -- The average effect on the destination variable of
    *     a unit intervention on the source variable.
    * - Controlled Direct Effect (CDE) -- The direct effect on the destination variable of
    *     a unit intervention on the source variable.
    * - Indirect Effect (IE) -- The indirect effect (i.e. via intermediate variables) on
    *     the destination variable of a unit intervention on the source variable.
    *
    * @param queries A list of queries, each with two targets [source, destination], and
    *     no conditions or interventions. Targets should be unbound (i.e. no args).
    * @param pwr Power, passed unchanged to cModel.Metrics. Default = 1.
    *
    * @return Dataset of cMetrics records, sorted by id, one per query, with id
    *     corresponding to the id of the original query.
    *
    */
  EXPORT DATASET(cMetrics) Metrics(DATASET(MetricQuery) queries, REAL pwr=powerDefault) := FUNCTION
    queriesById := DISTRIBUTE(queries, id);
    metricRecs := cModel.Metrics(queriesById, pwr, CM);
    RETURN SORT(metricRecs, id);
  END;

  /**
    * Analyze the data to estimate the causal relationships between variables.
    *
    * @param vars A set of variable names among which to discover relationships. If omitted,
    *     will use all variables in dataset.
    * @param pwr The power to use for statistical queries. Range [1, 100]. The higher the power,
    *     the more accuracy, but the longer the runtime. Power=1 suffices for linear relationships.
    *     Power > 10 is not recommended due to very long runtimes. Default = 1.
    * @param sensitivity The sensitivity of dependence detection to use. Range 1.0 - 10.0.
    *     Default is 10 (Maximum Sensitivity). It can be useful to reduce sensitivity in
    *     real-world datasets, to restrict the number of relationships found.
    * @param depth Determines how many simultaneous conditional variables will be evaluated.
    *     Default = 2. Values above 3 may be problematic due to long run times, and possibly
    *     exceeding the sensitivity of the instruments.
    * @return A DATASET(DiscResult) with a single record representing the results
    *     of the discovery.
    * @see Types.DiscoveryResult
    */
  EXPORT DATASET(DiscResult) DiscoverModel(SET OF STRING vars=[], REAL pwr=powerDefault, REAL sensitivity=10, UNSIGNED depth=2) := FUNCTION
    RETURN cModel.DiscoverModel(vars, pwr, sensitivity, depth, CM);
  END;

  /**
    * This function is Deprecated. Use DiscoverModel instead.
    * Analyze the data to estimate the causal relationships between variables.
    *
    * Produces information that is useful for understanding the variables' relationships,
    * and attempts to build a full causal model.
    * Discovery is done hierarchically, first determining "clusters" that share a common
    * set of exogenous variables. Then each cluster is analyzed for topology, and finally,
    * the inter-cluster relationships are estimated.
    *
    * Note that this function does not use the model information supplied to the module
    * except for a list of variable names. It, rather, produces an estimate of the model
    * that generated the data.
    *
    * @param pwr The power to use for statistical queries. Range [1, 100]. The higher the power,
    *     the more accuracy, but the longer the runtime. Power=1 suffices for linear relationships.
    *     Power > 10 is not recommended due to very long runtimes. Default = 1.
    *
    * @return A DATASET(ScanReport) with a single record representing the results
    *     of the discovery.
    * @see Types.ScanReport
    *
    */
  EXPORT DATASET(ScanReport) ScanModel(REAL pwr=powerDefault) := FUNCTION
    RETURN cModel.ScanModel(pwr, CM);
  END;
END;