Add location based restrictions to call context query (#1166)

* feat: first shot at file filtering for call-context queries
* feat: includeUndefinedFiles defaults to true
* feat-doc: added comments
* feat: fixed issues with file filter in call context queries
* doc-fix: added missing newline
* doc: added doc for file filters in call context queries
flowr-analysis · Nov 21, 2024 · 6017542 · 6017542 · github-actions · Nov 21, 2024
1 parent 242d064
commit 6017542
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 4 deletions.
diff --git a/src/documentation/print-query-wiki.ts b/src/documentation/print-query-wiki.ts
@@ -54,6 +54,13 @@ Besides this, we provide the following ways to automatically categorize and link
    For now, we _only_ offer support for linking to the last call, as the current flow dependency over-approximation is not stable.
 4. **Aliases**      (\`includeAliases\`): Consider a case like \`f <- function_of_interest\`, do you want calls to \`f\` to be included in the results? There is probably no need to combine this with a global call target!
 
+It's also possible to filter the results based on the following properties:
+
+1. **File** (\`fileFilter\`): This allows you to filter the results based on the file in which the call is located. This can be useful if you are only interested in calls in, e.g., specific folders.
+  The \`fileFilter\` property is an object made up of two properties:
+  - **Filter** (\`filter\`): A regular expression that a node's file attribute must match to be considered.
+  - **Include Undefined Files** (\`includeUndefinedFiles\`): If \`fileFilter\` is set, but a node's file attribute is not present, should we include it in the results? Defaults to \`true\`.
+
 Re-using the example code from above, the following query attaches all calls to \`mean\` to the kind \`visualize\` and the subkind \`text\`,
 all calls that start with \`read_\` to the kind \`input\` but only if they are not locally overwritten, and the subkind \`csv-file\`, and links all calls to \`points\` to the last call to \`plot\`:
 
@@ -98,6 +105,7 @@ my_test_function()
 		`;
 	}
 });
+
 registerQueryDocumentation('dataflow', {
 	name:             'Dataflow Query',
 	type:             'active',

diff --git a/src/queries/catalog/call-context-query/call-context-query-executor.ts b/src/queries/catalog/call-context-query/call-context-query-executor.ts
@@ -4,6 +4,7 @@ import type {
 	CallContextQueryKindResult,
 	CallContextQueryResult,
 	CallContextQuerySubKindResult,
+	FileFilter,
 	SubCallContextQueryFormat
 } from './call-context-query-format';
 import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id';
@@ -64,6 +65,10 @@ function promoteQueryCallNames(queries: readonly CallContextQuery[]): { promoted
 				...q,
 				callName: q.callNameExact ? exactCallNameRegex(q.callName)
 					: new RegExp(q.callName),
+				fileFilter: q.fileFilter && {
+					...q.fileFilter,
+					filter: new RegExp(q.fileFilter.filter)
+				},
 				linkTo: {
 					...q.linkTo,
 					/* we have to add another promotion layer whenever we add something without this call name */
@@ -74,7 +79,11 @@ function promoteQueryCallNames(queries: readonly CallContextQuery[]): { promoted
 			return {
 				...q,
 				callName: q.callNameExact ? exactCallNameRegex(q.callName)
-					: new RegExp(q.callName)
+					: new RegExp(q.callName),
+				fileFilter: q.fileFilter && {
+					...q.fileFilter,
+					filter: new RegExp(q.fileFilter.filter)
+				}
 			};
 		}
 	});
@@ -156,6 +165,16 @@ function removeIdenticalDuplicates(collector: TwoLayerCollector<string, string,
 	}
 }
 
+function doesFilepathMatch(file: string | undefined, filter: FileFilter<RegExp> | undefined): boolean { // decides whether a node located in `file` passes the (optional) file filter of a query
+	if(filter === undefined) { // the query has no file filter, so every node passes
+		return true;
+	}
+	if(file === undefined) { // the node carries no file attribute: the filter's flag decides, and an unset flag counts as `true`
+		return filter.includeUndefinedFiles ?? true;
+	}
+	return filter.filter.test(file); // otherwise the file must match the filter's regular expression
+}
+
 /**
  * Multi-stage call context query resolve.
  *
@@ -203,6 +222,11 @@ export function executeCallContextQueries({ graph, ast }: BasicQueryData, querie
 		}
 
 		for(const query of promotedQueries.filter(q => q.callName.test(info.name))) {
+			const file = ast.idMap.get(nodeId)?.info.file;
+			if(!doesFilepathMatch(file, query.fileFilter)) {
+				continue;
+			}
+
 			let targets: NodeId[] | 'no' | undefined = undefined;
 			if(query.callTargets) {
 				targets = satisfiesCallTargets(nodeId, graph, query.callTargets);

diff --git a/src/queries/catalog/call-context-query/call-context-query-format.ts b/src/queries/catalog/call-context-query/call-context-query-format.ts
@@ -11,10 +11,21 @@ import type { PipelineOutput } from '../../../core/steps/pipeline/pipeline';
 import type { DEFAULT_DATAFLOW_PIPELINE } from '../../../core/steps/pipeline/default-pipelines';
 import { CallTargets } from './identify-link-to-last-call-relation';
 
-export interface DefaultCallContextQueryFormat<CallName extends RegExp | string> extends BaseQueryFormat {
+export interface FileFilter<FilterType> {
+	/**
+	 * Regex that a node's file attribute must match to be considered.
+	 * `FilterType` is the regex representation of the enclosing query format (see `DefaultCallContextQueryFormat`, which passes its `RegExp | string` type parameter through).
+	 */
+	readonly filter:                 FilterType;
+	/**
+	 * If `fileFilter` is set, but a node's `file` attribute is `undefined`, should we include it in the results? Defaults to `true`.
+	 */
+	readonly includeUndefinedFiles?: boolean;
+}
+
+export interface DefaultCallContextQueryFormat<RegexType extends RegExp | string> extends BaseQueryFormat {
 	readonly type:            'call-context';
 	/** Regex regarding the function name, please note that strings will be interpreted as regular expressions too! */
-	readonly callName:        CallName;
+	readonly callName:        RegexType;
 	/**
 	 * Should we automatically add the `^` and `$` anchors to the regex to make it an exact match?
 	 */
@@ -32,6 +43,10 @@ export interface DefaultCallContextQueryFormat<CallName extends RegExp | string>
 	 * Consider a case like `f <- function_of_interest`, do you want uses of `f` to be included in the results?
 	 */
 	readonly includeAliases?: boolean;
+	/**
+	 * Filter that, when set, a node's file attribute must match to be considered
+	 */
+	readonly fileFilter?:     FileFilter<RegexType>;
 }
 
 /**
@@ -98,7 +113,11 @@ export const CallContextQueryDefinition = {
 		subkind:        Joi.string().optional().description('The subkind of the call, this can be used to uniquely identify the respective call type when grouping the output (e.g., the normalized name, linking `ggplot` to `plot`). Defaults to `.`'),
 		callTargets:    Joi.string().valid(...Object.values(CallTargets)).optional().description('Call targets the function may have. This defaults to `any`. Request this specifically to gain all call targets we can resolve.'),
 		includeAliases: Joi.boolean().optional().description('Consider a case like `f <- function_of_interest`, do you want uses of `f` to be included in the results?'),
-		linkTo:         Joi.object({
+		fileFilter:     Joi.object({
+			// the key has to be `filter` (not `fileFilter`): this is the property declared by the `FileFilter` interface and the one the executor reads when it promotes the regex
+			filter:                Joi.string().required().description('Regex that a node\'s file attribute must match to be considered'),
+			includeUndefinedFiles: Joi.boolean().optional().description('If `fileFilter` is set, but a nodes `file` attribute is `undefined`, should we include it in the results? Defaults to `true`.')
+		}).optional().description('Filter that, when set, a node\'s file attribute must match to be considered'),
+		linkTo: Joi.object({
 			type:     Joi.string().valid('link-to-last-call').required().description('The type of the linkTo sub-query.'),
 			callName: Joi.string().required().description('Regex regarding the function name of the last call. Similar to `callName`, strings are interpreted as a regular expression.')
 		}).optional().description('Links the current call to the last call of the given kind. This way, you can link a call like `points` to the latest graphics plot etc.')
Benchmark suite	Current: `6017542`	Previous: `2380113`	Ratio
`Retrieve AST from R code`	`246.4932475909091` ms (`106.3225202474029`)	`238.99386745454547` ms (`99.5397110373604`)	`1.03`
`Normalize R AST`	`17.23065140909091` ms (`29.97360345109044`)	`17.344255954545453` ms (`30.98539214696861`)	`0.99`
`Produce dataflow information`	`60.94881086363637` ms (`128.2161781809194`)	`60.94850918181818` ms (`127.79646053550617`)	`1.00`
`Total per-file`	`856.6658115454545` ms (`1556.9435201245303`)	`839.1842053181819` ms (`1529.9296327924758`)	`1.02`
`Static slicing`	`2.0884953465409675` ms (`1.1287373607195745`)	`2.0548017594197465` ms (`1.212534826826947`)	`1.02`
`Reconstruct code`	`0.23941507563013897` ms (`0.17765086167870764`)	`0.23833344722067284` ms (`0.18520720238626878`)	`1.00`
`Total per-slice`	`2.34218304620429` ms (`1.1936598604915731`)	`2.306734662937843` ms (`1.2769510302954954`)	`1.02`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.7878911077490998` #	`0.7869360165281424` #	`1.00`
`reduction (normalized tokens)`	`0.7651712060744233` #	`0.7639690077689504` #	`1.00`
`memory (df-graph)`	`95.46617542613636` KiB (`244.77619956879823`)	`95.46617542613636` KiB (`244.77619956879823`)	`1`
Benchmark suite	Current: `6017542`	Previous: `2380113`	Ratio
`Retrieve AST from R code`	`247.81986214` ms (`46.60339251854539`)	`246.34857636` ms (`46.05736358259125`)	`1.01`
`Normalize R AST`	`18.97386` ms (`14.495631304298653`)	`19.12205878` ms (`14.425168152928613`)	`0.99`
`Produce dataflow information`	`74.76744276000001` ms (`71.76208350663104`)	`74.45471131999999` ms (`70.57749655074137`)	`1.00`
`Total per-file`	`7777.37814308` ms (`28855.670498234602`)	`7726.72160172` ms (`28528.07523908248`)	`1.01`
`Static slicing`	`16.06714210846099` ms (`44.092632744371436`)	`15.999780665624812` ms (`43.68226004466487`)	`1.00`
`Reconstruct code`	`0.29376293538086495` ms (`0.15953293380140443`)	`0.2820169656616924` ms (`0.15417410861831182`)	`1.04`
`Total per-slice`	`16.36969559213324` ms (`44.12159040977069`)	`16.29007209329986` ms (`43.71440479393139`)	`1.00`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.8712997340230448` #	`0.8712997340230448` #	`1`
`reduction (normalized tokens)`	`0.8102441553774778` #	`0.8102441553774778` #	`1`
`memory (df-graph)`	`99.4425` KiB (`113.62933451202426`)	`99.4425` KiB (`113.62933451202426`)	`1`