feat: Subset stage 4: Multi cycles in the same SCC #181

Merged · 5 commits · Aug 27, 2024
116 changes: 116 additions & 0 deletions docs/database_subset.md
@@ -5,6 +5,12 @@ when you need to dump only a part of the database, such as a specific table or a
ensures data consistency by including all related data from other tables that are required to maintain the integrity of
the subset.

!!! info

    Greenmask generates queries for subset conditions based on the introspected schema, using joins and recursive
    queries. It is not responsible for query optimization: the subset queries might be slow due to the complexity of
    the queries and/or a lack of indexes. Resolving circular dependencies requires executing recursive queries.
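
As an illustration of the join-based propagation, here is a minimal hand-written sketch with hypothetical `customers`
and `orders` tables (not a query Greenmask actually emits):

```postgresql title="Sketch: join-based condition propagation"
-- Hypothetical schema: orders.customer_id references customers.customer_id.
-- A subset condition defined on customers is propagated to orders through a join,
-- so the dump only contains orders whose customer is part of the subset.
SELECT o.*
FROM orders o
JOIN customers c ON c.customer_id = o.customer_id
WHERE c.customer_id IN (1, 2, 3); -- the user-defined subset condition
```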

## Detail

The subset is a list of SQL conditions that are applied to a table. The conditions are combined with the `AND` operator. **You
@@ -29,6 +35,16 @@ Greenmask will automatically generate the appropriate queries for the table subs
system ensures data consistency by validating all records found through the recursive queries. If a record does not meet
the subset condition, it will be excluded along with its parent records, preventing constraint violations.

!!! warning

    Currently, Greenmask can resolve multiple cycles within one strongly connected component (SCC), but only when they
    belong to a single group of vertexes. If an SCC contains two groups of vertexes, Greenmask cannot resolve it. For
    instance, given two cycles over the tables `A, B, C` (first group) and `D, E, F` (second group), Greenmask cannot
    resolve the component. But if there is only one group of vertexes with one or more cycles over the same set of
    tables (for instance `A, B, C`), Greenmask can resolve it. This might be fixed in the future. See the second
    example below, and the sketch of an unsupported schema right after this note.
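
For illustration, the following hypothetical schema produces a single SCC containing two vertex groups: the
three-table cycles `a -> b -> c -> a` and `d -> e -> f -> d` share no tables, while the extra references `c -> d` and
`f -> a` tie everything into one SCC. Greenmask cannot currently resolve such a component.

```postgresql title="Sketch: one SCC with two cycle groups (currently unsupported)"
-- Cycle 1: a -> b -> c -> a; cycle 2: d -> e -> f -> d.
-- The references c -> d and f -> a merge both cycles into a single SCC
-- even though the two cycles themselves share no vertexes.
CREATE TABLE a (id INT PRIMARY KEY, b_id INT);
CREATE TABLE b (id INT PRIMARY KEY, c_id INT);
CREATE TABLE c (id INT PRIMARY KEY, a_id INT, d_id INT);
CREATE TABLE d (id INT PRIMARY KEY, e_id INT);
CREATE TABLE e (id INT PRIMARY KEY, f_id INT);
CREATE TABLE f (id INT PRIMARY KEY, d_id INT, a_id INT);

ALTER TABLE a ADD CONSTRAINT fk_a_b FOREIGN KEY (b_id) REFERENCES b (id);
ALTER TABLE b ADD CONSTRAINT fk_b_c FOREIGN KEY (c_id) REFERENCES c (id);
ALTER TABLE c ADD CONSTRAINT fk_c_a FOREIGN KEY (a_id) REFERENCES a (id);
ALTER TABLE c ADD CONSTRAINT fk_c_d FOREIGN KEY (d_id) REFERENCES d (id);
ALTER TABLE d ADD CONSTRAINT fk_d_e FOREIGN KEY (e_id) REFERENCES e (id);
ALTER TABLE e ADD CONSTRAINT fk_e_f FOREIGN KEY (f_id) REFERENCES f (id);
ALTER TABLE f ADD CONSTRAINT fk_f_d FOREIGN KEY (d_id) REFERENCES d (id);
ALTER TABLE f ADD CONSTRAINT fk_f_a FOREIGN KEY (a_id) REFERENCES a (id);
```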

You can read the Wikipedia article about Circular reference [here](https://en.wikipedia.org/wiki/Circular_reference).

## Example: Dump a subset of the database

!!! info
@@ -61,3 +77,103 @@ transformation:
- >
person.password.passwordsalt = '329eacbe-c883-4f48-b8b6-17aa4627efff'
```

## Example: Dump a subset with circular reference

```postgresql title="Create tables with multiple cycles"
-- Step 1: Create tables without foreign keys
DROP TABLE IF EXISTS employees CASCADE;
CREATE TABLE employees
(
employee_id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
department_id INT -- Will reference departments(department_id)
);

DROP TABLE IF EXISTS departments CASCADE;
CREATE TABLE departments
(
department_id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
project_id INT -- Will reference projects(project_id)
);

DROP TABLE IF EXISTS projects CASCADE;
CREATE TABLE projects
(
project_id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
lead_employee_id INT, -- Will reference employees(employee_id)
head_employee_id INT -- Will reference employees(employee_id)
);

-- Step 2: Alter tables to add foreign key constraints
ALTER TABLE employees
ADD CONSTRAINT fk_department
FOREIGN KEY (department_id) REFERENCES departments (department_id);

ALTER TABLE departments
ADD CONSTRAINT fk_project
FOREIGN KEY (project_id) REFERENCES projects (project_id);

ALTER TABLE projects
ADD CONSTRAINT fk_lead_employee
FOREIGN KEY (lead_employee_id) REFERENCES employees (employee_id);

ALTER TABLE projects
ADD CONSTRAINT fk_lead_employee2
FOREIGN KEY (head_employee_id) REFERENCES employees (employee_id);

-- Insert projects
INSERT INTO projects (name, lead_employee_id)
SELECT 'Project ' || i, NULL
FROM generate_series(1, 10) AS s(i);

-- Insert departments
INSERT INTO departments (name, project_id)
SELECT 'Department ' || i, i
FROM generate_series(1, 10) AS s(i);

-- Insert employees and assign 10 of them as project leads
INSERT INTO employees (name, department_id)
SELECT 'Employee ' || i, (i / 10) + 1
FROM generate_series(1, 99) AS s(i);

-- Assign 10 employees as project leads
UPDATE projects
SET lead_employee_id = (SELECT employee_id
FROM employees
WHERE employees.department_id = projects.project_id
LIMIT 1),
head_employee_id = 3
WHERE project_id <= 10;
```

This schema has two cycles:

* `employees (department_id) -> departments (project_id) -> projects (lead_employee_id) -> employees (employee_id)`
* `employees (department_id) -> departments (project_id) -> projects (head_employee_id) -> employees (employee_id)`

Greenmask resolves this by generating a recursive query with integrity checks on the subset and join conditions.
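
The shape of that traversal can be sketched by hand as a recursive CTE. The query below is illustrative only and is
not the exact query Greenmask generates (which also applies the integrity checks on every hop):

```postgresql title="Sketch: recursive traversal of the employees cycle"
-- Collect every employee transitively required by the subset condition by
-- following employees -> departments -> projects -> employees.
WITH RECURSIVE reachable_employees AS (
    SELECT e.employee_id
    FROM employees e
    WHERE e.employee_id IN (1, 2, 3) -- the user-defined subset condition

    UNION -- UNION (not UNION ALL) deduplicates and guarantees termination on cycles

    SELECT ref.employee_id
    FROM reachable_employees r
    JOIN employees e ON e.employee_id = r.employee_id
    JOIN departments d ON d.department_id = e.department_id
    JOIN projects p ON p.project_id = d.project_id
    CROSS JOIN LATERAL (VALUES (p.lead_employee_id), (p.head_employee_id)) AS ref(employee_id)
    WHERE ref.employee_id IS NOT NULL
)
SELECT emp.*
FROM employees emp
WHERE emp.employee_id IN (SELECT employee_id FROM reachable_employees);
```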

The configuration below fetches the data for three employees together with their related departments and projects.

```yaml title="Subset configuration example"
transformation:
- schema: "public"
name: "employees"
subset_conds:
- "public.employees.employee_id in (1, 2, 3)"
```

However, the configuration below returns an empty result: the subset condition cannot be satisfied for all related
tables, because the project with `project_id=1` references the employee with `employee_id=3`, which does not match the
subset condition. A verification query follows the configuration.

```yaml title="Subset configuration example"
transformation:
- schema: "public"
name: "employees"
subset_conds:
- "public.employees.employee_id in (1, 2)"
```
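
To see the offending reference in the sample data, you can run a verification query like the one below (a hand-written
check, not something Greenmask generates):

```postgresql title="Sketch: why the subset for employees 1 and 2 is empty"
-- Following employees -> departments -> projects shows that project 1 points back to
-- employee 3 via head_employee_id, which is outside the subset condition, so the whole
-- chain is excluded and the resulting subset is empty.
SELECT e.employee_id, d.department_id, p.project_id, p.lead_employee_id, p.head_employee_id
FROM employees e
JOIN departments d ON d.department_id = e.department_id
JOIN projects p ON p.project_id = d.project_id
WHERE e.employee_id IN (1, 2);
```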

120 changes: 111 additions & 9 deletions internal/db/postgres/subset/component.go
@@ -7,6 +7,7 @@ import (
"strings"

"github.com/greenmaskio/greenmask/internal/db/postgres/entries"
"github.com/greenmaskio/greenmask/pkg/toolkit"
)

type Component struct {
@@ -20,6 +21,11 @@ type Component struct {
cycles [][]*Edge
cyclesIdents map[string]struct{}
keys []string
// groupedCycles - cycles grouped by their set of vertexes
groupedCycles map[string][]int
// groupedCyclesGraph - contains the mapping of the vertexes in the component to the edges in the original graph
// for grouped cycles. This is required to join the separated cycle groups together
groupedCyclesGraph map[string][]*CycleEdge
}

func NewComponent(id int, componentGraph map[int][]*Edge, tables map[int]*entries.Table) *Component {
@@ -35,6 +41,8 @@ func NewComponent(id int, componentGraph map[int][]*Edge, tables map[int]*entrie
} else {
c.keys = c.getOneTable().PrimaryKey
}
c.groupCycles()
c.buildCyclesGraph()

return c
}
@@ -58,6 +66,19 @@ func (c *Component) getOneTable() *entries.Table {
panic("cannot call get one table method for cycled scc")
}

func (c *Component) getOneCycleGroup() [][]*Edge {
if len(c.groupedCycles) == 1 {
for _, g := range c.groupedCycles {
var res [][]*Edge
for _, idx := range g {
res = append(res, c.cycles[idx])
}
return res
}
}
panic("getOneCycleGroup is not allowed for multiple cycle groups")
}

func (c *Component) hasCycle() bool {
return len(c.cycles) > 0
}
@@ -108,7 +129,7 @@ func (c *Component) findAllCyclesDfs(v int, visited map[int]bool, recStack map[i
break
}
}
cycleId := getCycleIdent(cycle)
cycleId := getCycleId(cycle)
if _, ok := c.cyclesIdents[cycleId]; !ok {
res := slices.Clone(cycle)
slices.Reverse(res)
@@ -122,7 +143,18 @@ func (c *Component) findAllCyclesDfs(v int, visited map[int]bool, recStack map[i
recStack[v] = false
}

func getCycleIdent(cycle []*Edge) string {
// getCycleGroupId - returns the group id for the cycle based on the IDs of its vertexes
func getCycleGroupId(cycle []*Edge) string {
ids := make([]string, 0, len(cycle))
for _, edge := range cycle {
ids = append(ids, fmt.Sprintf("%d", edge.to.idx))
}
slices.Sort(ids)
return strings.Join(ids, "_")
}

// getCycleId - returns the unique identifier for the cycle based on the IDs of its edges
func getCycleId(cycle []*Edge) string {
ids := make([]string, 0, len(cycle))
for _, edge := range cycle {
ids = append(ids, fmt.Sprintf("%d", edge.id))
@@ -131,21 +163,91 @@ func getCycleIdent(cycle []*Edge) string {
return strings.Join(ids, "_")
}

func (c *Component) getComponentKeys() []string {
if len(c.cycles) > 1 {
panic("IMPLEMENT ME: multiple cycles in the component")
func (c *Component) groupCycles() {
c.groupedCycles = make(map[string][]int)
for cycleIdx, cycle := range c.cycles {
cycleId := getCycleGroupId(cycle)
c.groupedCycles[cycleId] = append(c.groupedCycles[cycleId], cycleIdx)
}
}

func (c *Component) buildCyclesGraph() {
// TODO: Need to loop through c.groupedCycles instead of c.cycles
var idSeq int
c.groupedCyclesGraph = make(map[string][]*CycleEdge)
for groupIdI, cyclesI := range c.groupedCycles {
for groupIdJ, cyclesJ := range c.groupedCycles {
if groupIdI == groupIdJ {
continue
}
commonVertexes := c.findCommonVertexes(cyclesI[0], cyclesJ[0])
if len(commonVertexes) == 0 {
continue
}
if c.areCyclesLinked(cyclesI[0], cyclesJ[0]) {
continue
}
e := NewCycleEdge(idSeq, groupIdI, groupIdJ, commonVertexes)
c.groupedCyclesGraph[groupIdI] = append(c.groupedCyclesGraph[groupIdJ], e)
idSeq++
}
}
}

func (c *Component) findCommonVertexes(i, j int) (res []*entries.Table) {
common := make(map[toolkit.Oid]*entries.Table)
for _, edgeI := range c.cycles[i] {
for _, edgeJ := range c.cycles[j] {
if edgeI.to.idx == edgeJ.to.idx {
common[edgeI.to.table.Oid] = edgeI.to.table
}
}
}
for _, table := range common {
res = append(res, table)
}
slices.SortFunc(res, func(i, j *entries.Table) int {
switch {
case i.Oid < j.Oid:
return -1
case i.Oid > j.Oid:
return 1
}
return 0
})
return
}

func (c *Component) areCyclesLinked(i, j int) bool {
iId := getCycleGroupId(c.cycles[i])
jId := getCycleGroupId(c.cycles[j])
for _, to := range c.groupedCyclesGraph[iId] {
if to.to == jId {
return true
}
}
for _, to := range c.groupedCyclesGraph[jId] {
if to.to == iId {
return true
}
}
return false
}

func (c *Component) getComponentKeys() []string {
if !c.hasCycle() {
return c.getOneTable().PrimaryKey
}

var vertexes []int
for _, edge := range c.cycles[0] {
vertexes = append(vertexes, edge.to.idx)
vertexes := make(map[int]struct{})
for _, cycle := range c.cycles {
for _, edge := range cycle {
vertexes[edge.to.idx] = struct{}{}
}
}

var keys []string
for _, v := range vertexes {
for v := range vertexes {
table := c.tables[v]
for _, key := range table.PrimaryKey {
keys = append(keys, fmt.Sprintf(`%s__%s__%s`, table.Schema, table.Name, key))
4 changes: 2 additions & 2 deletions internal/db/postgres/subset/cte.go
@@ -29,8 +29,8 @@ func (c *cteQuery) addItem(name, query string) {
func (c *cteQuery) generateQuery(targetTable *entries.Table) string {
var queries []string
var excludedCteQueries []string
if len(c.c.cycles) > 1 {
panic("IMPLEMENT ME")
if len(c.c.groupedCycles) > 1 {
panic("FIXME: found more than one grouped cycle")
}
for _, edge := range c.c.cycles[0] {
if edge.from.table.Oid == targetTable.Oid {
22 changes: 22 additions & 0 deletions internal/db/postgres/subset/cycle_edge.go
@@ -0,0 +1,22 @@
package subset

import "github.com/greenmaskio/greenmask/internal/db/postgres/entries"

type CycleEdge struct {
id int
from string
to string
tables []*entries.Table
}

func NewCycleEdge(id int, from, to string, tables []*entries.Table) *CycleEdge {
if len(tables) == 0 {
panic("empty tables provided for cycle edge")
}
return &CycleEdge{
id: id,
from: from,
to: to,
tables: tables,
}
}