From d1bef135f54f22a588f585ef8201a3fdac21dd92 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <bergerona62@gmail.com>
Date: Mon, 6 May 2024 10:22:42 -0400
Subject: [PATCH] Add ParallelEquationGenerator and remove numpy/sympy/torch test

---
 NumGI/ParallelEquationGenerator.py           | 96 ++++++++++++++++++++
 test/EquationTests/test_numpy_sympy_torch.py | 88 ------------------
 2 files changed, 96 insertions(+), 88 deletions(-)
 create mode 100644 NumGI/ParallelEquationGenerator.py
 delete mode 100644 test/EquationTests/test_numpy_sympy_torch.py

diff --git a/NumGI/ParallelEquationGenerator.py b/NumGI/ParallelEquationGenerator.py
new file mode 100644
index 0000000..20815c6
--- /dev/null
+++ b/NumGI/ParallelEquationGenerator.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+
+import sympy as sp
+import torch
+
+from NumGI.ConstantDictionaries import DIFFERENTIAL_FUNCTIONS
+from NumGI.ConstantDictionaries import OPERATIONS
+from NumGI.DatasetTokenizer import DatasetTokenizer
+from NumGI.EquationTokenizer import EquationTokenizer
+from NumGI.SolutionGenerator import SolutionGenerator
+
+
+def worker(args):
+    """Generate and save one batch; args = [generator, *dataset args, index]. Returns [index]."""
+    generate_tokenized_lists(args[0].generate_solution_dataset(*args[1:-1]), args[-1])
+    return [args[-1]]
+
+
+def generate_tokenized_lists(sols, num):
+    """Filter the pairs in ``sols``, tokenize them and save the tensors to disk.
+
+    Each item is read as ``(x, y)``; ``x`` is evaluated with ``.doit()``. A pair is dropped when
+    ``y`` is a sympy ``BooleanTrue``/``BooleanFalse``, when the token list of ``y`` is not 11-199
+    items long or contains "zoo", or when the token list of ``x`` has 100 or more items. Output
+    goes to ``data/x_var_6/x_{num}.pt`` and ``y_{num}.pt``; KeyError/ValueError are only printed.
+    """
+    bool_types = (sp.logic.boolalg.BooleanTrue, sp.logic.boolalg.BooleanFalse)
+    kept = [(pair[0].doit(), pair[1]) for pair in sols if not isinstance(pair[1], bool_types)]
+    tok = EquationTokenizer()
+    y_tokens = [tok.sympy_to_list(y_expr) for _, y_expr in kept]
+    x_tokens = [tok.sympy_to_list(x_expr) for x_expr, _ in kept]
+
+    x_nozoo, y_nozoo = [], []
+    for (x_expr, y_expr), x_toks, y_toks in zip(kept, x_tokens, y_tokens):
+        if not 10 < len(y_toks) < 200 or "zoo" in {str(t) for t in y_toks}:
+            continue
+        try:
+            if len(x_toks) < 100:
+                x_nozoo.append(x_expr)
+                y_nozoo.append(y_expr)
+        except Exception as e:
+            print(e)
+    try:
+        dataset = DatasetTokenizer(x_nozoo, y_nozoo, useDefaultTokenizer=True)
+        dataset.device = "cpu"
+        torch.save(dataset.x_tokenized.to("cpu"), f"data/x_var_6/x_{num}.pt")
+        torch.save(dataset.y_tokenized.to("cpu"), f"data/x_var_6/y_{num}.pt")
+    except KeyError as e:
+        print(f"nan in dataset: {e}")
+    except ValueError:
+        print(len(x_nozoo))
+    except ValueError:
+        print(len(x_nozoo))
+
+
+def generate_eq_parallel(gen_args: list, path: str, num_thousands: int):
+    """Generates equations in parallel.
+
+    Note some equations will be discarded because they are too long.
+    This won't create the exact number of expected equations.
+
+    Args:
+        gen_args (list): arguments each worker forwards to generate_solution_dataset
+        path (str): directory scanned for <prefix>_<n>.<ext> files; new indices start at max n + 1
+        num_thousands (int): number of worker calls to run, one file index each
+    """
+    solgen = SolutionGenerator()
+    shift = 0
+    for name in os.listdir(path):
+        try:
+            shift = max(int(name.split("_")[1].split(".")[0]), shift)
+        except (IndexError, ValueError):
+            continue  # not a <prefix>_<n>.<ext> data file (e.g. .gitkeep): ignore it
+    # One argument list per worker call; file indices continue after the highest existing one.
+    parameters = [[solgen, *gen_args, shift + 1 + k] for k in range(num_thousands)]
+    # cpu_count() - 1 is 0 on a single-core host, which Pool rejects; `with` frees the workers.
+    with mp.Pool(max(1, mp.cpu_count() - 1)) as pool:
+        pool.map(worker, parameters)
+
+
+if __name__ == "__main__":
+    # Positional arguments forwarded, in this order, to generate_solution_dataset.
+    gen_args = [
+        (3, 4),
+        (3, 5),
+        1_000,
+        ["x"],  # variables
+        DIFFERENTIAL_FUNCTIONS,
+        OPERATIONS,
+    ]
+    # Must be the directory generate_tokenized_lists writes to, so existing file indices are
+    # detected and not overwritten; 10000 worker calls are scheduled.
+    generate_eq_parallel(gen_args, "data/x_var_6", 10000)
diff --git a/test/EquationTests/test_numpy_sympy_torch.py b/test/EquationTests/test_numpy_sympy_torch.py
deleted file mode 100644
index 4d88549..0000000
--- a/test/EquationTests/test_numpy_sympy_torch.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from __future__ import annotations
-
-import math
-
-import numpy as np
-import sympy as sp
-import torch
-
-from NumGI.ConstantDictionaries import DIFFERENTIAL_FUNCTIONS
-from NumGI.ConstantDictionaries import OPERATIONS
-from NumGI.EquationTokenizer import EquationTokenizer
-from NumGI.SolutionGenerator import SolutionGenerator
-
-
-def test_sp_np_torch():
-    sg = SolutionGenerator()
-    sg.PROB_NEW_SYMBOL = 0
-    n_eqs = 30
-    sols = [
-        sg.generate_solution(4, ["x"], DIFFERENTIAL_FUNCTIONS, OPERATIONS)[0].simplify()
-        for i in range(n_eqs)
-    ]
-
-    for func in DIFFERENTIAL_FUNCTIONS:
-        sols.append(func(sp.Symbol("x")))
-
-    tokenizer = EquationTokenizer()
-
-    test_arr = [-10, -5, -2, -1, 0, 1, 2, 5, 10, 20]
-    np_test = np.array(test_arr)
-    torch_test = torch.tensor(test_arr, device=tokenizer.device)
-    x = sp.Symbol("x")
-
-    cnt = 0
-
-    for i in sols:
-        try:
-            np_func, var = tokenizer.sympy_to_numpy(i)
-            np_res = np_func(np_test).tolist()
-        except TypeError:
-            cnt += 1
-            print("typeerr")
-            continue
-
-        if cnt > n_eqs / 2:
-            raise Exception(
-                "Too many equations with TypeError are equations correctly generated \
-                    or error in sp to np func"
-            )
-
-        sp_res = []
-        for idx, j in enumerate(test_arr):
-            try:
-                sp_res.append(float(i.replace(x, j).evalf()))
-            except Exception as e:
-                print(e)
-                sp_res.append(np_res[idx])
-
-        torch_func, var = tokenizer.sympy_to_torch(i)
-        torch_res = torch_func(**{_arg: torch_test for _arg in var}).tolist()
-
-        tol = 1e-4
-        for idx in range(len(sp_res)):
-            print(
-                f"eq:{i}, sp_res: {sp_res[idx]}, np_res: {np_res[idx]}, torch_res: {torch_res[idx]}"
-            )
-            try:
-                if math.isnan(sp_res[idx]) or math.isnan(np_res[idx]) or math.isnan(torch_res[idx]):
-                    continue
-                elif sp_res[idx] == 0:
-                    assert (sp_res[idx] - np_res[idx]) < tol
-                    assert (sp_res[idx] - torch_res[idx]) < tol
-                elif math.isinf(np_res[idx]):
-                    assert np_res[idx] == sp_res[idx]
-                    assert np_res[idx] == torch_res[idx]
-                else:
-                    assert (sp_res[idx] - np_res[idx]) / sp_res[idx] < tol
-                    assert (sp_res[idx] - torch_res[idx]) / sp_res[idx] < tol
-            except Exception as e:
-                print(
-                    f"eq:{i}, sp_res: {sp_res[idx]}, np_res: {np_res[idx]}, \
-                        torch_res: {torch_res[idx]}, {e}"
-                )
-                raise
-
-
-if __name__ == "__main__":
-    test_sp_np_torch()