From d1bef135f54f22a588f585ef8201a3fdac21dd92 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 6 May 2024 10:22:42 -0400
Subject: [PATCH] removed previous test

---
 NumGI/ParallelEquationGenerator.py           | 133 +++++++++++++++++++
 test/EquationTests/test_numpy_sympy_torch.py |  88 ------------
 2 files changed, 133 insertions(+), 88 deletions(-)
 create mode 100644 NumGI/ParallelEquationGenerator.py
 delete mode 100644 test/EquationTests/test_numpy_sympy_torch.py

diff --git a/NumGI/ParallelEquationGenerator.py b/NumGI/ParallelEquationGenerator.py
new file mode 100644
index 0000000..20815c6
--- /dev/null
+++ b/NumGI/ParallelEquationGenerator.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+import functools
+import multiprocessing as mp
+import os
+
+import sympy as sp
+import torch
+
+from NumGI.ConstantDictionaries import DIFFERENTIAL_FUNCTIONS
+from NumGI.ConstantDictionaries import OPERATIONS
+from NumGI.DatasetTokenizer import DatasetTokenizer
+from NumGI.EquationTokenizer import EquationTokenizer
+from NumGI.SolutionGenerator import SolutionGenerator
+
+# Directory used when no output directory is passed explicitly.
+DEFAULT_OUTPUT_DIR = "data/x_var_6"
+
+
+def worker(args, out_dir=DEFAULT_OUTPUT_DIR):
+    """Generate one batch of solutions and save its tokenized form.
+
+    Args:
+        args: sequence laid out as ``[generator, *dataset_args, file_index]``;
+            ``generator.generate_solution_dataset(*dataset_args)`` produces the batch.
+        out_dir (str): directory the tokenized tensors are written to.
+
+    Returns:
+        list: single-element list holding ``file_index``.
+    """
+    sols = args[0].generate_solution_dataset(*args[1:-1])
+    generate_tokenized_lists(sols, args[-1], out_dir)
+    return [args[-1]]
+
+
+def generate_tokenized_lists(sols, num, out_dir=DEFAULT_OUTPUT_DIR):
+    """Filter, tokenize and save a batch of (equation, solution) pairs.
+
+    A pair is kept only when its second element is not a sympy boolean, that
+    element's token list has between 11 and 199 entries and contains no
+    ``zoo`` token, and the first element's token list has fewer than 100 entries.
+
+    Args:
+        sols: iterable of indexable pairs; ``doit()`` is applied to element 0.
+        num: index used in the output names ``x_{num}.pt`` and ``y_{num}.pt``.
+        out_dir (str): directory the two files are written to.
+    """
+    boolean_types = (sp.logic.boolalg.BooleanTrue, sp.logic.boolalg.BooleanFalse)
+    x = []
+    y = []
+    for sol in sols:
+        if not isinstance(sol[1], boolean_types):
+            x.append(sol[0].doit())
+            y.append(sol[1])
+
+    tok = EquationTokenizer()
+    y_list = [tok.sympy_to_list(expr) for expr in y]
+    x_list = [tok.sympy_to_list(expr) for expr in x]
+    x_nozoo = []
+    y_nozoo = []
+    for x_expr, y_expr, x_tokens, y_tokens in zip(x, y, x_list, y_list):
+        if not 10 < len(y_tokens) < 200:
+            continue
+        if any(str(token) == "zoo" for token in y_tokens):
+            continue
+        if len(x_tokens) < 100:
+            x_nozoo.append(x_expr)
+            y_nozoo.append(y_expr)
+
+    try:
+        dataset = DatasetTokenizer(x_nozoo, y_nozoo, useDefaultTokenizer=True)
+        dataset.device = "cpu"
+        torch.save(dataset.x_tokenized.to("cpu"), os.path.join(out_dir, f"x_{num}.pt"))
+        torch.save(dataset.y_tokenized.to("cpu"), os.path.join(out_dir, f"y_{num}.pt"))
+    except KeyError as e:
+        print(f"nan in dataset: {e}")
+    except ValueError:
+        # NOTE(review): only the surviving pair count is reported; the cause is not visible here.
+        print(len(x_nozoo))
+
+
+def _next_file_index(path):
+    """Return one more than the largest index in names shaped ``<prefix>_<index>.<ext>``.
+
+    Entries that do not follow that pattern are ignored; an empty directory gives 1.
+    """
+    highest = 0
+    for name in os.listdir(path):
+        try:
+            highest = max(int(name.split("_")[1].split(".")[0]), highest)
+        except (IndexError, ValueError):
+            # Not a generated batch file; skip it instead of aborting the run.
+            continue
+    return highest + 1
+
+
+def generate_eq_parallel(gen_args: list, path: str, num_thousands: int):
+    """Generates equations in parallel.
+
+    Note some equations will be discarded because they are too long.
+    This won't create the exact number of expected equations.
+
+    Args:
+        gen_args (list): positional arguments forwarded to
+            ``SolutionGenerator.generate_solution_dataset`` for every batch.
+        path (str): path to save the equations to
+        num_thousands (int): number of batches to generate; each batch is
+            written to its own ``x_{index}.pt`` / ``y_{index}.pt`` pair.
+    """
+    os.makedirs(path, exist_ok=True)
+    shift = _next_file_index(path)
+    solgen = SolutionGenerator()
+
+    # Define the parameters for each call to generate_solution_dataset
+    parameters = [[solgen] + gen_args + [shift + offset] for offset in range(num_thousands)]
+
+    # Keep at least one process (cpu_count() - 1 is 0 on a single-core machine)
+    # and let the context manager release the pool once the map has finished.
+    with mp.Pool(max(1, mp.cpu_count() - 1)) as pool:
+        pool.map(functools.partial(worker, out_dir=path), parameters)
+
+
+if __name__ == "__main__":
+    # Positional arguments for SolutionGenerator.generate_solution_dataset.
+    gen_args = [
+        (3, 4),
+        (3, 5),
+        1_000,
+        ["x"],
+        DIFFERENTIAL_FUNCTIONS,
+        OPERATIONS,
+    ]
+    generate_eq_parallel(gen_args, DEFAULT_OUTPUT_DIR, 10000)
diff --git a/test/EquationTests/test_numpy_sympy_torch.py b/test/EquationTests/test_numpy_sympy_torch.py
deleted file mode 100644
index 4d88549..0000000
--- a/test/EquationTests/test_numpy_sympy_torch.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from __future__ import annotations
-
-import math
-
-import numpy as np
-import sympy as sp
-import torch
-
-from NumGI.ConstantDictionaries import DIFFERENTIAL_FUNCTIONS
-from NumGI.ConstantDictionaries import OPERATIONS
-from NumGI.EquationTokenizer import EquationTokenizer
-from NumGI.SolutionGenerator import SolutionGenerator
-
-
-def test_sp_np_torch():
-    sg = SolutionGenerator()
-    sg.PROB_NEW_SYMBOL = 0
-    n_eqs = 30
-    sols = [
-        sg.generate_solution(4, ["x"], DIFFERENTIAL_FUNCTIONS, OPERATIONS)[0].simplify()
-        for i in range(n_eqs)
-    ]
-
-    for func in DIFFERENTIAL_FUNCTIONS:
-        sols.append(func(sp.Symbol("x")))
-
-    tokenizer = EquationTokenizer()
-
-    test_arr = [-10, -5, -2, -1, 0, 1, 2, 5, 10, 20]
-    np_test = np.array(test_arr)
-    torch_test = torch.tensor(test_arr, device=tokenizer.device)
-    x = sp.Symbol("x")
-
-    cnt = 0
-
-    for i in sols:
-        try:
-            np_func, var = tokenizer.sympy_to_numpy(i)
-            np_res = np_func(np_test).tolist()
-        except TypeError:
-            cnt += 1
-            print("typeerr")
-            continue
-
-        if cnt > n_eqs / 2:
-            raise Exception(
-                "Too many equations with TypeError are equations correctly generated \
-                or error in sp to np func"
-            )
-
-        sp_res = []
-        for idx, j in enumerate(test_arr):
-            try:
-                sp_res.append(float(i.replace(x, j).evalf()))
-            except Exception as e:
-                print(e)
-                sp_res.append(np_res[idx])
-
-        torch_func, var = tokenizer.sympy_to_torch(i)
-        torch_res = torch_func(**{_arg: torch_test for _arg in var}).tolist()
-
-        tol = 1e-4
-        for idx in range(len(sp_res)):
-            print(
-                f"eq:{i}, sp_res: {sp_res[idx]}, np_res: {np_res[idx]}, torch_res: {torch_res[idx]}"
-            )
-            try:
-                if math.isnan(sp_res[idx]) or math.isnan(np_res[idx]) or math.isnan(torch_res[idx]):
-                    continue
-                elif sp_res[idx] == 0:
-                    assert (sp_res[idx] - np_res[idx]) < tol
-                    assert (sp_res[idx] - torch_res[idx]) < tol
-                elif math.isinf(np_res[idx]):
-                    assert np_res[idx] == sp_res[idx]
-                    assert np_res[idx] == torch_res[idx]
-                else:
-                    assert (sp_res[idx] - np_res[idx]) / sp_res[idx] < tol
-                    assert (sp_res[idx] - torch_res[idx]) / sp_res[idx] < tol
-            except Exception as e:
-                print(
-                    f"eq:{i}, sp_res: {sp_res[idx]}, np_res: {np_res[idx]}, \
-                    torch_res: {torch_res[idx]}, {e}"
-                )
-                raise
-
-
-if __name__ == "__main__":
-    test_sp_np_torch()