From f54001421d24e9c9229fd16f3cb23f2dff0bed5c Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Mon, 6 May 2024 10:06:36 -0400
Subject: [PATCH] bug fixes

---
 NumGI/.DS_Store                    | Bin 6148 -> 6148 bytes
 NumGI/LoadTokenizer.py             |  4 +-
 NumGI/Loss/LossDataset.py          | 48 +++++++++------
 NumGI/ParallelEquationGenerator.py | 90 -----------------------------
 NumGI/SolutionGenerator.py         | 16 ++---
 5 files changed, 40 insertions(+), 118 deletions(-)
 delete mode 100644 NumGI/ParallelEquationGenerator.py

diff --git a/NumGI/.DS_Store b/NumGI/.DS_Store
index d4cc0d0959dfa43df650e860b22989d25fbbab28..a1c10f5cf4c80c4bb6d6224eeacc810eed67f1f0 100644
Binary files a/NumGI/.DS_Store and b/NumGI/.DS_Store differ
diff --git a/NumGI/LoadTokenizer.py b/NumGI/LoadTokenizer.py
index 2980c26..25f2bb3 100644
--- a/NumGI/LoadTokenizer.py
+++ b/NumGI/LoadTokenizer.py
@@ -8,7 +8,7 @@ class LoadTokenizer(DatasetTokenizer):
     """The tokenizer used when loading data from files."""
 
-    def __init__(self, x_files, y_files):
+    def __init__(self, x_files, y_files, useDefaultTokenizer=True):
         default_tokenized_x = []
         default_tokenized_y = []
 
@@ -37,4 +37,4 @@ def __init__(self, x_files, y_files):
         new_x = [tempTokenizer.tokens_to_list(i) for i in default_combined_x_torch.tolist()]
         new_y = [tempTokenizer.tokens_to_list(i) for i in default_combined_y_torch.tolist()]
 
-        super().__init__(new_x, new_y, useDefaultTokenizer=False, isSympy=False)
+        super().__init__(new_x, new_y, useDefaultTokenizer=useDefaultTokenizer, isSympy=False)
diff --git a/NumGI/Loss/LossDataset.py b/NumGI/Loss/LossDataset.py
index e77721e..2e4ca2f 100644
--- a/NumGI/Loss/LossDataset.py
+++ b/NumGI/Loss/LossDataset.py
@@ -36,7 +36,8 @@ def create_var_dict(self):
             sol = self.eq_dataset.tokens_to_sympy(eq)
             self.solutions.append(sol)
             if frozenset(sol.free_symbols) not in var_dict:
-                var_dict[frozenset(sol.free_symbols)] = [[sol, i]]
+                if len(sol.free_symbols) < 5:
+                    var_dict[frozenset(sol.free_symbols)] = [[sol, i]]
             else:
                 var_dict[frozenset(sol.free_symbols)].append([sol, i])
         return var_dict
@@ -44,14 +45,19 @@ def create_var_dict(self):
     def calculate_n_pairwise_loss(self, N, ell_norm):
         loss = torch.zeros((3, N))
         possible_symbols = self.var_dict.keys()
+        possible_symbols = [i for i in possible_symbols if len(i) >= 1]
 
-        possible_symbols = [i for i in possible_symbols if len(self.var_dict[i]) > 1]
+        max_len = 0
+        for i in possible_symbols:
+            if len(i) > max_len:
+                max_len = len(i)
 
-        first_batch = int(0.95 * N)
-        second_batch = N - first_batch
+        self.generate_grids(max_len)
+
+        first_batch = N  # int(0.95 * N)
         for i in range(first_batch):
             chosen_symbols = random.choice(list(possible_symbols))
-            if len(self.var_dict[chosen_symbols]) < 1:
+            if len(self.var_dict[chosen_symbols]) <= 1:
                 continue
 
             possible_equations = {i[1] for i in self.var_dict[chosen_symbols]}
@@ -64,22 +70,22 @@ def calculate_n_pairwise_loss(self, N, ell_norm):
             if integral is None:
                 continue
 
-            loss[0, i] = sol_sympy_1[1]
-            loss[1, i] = sol_sympy_2[1]
             integral_val = integral.item()
             if np.abs(integral_val) < self.max_integral_value:
+                loss[0, i] = sol_sympy_1[1]
+                loss[1, i] = sol_sympy_2[1]
                 loss[2, i] = integral.item()
             else:
-                loss[2, i] = np.sign(integral_val) * self.max_integral_value
+                continue
 
-        for i in range(second_batch):
-            chosen_symbols = random.sample(possible_symbols, 2)
-            sol_sympy_1 = random.choice(self.var_dict[chosen_symbols[0]])
-            sol_sympy_2 = random.choice(self.var_dict[chosen_symbols[1]])
+        # for i in range(second_batch):
+        #     chosen_symbols = random.sample(possible_symbols, 2)
+        #     sol_sympy_1 = random.choice(self.var_dict[chosen_symbols[0]])
+        #     sol_sympy_2 = random.choice(self.var_dict[chosen_symbols[1]])
 
-            loss[0, i] = sol_sympy_1[1]
-            loss[1, i] = sol_sympy_2[1]
-            loss[2, i] = torch.inf
+        #     loss[0, i] = sol_sympy_1[1]
+        #     loss[1, i] = sol_sympy_2[1]
+        #     loss[2, i] = torch.inf
 
         self.loss = loss
 
@@ -88,7 +94,7 @@ def compute_integral(self, sympy_eq):
         if len(symbols) < 1:
             return torch.tensor(torch.nan)
 
-        grids = self.create_discrete_grids(symbols)
+        grids = self.grids[len(symbols) - 1]
         _arg = {sym: _grid for sym, _grid in zip(symbols, grids)}
         try:
             complex_result = func(**_arg)
@@ -102,12 +108,18 @@ def compute_integral(self, sympy_eq):
         del grids
         return result
 
+    def generate_grids(self, N_grids):
+        self.grids = []
+        for i in range(N_grids):
+            symbols = list(range(i + 1))  # dummy symbols; only the count matters
+            self.grids.append(self.create_discrete_grids(symbols))
+
     def create_discrete_grids(self, symbols):
         grid_low, grid_high, num_grid = self.grid_size
         # scale grid down with dimension
-        num_grid = int(num_grid * np.exp(0.75 * (1 - len(symbols))))
+        num_grid = int(num_grid * np.exp(0.95 * (1 - len(symbols))))
         grid_real = torch.linspace(grid_low, grid_high, num_grid, device=self.eq_dataset.device)
-        grid_im = 1j * torch.linspace(grid_low, grid_high, num_grid, device=self.eq_dataset.device)
+        grid_im = torch.linspace(grid_low, grid_high, num_grid, device=self.eq_dataset.device) * 1j
         grid = grid_real[:, None] + grid_im[None, :]
         grids = [grid.flatten() for i in symbols]
         mesh = torch.meshgrid(grids)
diff --git a/NumGI/ParallelEquationGenerator.py b/NumGI/ParallelEquationGenerator.py
deleted file mode 100644
index fea27a1..0000000
--- a/NumGI/ParallelEquationGenerator.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from __future__ import annotations
-
-import multiprocessing as mp
-import os
-
-import sympy as sp
-import torch
-
-from NumGI.ConstantDictionaries import DIFFERENTIAL_FUNCTIONS
-from NumGI.ConstantDictionaries import OPERATIONS
-from NumGI.DatasetTokenizer import DatasetTokenizer
-from NumGI.EquationTokenizer import EquationTokenizer
-from NumGI.SolutionGenerator import SolutionGenerator
-
-
-def worker(args):
-    sols = args[0].generate_solution_dataset(*args[1:-1])
-    generate_tokenized_lists(sols, args[-1])
-    return [args[-1]]
-
-
-def generate_tokenized_lists(sols, num):
-    """Generates tokenized lists of equations and solutions.
-
-    Saves them to disk."""
-    x = []
-    y = []
-    for i in sols:
-        if not isinstance(i[1], sp.logic.boolalg.BooleanTrue) and not isinstance(
-            i[1], sp.logic.boolalg.BooleanFalse
-        ):
-            x.append(i[0])
-            y.append(i[1])
-
-    tok = EquationTokenizer()
-    y_list = [tok.sympy_to_list(i) for i in y]
-    x_nozoo = []
-    y_nozoo = []
-    for idx, i in enumerate(y_list):
-        if len(i) < 200 and len(i) > 10:
-            if "zoo" not in [str(j) for j in i]:
-                x_nozoo.append(x[idx])
-                y_nozoo.append(y[idx])
-    try:
-        dataset = DatasetTokenizer(x_nozoo, y_nozoo, useDefaultTokenizer=True)
-        dataset.device = "cpu"
-    except KeyError:
-        print("nan in dataset")
-    except ValueError:
-        print(len(x_nozoo))
-    torch.save(dataset.x_tokenized.to("cpu"), f"data/easy_ops2/x_{num}.pt")
-    torch.save(dataset.y_tokenized.to("cpu"), f"data/easy_ops2/y_{num}.pt")
-
-
-def generate_eq_parallel(gen_args: list, path: str, num_thousands: int):
-    """Generates equations in parallel.
-
-    Note some equations will be discarded because they are too long.
-    This won't create the exact number of expected equations.
-
-    Args:
-        path (str): path to save the equations to
-        num_thousands (int): number of thousands of equations to generate
-    """
-    pool = mp.Pool(mp.cpu_count() - 1)
-    shift = 0
-    solgen = SolutionGenerator()
-
-    for i in os.listdir(path):
-        new_i = (i.split("_")[1]).split(".")[0]
-        shift = max(int(new_i), shift)
-
-    shift += 1
-    # Define the parameters for each call to generate_solution_dataset
-    parameters = [([solgen] + gen_args + [shift + _]) for _ in range(num_thousands)]
-
-    pool.map(worker, parameters)
-
-
-if __name__ == "__main__":
-    diff_func = DIFFERENTIAL_FUNCTIONS
-    ops = OPERATIONS
-    vars = ["x", "y", "z", "beta", "gamma", "delta", "a", "b", "c", "d", "epsilon"]
-    gen_args = [
-        (3, 4),
-        (3, 5),
-        1_000,
-        vars,
-        diff_func,
-        ops,
-    ]
-    generate_eq_parallel(gen_args, "data/easy_ops2", 50)
diff --git a/NumGI/SolutionGenerator.py b/NumGI/SolutionGenerator.py
index 5a1921d..3f43bca 100644
--- a/NumGI/SolutionGenerator.py
+++ b/NumGI/SolutionGenerator.py
@@ -38,7 +38,7 @@ def generate_solution_dataset(
             sol, used_vars = self.generate_solution(num_ops_sol, vars, funcs, ops)
             equation = self.generate_equation(used_vars, ops_eq, ops, sol)
 
-            func_sol = sp.Function("f")(*[sp.Symbol(var) for var in used_vars])
+            func_sol = sp.Function("f")(*[sp.Symbol(var) for var in ["x"]])
             sol_eq = sp.Eq(func_sol, sol)
             dataset.append((sol_eq, equation))
         return dataset
@@ -99,14 +99,14 @@ def choose_variable(self, new_vars: list | None, used_vars: list | None):
 
         Probabilities for when to choose which need to be initialized somewhere.
         """
-        if used_vars is None or len(used_vars) <= 0 or random.random() < self.PROB_NEW_SYMBOL:
-            var = self.pop_random(new_vars)
-            used_vars.append(var)
-            return sp.symbols(var)
-        return sp.symbols(random.choice(used_vars))
+        # if used_vars is None or len(used_vars) <= 0 or random.random() < self.PROB_NEW_SYMBOL:
+        #     var = self.pop_random(new_vars)
+        #     used_vars.append(var)
+        return sp.symbols("x")
+        # return sp.symbols(random.choice(used_vars))
 
     def choose_used_variable(self, used_vars: list):
-        return sp.symbols(random.choice(used_vars))
+        return sp.symbols("x")  # sp.symbols(random.choice(used_vars))
 
     def choose_operation(
         self,
@@ -133,7 +133,7 @@ def tree_to_equation(
        """Converts a tree to a sympy equation."""
         root = tree.root
 
-        vars = [sp.Symbol(var) for var in used_vars]
+        vars = [sp.Symbol(var) for var in ["x"]]
         func = sp.Function("f")(*vars)
         try:
             expression = self.tree_to_eq_helper(root, sol, used_vars)
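
Note on the LossDataset change: meshgrids are no longer rebuilt for every
equation pair. calculate_n_pairwise_loss now calls generate_grids(max_len)
once, caching one grid set per dimensionality, and compute_integral simply
indexes self.grids[len(symbols) - 1]. Below is a minimal standalone sketch of
that caching scheme, not the patched class itself: GRID_SIZE and the
free-standing function names are hypothetical stand-ins for
LossDataset.grid_size and its methods, while the 0.95 exponent and the
complex-plane grid construction mirror the patched create_discrete_grids.

    import numpy as np
    import torch

    GRID_SIZE = (-1.0, 1.0, 20)  # assumed (grid_low, grid_high, num_grid)

    def create_discrete_grids(n_vars):
        """Build flattened complex meshgrids for an n_vars-dim integral."""
        grid_low, grid_high, num_grid = GRID_SIZE
        # Shrink the per-axis resolution exponentially with dimension so the
        # total mesh size stays bounded (the patch raises 0.75 to 0.95).
        num_grid = int(num_grid * np.exp(0.95 * (1 - n_vars)))
        grid_real = torch.linspace(grid_low, grid_high, num_grid)
        grid_im = torch.linspace(grid_low, grid_high, num_grid) * 1j
        grid = grid_real[:, None] + grid_im[None, :]  # complex sample plane
        axes = [grid.flatten()] * n_vars
        mesh = torch.meshgrid(*axes, indexing="ij")
        return [m.flatten() for m in mesh]

    def generate_grids(max_vars):
        """Precompute one grid set per dimensionality, built once up front."""
        return [create_discrete_grids(n + 1) for n in range(max_vars)]

    # Usage: build the cache once, then look it up by variable count, as
    # compute_integral does with self.grids[len(symbols) - 1].
    grids = generate_grids(3)
    print([g.shape for g in grids[2 - 1]])  # grid set for 2-variable solutions

The cache is sound because the grids depend only on the number of free
symbols, not on the symbols themselves, so rebuilding them for every pair was
redundant work.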