Skip to content

Commit

Permalink
Add support for PQS test oracle (#33)
Browse files Browse the repository at this point in the history
* StringView support

* Support PQS test oracle

* rm Cargo.lock
  • Loading branch information
2010YOUY01 authored Jan 6, 2025
1 parent 45a9a96 commit 321c9d1
Show file tree
Hide file tree
Showing 20 changed files with 813 additions and 81 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>flight-sql-jdbc-driver</artifactId>
<version>16.1.0</version>
<version>17.0.0</version>
</dependency>
</dependencies>
<reporting>
Expand Down
3 changes: 2 additions & 1 deletion src/sqlancer/ComparatorHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ public static void assumeResultSetsAreEqual(List<String> resultSet, List<String>
public static void assumeResultSetsAreEqual(List<String> resultSet, List<String> secondResultSet,
String originalQueryString, List<String> combinedString, SQLGlobalState<?, ?> state,
UnaryOperator<String> canonicalizationRule) {
// Overloaded version of assumeResultSetsAreEqual that takes a canonicalization function which is applied to
// Overloaded version of assumeResultSetsAreEqual that takes a canonicalization
// function which is applied to
// both result sets before their comparison.
List<String> canonicalizedResultSet = resultSet.stream().map(canonicalizationRule).collect(Collectors.toList());
List<String> canonicalizedSecondResultSet = secondResultSet.stream().map(canonicalizationRule)
Expand Down
7 changes: 7 additions & 0 deletions src/sqlancer/IgnoreMeException.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,11 @@ public class IgnoreMeException extends RuntimeException {

private static final long serialVersionUID = 1L;

public IgnoreMeException() {
super();
}

public IgnoreMeException(String message) {
super(message);
}
}
2 changes: 2 additions & 0 deletions src/sqlancer/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import sqlancer.common.query.Query;
import sqlancer.common.query.SQLancerResultSet;
import sqlancer.databend.DatabendProvider;
import sqlancer.datafusion.DataFusionProvider;
import sqlancer.doris.DorisProvider;
import sqlancer.duckdb.DuckDBProvider;
import sqlancer.h2.H2Provider;
Expand Down Expand Up @@ -734,6 +735,7 @@ private static void checkForIssue799(List<DatabaseProvider<?, ?, ?>> providers)
providers.add(new CnosDBProvider());
providers.add(new CockroachDBProvider());
providers.add(new DatabendProvider());
providers.add(new DataFusionProvider());
providers.add(new DorisProvider());
providers.add(new DuckDBProvider());
providers.add(new H2Provider());
Expand Down
2 changes: 1 addition & 1 deletion src/sqlancer/common/query/SQLancerResultSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

public class SQLancerResultSet implements Closeable {

ResultSet rs;
public ResultSet rs;
private Runnable runnableEpilogue;

public SQLancerResultSet(ResultSet rs) {
Expand Down
29 changes: 19 additions & 10 deletions src/sqlancer/datafusion/DataFusionErrors.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,34 +44,43 @@ public static void registerExpectedExecutionErrors(ExpectedErrors errors) {
errors.add("There is only support Literal types for field at idx:");
errors.add("nth_value not supported for n:");
errors.add("Invalid argument error: Nested comparison: List(");
errors.add("This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal");
errors.add(
"This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal");

/*
* Known bugs
*/
errors.add("to type Int"); // https://github.com/apache/datafusion/issues/11249
errors.add("bitwise"); // https://github.com/apache/datafusion/issues/11260
errors.add("Sort expressions cannot be empty for streaming merge."); // https://github.com/apache/datafusion/issues/11561
errors.add("compute_utf8_flag_op_scalar failed to cast literal value NULL for operation"); // https://github.com/apache/datafusion/issues/11623
errors.add("Schema error: No field named "); // https://github.com/apache/datafusion/issues/12006
errors.add("Internal error: PhysicalExpr Column references column"); // https://github.com/apache/datafusion/issues/12012
errors.add("APPROX_"); // https://github.com/apache/datafusion/issues/12058
errors.add("External error: task"); // https://github.com/apache/datafusion/issues/12057
errors.add("NTH_VALUE"); // https://github.com/apache/datafusion/issues/12073
errors.add("SUBSTR"); // https://github.com/apache/datafusion/issues/12129
errors.add("NATURAL JOIN"); // https://github.com/apache/datafusion/issues/14015

/*
* False positives
*/
errors.add("Cannot cast string"); // ifnull() is passed two non-compattable type and caused execution error
errors.add("Physical plan does not support logical expression AggregateFunction"); // False positive: when aggr
// is generated in where
// clause
// False positive: when aggr is generated in where clause
errors.add("Physical plan does not support logical expression AggregateFunction");
errors.add("Unsupported ArrowType Utf8View"); // Maybe bug in arrow flight
// jdbc driver

/*
* Not critical, investigate in the future
*/
errors.add("does not match with the projection expression");
errors.add("invalid operator for nested");
errors.add("Arrow error: Cast error: Can't cast value");
errors.add("Nth value indices are 1 based");
/*
* Example query that triggers this error: create table t1(v1 int, v2 bool); select v1, sum(1) over (partition
* by v1 order by v2 range between 0 preceding and 0 following) from t1;
*
* Current error message: Arrow error: Invalid argument error: Invalid arithmetic operation: Boolean - Boolean
*
* TODO: The error message could be more meaningful to indicate that RANGE frame is not supported for boolean
* ORDER BY columns
*/
errors.add("Invalid arithmetic operation");
}
}
17 changes: 11 additions & 6 deletions src/sqlancer/datafusion/DataFusionOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sqlancer.datafusion.test.DataFusionNoCrashAggregate;
import sqlancer.datafusion.test.DataFusionNoCrashWindow;
import sqlancer.datafusion.test.DataFusionNoRECOracle;
import sqlancer.datafusion.test.DataFusionPQS;
import sqlancer.datafusion.test.DataFusionQueryPartitioningAggrTester;
import sqlancer.datafusion.test.DataFusionQueryPartitioningHavingTester;
import sqlancer.datafusion.test.DataFusionQueryPartitioningWhereTester;
Expand All @@ -26,13 +27,11 @@ public class DataFusionOptions implements DBMSSpecificOptions<DataFusionOracleFa

@Override
public List<DataFusionOracleFactory> getTestOracleFactory() {
return Arrays.asList(
// DataFusionOracleFactory.NO_CRASH_WINDOW,
// DataFusionOracleFactory.NO_CRASH_AGGREGATE,
DataFusionOracleFactory.NOREC, DataFusionOracleFactory.QUERY_PARTITIONING_WHERE
return Arrays.asList(DataFusionOracleFactory.PQS, DataFusionOracleFactory.NO_CRASH_WINDOW,
DataFusionOracleFactory.NO_CRASH_AGGREGATE, DataFusionOracleFactory.NOREC,
DataFusionOracleFactory.QUERY_PARTITIONING_WHERE);
// DataFusionOracleFactory.QUERY_PARTITIONING_AGGREGATE
// ,DataFusionOracleFactory.QUERY_PARTITIONING_HAVING
);
// DataFusionOracleFactory.QUERY_PARTITIONING_HAVING);
}

public enum DataFusionOracleFactory implements OracleFactory<DataFusionGlobalState> {
Expand All @@ -42,6 +41,12 @@ public TestOracle<DataFusionGlobalState> create(DataFusionGlobalState globalStat
return new DataFusionNoRECOracle(globalState);
}
},
PQS {
@Override
public TestOracle<DataFusionGlobalState> create(DataFusionGlobalState globalState) throws SQLException {
return new DataFusionPQS(globalState);
}
},
QUERY_PARTITIONING_WHERE {
@Override
public TestOracle<DataFusionGlobalState> create(DataFusionGlobalState globalState) throws SQLException {
Expand Down
56 changes: 47 additions & 9 deletions src/sqlancer/datafusion/DataFusionProvider.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package sqlancer.datafusion;

import static sqlancer.datafusion.DataFusionUtil.DataFusionLogger.DataFusionLogType.DML;
import static sqlancer.datafusion.DataFusionUtil.dfAssert;
import static sqlancer.datafusion.DataFusionUtil.displayTables;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;
import java.util.Optional;
import java.util.Properties;
import java.util.stream.Collectors;

Expand All @@ -34,29 +34,52 @@ public DataFusionProvider() {
super(DataFusionGlobalState.class, DataFusionOptions.class);
}

// Basic tables generated are DataFusion memory tables (named t1, t2, ...)
// Equivalent table can be backed by different physical implementation
// which will be named like t1_stringview, t2_parquet, etc.
//
// e.g. t1 and t1_stringview are logically equivalent table, but backed by
// different physical representation
//
// This helps to do more metamorphic testing on tables, for example
// `select * from t1` and `select * from t1_stringview` should give same
// result
//
// Supported physical implementation for tables:
// 1. Memory table (t1)
// 2. Memory table use StringView for TEXT columns (t1_stringview)
// Note: It's possible only convert random TEXT columns to StringView
@Override
public void generateDatabase(DataFusionGlobalState globalState) throws Exception {
int tableCount = Randomly.fromOptions(1, 2, 3, 4, 5, 6, 7);
// Create base tables
// ============================

int tableCount = Randomly.fromOptions(1, 2, 3, 4);
for (int i = 0; i < tableCount; i++) {
SQLQueryAdapter queryCreateRandomTable = new DataFusionTableGenerator().getQuery(globalState);
SQLQueryAdapter queryCreateRandomTable = new DataFusionTableGenerator().getCreateStmt(globalState);
queryCreateRandomTable.execute(globalState);
globalState.updateSchema();
globalState.dfLogger.appendToLog(DML, queryCreateRandomTable.toString() + "\n");
globalState.dfLogger.appendToLog(DataFusionLogger.DataFusionLogType.DML,
queryCreateRandomTable.toString() + "\n");
}

// Now only `INSERT` DML is supported
// If more DMLs are added later, should use`StatementExecutor` instead
// (see DuckDB's implementation for reference)

// Generating rows in base tables (t1, t2, ... not include t1_stringview, etc.)
// ============================

globalState.updateSchema();
List<DataFusionTable> allTables = globalState.getSchema().getDatabaseTables();
List<String> allTablesName = allTables.stream().map(t -> t.getName()).collect(Collectors.toList());
if (allTablesName.isEmpty()) {
List<DataFusionTable> allBaseTables = globalState.getSchema().getDatabaseTables();
List<String> allBaseTablesName = allBaseTables.stream().map(DataFusionTable::getName)
.collect(Collectors.toList());
if (allBaseTablesName.isEmpty()) {
dfAssert(false, "Generate Database failed.");
}

// Randomly insert some data into existing tables
for (DataFusionTable table : allTables) {
for (DataFusionTable table : allBaseTables) {
int nInsertQuery = globalState.getRandomly().getInteger(0, globalState.getOptions().getMaxNumberInserts());

for (int i = 0; i < nInsertQuery; i++) {
Expand All @@ -69,9 +92,24 @@ public void generateDatabase(DataFusionGlobalState globalState) throws Exception
}

insertQuery.execute(globalState);
globalState.dfLogger.appendToLog(DML, insertQuery.toString() + "\n");
globalState.dfLogger.appendToLog(DataFusionLogger.DataFusionLogType.DML, insertQuery.toString() + "\n");
}
}

// Construct mutated tables like t1_stringview, etc.
// ============================
for (DataFusionTable table : allBaseTables) {
Optional<SQLQueryAdapter> queryCreateStringViewTable = new DataFusionTableGenerator()
.createStringViewTable(globalState, table);
if (queryCreateStringViewTable.isPresent()) {
queryCreateStringViewTable.get().execute(globalState);
globalState.dfLogger.appendToLog(DataFusionLogger.DataFusionLogType.DML,
queryCreateStringViewTable.get().toString() + "\n");
}
}
globalState.updateSchema();
List<DataFusionTable> allTables = globalState.getSchema().getDatabaseTables();
List<String> allTablesName = allTables.stream().map(DataFusionTable::getName).collect(Collectors.toList());

// TODO(datafusion) add `DataFUsionLogType.STATE` for this whole db state log
if (globalState.getDbmsSpecificOptions().showDebugInfo) {
Expand Down
91 changes: 90 additions & 1 deletion src/sqlancer/datafusion/DataFusionSchema.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import sqlancer.Randomly;
Expand All @@ -32,6 +34,9 @@ public DataFusionSchema(List<DataFusionTable> databaseTables) {

// update existing tables in DB by query again
// (like `show tables;`)
//
// This function also setup table<->column reference pointers
// and equivalent tables(see `DataFusionTable.equivalentTables)
public static DataFusionSchema fromConnection(SQLConnection con, String databaseName) throws SQLException {
List<DataFusionTable> databaseTables = new ArrayList<>();
List<String> tableNames = getTableNames(con);
Expand All @@ -47,6 +52,24 @@ public static DataFusionSchema fromConnection(SQLConnection con, String database
databaseTables.add(t);
}

// Setup equivalent tables
// For example, now we have t1, t1_csv, t1_parquet, t2_csv, t2_parquet
// t1's equivalent tables: t1, t1_csv, t1_parquet
// t2_csv's equivalent tables: t2_csv, t2_parquet
// ...
//
// It can be assumed that:
// base table names are like t1, t2, ...
// equivalent tables are like t1_csv, t1_parquet, ...
for (DataFusionTable t : databaseTables) {
String baseTableName = t.getName().split("_")[0];
String patternString = "^" + baseTableName + "(_.*)?$"; // t1 or t1_*
Pattern pattern = Pattern.compile(patternString);

t.equivalentTables = databaseTables.stream().filter(table -> pattern.matcher(table.getName()).matches())
.map(DataFusionTable::getName).collect(Collectors.toList());
}

return new DataFusionSchema(databaseTables);
}

Expand Down Expand Up @@ -120,8 +143,10 @@ public static DataFusionDataType parseFromDataFusionCatalog(String typeString) {
return DataFusionDataType.BOOLEAN;
case "Utf8":
return DataFusionDataType.STRING;
case "Utf8View":
return DataFusionDataType.STRING;
default:
dfAssert(false, "Unreachable. All branches should be eovered");
dfAssert(false, "Uncovered branch typeString: " + typeString);
}

dfAssert(false, "Unreachable. All branches should be eovered");
Expand Down Expand Up @@ -169,25 +194,89 @@ public Node<DataFusionExpression> getRandomConstant(DataFusionGlobalState state)
public static class DataFusionColumn extends AbstractTableColumn<DataFusionTable, DataFusionDataType> {

private final boolean isNullable;
public Optional<String> alias;

public DataFusionColumn(String name, DataFusionDataType columnType, boolean isNullable) {
super(name, null, columnType);
this.isNullable = isNullable;
this.alias = Optional.empty();
}

public boolean isNullable() {
return isNullable;
}

public String getOrignalName() {
return getTable().getName() + "." + getName();
}

@Override
public String getFullQualifiedName() {
if (getTable() == null) {
return getName();
} else {
if (alias.isPresent()) {
return alias.get();
} else {
return getTable().getName() + "." + getName();
}
}
}
}

public static class DataFusionTable
extends AbstractRelationalTable<DataFusionColumn, TableIndex, DataFusionGlobalState> {
// There might exist multiple logically equivalent tables with
// different physical format.
// e.g. t1_csv, t1_parquet, ...
//
// When generating random query, it's possible to randomly pick one
// of them for stronger randomization.
public List<String> equivalentTables;

// Pick a random equivalent table name
// This can be used when generating differential queries
public Optional<String> currentEquivalentTableName;

// For example in query `select * from t1 as tt1, t1 as tt2`
// `tt1` is the alias for the first occurance of `t1`
public Optional<String> alias;

public DataFusionTable(String tableName, List<DataFusionColumn> columns, boolean isView) {
super(tableName, columns, Collections.emptyList(), isView);
}

public String getNotAliasedName() {
if (currentEquivalentTableName != null && currentEquivalentTableName.isPresent()) {
// In case setup is not done yet
return currentEquivalentTableName.get();
} else {
return super.getName();
}
}

// TODO(datafusion) Now implementation is hacky, should send a patch
// to core to support this
@Override
public String getName() {
// Before setup equivalent tables, we use the original table name
// Setup happens in `fromConnection()`
if (equivalentTables == null || currentEquivalentTableName == null) {
return super.getName();
}

if (alias.isPresent()) {
return alias.get();
} else {
return currentEquivalentTableName.get();
}
}

public void pickAnotherEquivalentTableName() {
dfAssert(!equivalentTables.isEmpty(), "equivalentTables should not be empty");
currentEquivalentTableName = Optional.of(Randomly.fromList(equivalentTables));
}

public static List<DataFusionColumn> getAllColumns(List<DataFusionTable> tables) {
return tables.stream().map(AbstractTable::getColumns).flatMap(List::stream).collect(Collectors.toList());
}
Expand Down
Loading

0 comments on commit 321c9d1

Please sign in to comment.