Merge branch 'main' into writer-compression
yikf authored Nov 27, 2024
2 parents 98b3b06 + 2649fa7 commit d2bad7d
Showing 23 changed files with 401 additions and 69 deletions.
19 changes: 12 additions & 7 deletions .github/workflows/build_bundle_package.yml
@@ -38,9 +38,8 @@ on:
 jobs:
   build-native-lib:
     runs-on: ubuntu-20.04
-    container: apache/gluten:vcpkg-centos-7
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v3
         with:
@@ -50,9 +49,15 @@ jobs:
             ccache-centos7-release-default
       - name: Build Gluten velox third party
         run: |
-          df -a
-          cd $GITHUB_WORKSPACE/
-          bash dev/ci-velox-buildstatic-centos-7.sh
+          docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
+            df -a
+            cd /work
+            export CCACHE_DIR=/work/.ccache
+            bash dev/ci-velox-buildstatic-centos-7.sh
+            ccache -s
+            mkdir -p /work/.m2/repository/org/apache/arrow/
+            cp -r /root/.m2/repository/org/apache/arrow/* /work/.m2/repository/org/apache/arrow/
+          "
       - name: Upload native libs
         uses: actions/upload-artifact@v3
         with:
@@ -65,10 +70,10 @@ jobs:
           path: /root/.m2/repository/org/apache/arrow/
           name: velox-arrow-jar-centos-7-${{github.sha}}

-  build-bundle-package-centos7:
+  build-bundle-package-centos8:
     needs: build-native-lib
     runs-on: ubuntu-20.04
-    container: centos:7
+    container: centos:8
     steps:
       - uses: actions/checkout@v2
       - name: Download All Artifacts
98 changes: 83 additions & 15 deletions .github/workflows/velox_backend.yml
@@ -56,9 +56,8 @@ concurrency:
 jobs:
   build-native-lib-centos-7:
     runs-on: ubuntu-20.04
-    container: apache/gluten:vcpkg-centos-7
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v3
         with:
@@ -68,10 +67,17 @@ jobs:
             ccache-centos7-release-default
       - name: Build Gluten native libraries
         run: |
-          df -a
-          cd $GITHUB_WORKSPACE/
-          bash dev/ci-velox-buildstatic-centos-7.sh
-          ccache -s
+          docker pull apache/gluten:vcpkg-centos-7
+          docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
+            df -a
+            cd /work
+            export CCACHE_DIR=/work/.ccache
+            bash dev/ci-velox-buildstatic-centos-7.sh
+            ccache -s
+            mkdir -p /work/.m2/repository/org/apache/arrow/
+            cp -r /root/.m2/repository/org/apache/arrow/* /work/.m2/repository/org/apache/arrow/
+          "
       - name: "Save ccache"
         uses: actions/cache/save@v3
         id: ccache
@@ -85,7 +91,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         with:
           name: arrow-jars-centos-7-${{github.sha}}
-          path: /root/.m2/repository/org/apache/arrow/
+          path: .m2/repository/org/apache/arrow/

   run-tpc-test-ubuntu:
     needs: build-native-lib-centos-7
@@ -158,12 +164,12 @@ jobs:
           && GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
             --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1

-  run-tpc-test-centos:
+  run-tpc-test-centos8:
     needs: build-native-lib-centos-7
     strategy:
       fail-fast: false
       matrix:
-        os: [ "centos:7", "centos:8" ]
+        os: [ "centos:8" ]
         spark: [ "spark-3.2", "spark-3.3", "spark-3.4", "spark-3.5" ]
         java: [ "java-8", "java-11", "java-17" ]
         # Spark supports JDK17 since 3.3 and later, see https://issues.apache.org/jira/browse/SPARK-33772
@@ -249,6 +255,68 @@ jobs:
             --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
             --extra-conf=spark.gluten.ras.enabled=true

+  run-tpc-test-centos7:
+    needs: build-native-lib-centos-7
+    strategy:
+      fail-fast: false
+      matrix:
+        spark: [ "spark-3.2", "spark-3.3", "spark-3.4", "spark-3.5" ]
+        java: [ "java-8" ]
+        # Spark supports JDK17 since 3.3 and later, see https://issues.apache.org/jira/browse/SPARK-33772
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download All Native Artifacts
+        uses: actions/download-artifact@v3
+        with:
+          name: velox-native-lib-centos-7-${{github.sha}}
+          path: ./cpp/build/releases/
+      - name: Download All Arrow Jar Artifacts
+        uses: actions/download-artifact@v3
+        with:
+          name: arrow-jars-centos-7-${{github.sha}}
+          path: .m2/repository/org/apache/arrow/
+      - name: Build and run TPCH/DS tests
+        run: |
+          docker pull centos:7
+          docker run -v $GITHUB_WORKSPACE:/work -v /$GITHUB_WORKSPACE/.m2:/root/.m2/ -w /work \
+            -e matrix.java=${{ matrix.java }} -e matrix.spark=${{ matrix.spark }} \
+            centos:7 \
+            bash -c "
+              sed -i -e 's|mirrorlist=|#mirrorlist=|g' /etc/yum.repos.d/CentOS-* || true
+              sed -i -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* || true
+              # Setup java and maven
+              yum update -y && yum install -y java-1.8.0-openjdk-devel wget
+              wget -nv https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
+              tar -xvf apache-maven-3.8.8-bin.tar.gz && mv apache-maven-3.8.8 /usr/lib/maven
+              export PATH=${PATH}:/usr/lib/maven/bin
+              # Set environment variables
+              export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
+              # Build gluten-it
+              mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }} -Pbackends-velox -DskipTests
+              cd /work/tools/gluten-it
+              mvn -ntp clean install -P${{ matrix.spark }} -P${{ matrix.java }}
+              # Run TPC-H / TPC-DS
+              GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
+                --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
+              GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
+                --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
+              # Run TPC-H / TPC-DS with RAS
+              cd /work/tools/gluten-it
+              GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
+                --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
+                --extra-conf=spark.gluten.ras.enabled=true \
+                && GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
+                --local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
+                --extra-conf=spark.gluten.ras.enabled=true
+              "

   run-tpc-test-ubuntu-oom:
     needs: build-native-lib-centos-7
     strategy:
@@ -956,12 +1024,12 @@ jobs:
           df -a
           bash dev/ci-velox-buildshared-centos-8.sh
           ccache -s
-      - name: "Save ccache"
-        uses: actions/cache/save@v3
-        id: ccache
-        with:
-          path: '${{ env.CCACHE_DIR }}'
-          key: ccache-centos8-release-default-${{github.sha}}
+      # - name: "Save ccache"
+      #   uses: actions/cache/save@v3
+      #   id: ccache
+      #   with:
+      #     path: '${{ env.CCACHE_DIR }}'
+      #     key: ccache-centos8-release-default-${{github.sha}}
       - name: Run CPP unit test
         run: |
           cd ./cpp/build && ctest -V
8 changes: 4 additions & 4 deletions .github/workflows/velox_backend_cache.yml
@@ -31,9 +31,8 @@ concurrency:
 jobs:
   cache-native-lib-centos-7:
     runs-on: ubuntu-20.04
-    container: apache/gluten:vcpkg-centos-7
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Get Ccache
        uses: actions/cache/restore@v3
        with:
@@ -43,8 +42,9 @@ jobs:
             ccache-centos7-release-default
       - name: Build Gluten native libraries
         run: |
-          df -a
-          bash dev/ci-velox-buildstatic-centos-7.sh
+          docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
+            bash dev/ci-velox-buildstatic-centos-7.sh
+          "
       - name: Save Ccache
         uses: actions/cache/save@v3
         id: ccache
@@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
     }
   }

-  test("GLUTEN-3135: Bug fix to_date") {
+  test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
     val create_table_sql =
       """
         | create table test_tbl_3135(id bigint, data string) using parquet
@@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
         |(7, '1970-01-01 00:00:00'),
         |(8, '2024-3-2'),
         |(9, '2024-03-2'),
-        |(10, '2024-03')
+        |(10, '2024-03'),
+        |(11, '2024-03-02 11:22:33')
         |""".stripMargin
     spark.sql(create_table_sql)
     spark.sql(insert_data_sql)

     val select_sql = "select id, to_date(data) from test_tbl_3135"
     compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
+
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
+        true,
+        { _ => })
+    }
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
+        true,
+        { _ => })
+    }
     spark.sql("drop table test_tbl_3135")
   }

4 changes: 2 additions & 2 deletions cpp-ch/clickhouse.version
@@ -1,3 +1,3 @@
 CH_ORG=Kyligence
-CH_BRANCH=rebase_ch/20241111
-CH_COMMIT=3f7e46d4e9e
+CH_BRANCH=rebase_ch/20241118
+CH_COMMIT=7f22fe487c88d3b988ea82a5c34882da23ea6289
5 changes: 5 additions & 0 deletions cpp-ch/local-engine/Common/CHUtil.cpp
@@ -762,6 +762,11 @@ void BackendInitializerUtil::initSettings(const SparkConfigs::ConfigMap & spark_
             settings.set(key, toField(key, value));
             LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
         }
+        else if (key == TIMER_PARSER_POLICY)
+        {
+            settings.set(key, value);
+            LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
+        }
     }

     /// Finally apply some fixed kvs to settings.
1 change: 1 addition & 0 deletions cpp-ch/local-engine/Common/CHUtil.h
@@ -40,6 +40,7 @@ namespace local_engine
 static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = "mergetree.insert_without_local_storage";
 static const String MERGETREE_MERGE_AFTER_INSERT = "mergetree.merge_after_insert";
 static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = "spark.sql.decimalOperations.allowPrecisionLoss";
+static const std::string TIMER_PARSER_POLICY = "spark.sql.legacy.timeParserPolicy";

 static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
     MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
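The two CHUtil hunks above pass the Spark conf spark.sql.legacy.timeParserPolicy through to the ClickHouse settings map as a raw string, alongside the boolean-typed keys in BOOL_VALUE_SETTINGS that go through toField. A condensed sketch of that dispatch, assuming a Settings type with a set() method; the applySparkConf helper is invented for illustration, and the real loop in initSettings handles more cases:

#include <string>
#include <unordered_set>

// Constants mirrored from CHUtil.h; applySparkConf and Settings are
// illustrative stand-ins for the real initSettings loop.
static const std::string TIMER_PARSER_POLICY = "spark.sql.legacy.timeParserPolicy";
static const std::unordered_set<std::string> BOOL_VALUE_SETTINGS{
    "mergetree.merge_after_insert",
    "mergetree.insert_without_local_storage",
    "spark.sql.decimalOperations.allowPrecisionLoss"};

template <typename Settings>
void applySparkConf(Settings & settings, const std::string & key, const std::string & value)
{
    if (BOOL_VALUE_SETTINGS.contains(key))
        settings.set(key, value == "true");  // boolean keys are coerced (via toField in the real code)
    else if (key == TIMER_PARSER_POLICY)
        settings.set(key, value);            // the parser policy is kept as a raw string
}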
@@ -28,6 +28,36 @@ bool isMetaDataFile(const std::string & path)
     return !path.ends_with("bin");
 }

+TemporaryWriteBufferWrapper::TemporaryWriteBufferWrapper(
+    const String & file_name_, const std::shared_ptr<DB::TemporaryDataBuffer> & data_buffer_)
+    : WriteBufferFromFileBase(data_buffer_->buffer().size(), data_buffer_->buffer().begin(), 0)
+    , file_name(file_name_)
+    , data_buffer(data_buffer_)
+{
+}
+void TemporaryWriteBufferWrapper::preFinalize()
+{
+    next();
+}
+
+void TemporaryWriteBufferWrapper::finalizeImpl()
+{
+    next();
+    data_buffer->finalizeImpl();
+}
+
+void TemporaryWriteBufferWrapper::cancelImpl() noexcept
+{
+    data_buffer->cancelImpl();
+}
+
+void TemporaryWriteBufferWrapper::nextImpl()
+{
+    data_buffer->position() = position();
+    data_buffer->next();
+    BufferBase::set(data_buffer->buffer().begin(), data_buffer->buffer().size(), data_buffer->offset());
+}
+
 void CompactObjectStorageDiskTransaction::commit()
 {
     auto metadata_tx = disk.getMetadataStorage()->createTransaction();
@@ -52,9 +82,9 @@ void CompactObjectStorageDiskTransaction::commit()
         [&](auto & item)
         {
             DB::DiskObjectStorageMetadata metadata(object_storage->getCommonKeyPrefix(), item.first);
-            DB::ReadBufferFromFilePRead read(item.second->getAbsolutePath());
+            auto read = item.second->read();
             int file_size = 0;
-            while (int count = read.readBig(buffer.data(), buffer.size()))
+            while (int count = read->readBig(buffer.data(), buffer.size()))
             {
                 file_size += count;
                 out.write(buffer.data(), count);
@@ -98,12 +128,13 @@ std::unique_ptr<DB::WriteBufferFromFileBase> CompactObjectStorageDiskTransaction
             "Don't support write file in different dirs, path {}, prefix path: {}",
             path,
             prefix_path);
-    auto tmp = std::make_shared<DB::TemporaryFileOnDisk>(tmp_data);
+    auto tmp = std::make_shared<DB::TemporaryDataBuffer>(tmp_data.get());
     files.emplace_back(path, tmp);
     auto tx = disk.getMetadataStorage()->createTransaction();
     tx->createDirectoryRecursive(std::filesystem::path(path).parent_path());
     tx->createEmptyMetadataFile(path);
     tx->commit();
-    return std::make_unique<DB::WriteBufferFromFile>(tmp->getAbsolutePath(), buf_size);
+
+    return std::make_unique<TemporaryWriteBufferWrapper>(path, tmp);
 }
 }
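A minimal sketch of the round trip the two hunks above establish, assuming a live transaction built from this header; the path, payload, and the single-argument writeFile() call are illustrative, not the exact IDiskTransaction signature:

#include <memory>
#include <string>

// tx is assumed to be a live CompactObjectStorageDiskTransaction; the path
// and payload are made up for the example.
void writeOnePart(const std::shared_ptr<local_engine::CompactObjectStorageDiskTransaction> & tx)
{
    const std::string payload = "column data...";

    // Write path: the returned buffer is now a TemporaryWriteBufferWrapper that
    // spills into a DB::TemporaryDataBuffer instead of a named temp file on disk.
    auto out = tx->writeFile("store/tmp/part_data.gluten");
    out->write(payload.data(), payload.size());
    out->finalize();  // finalizeImpl(): flush pending bytes, then seal the temporary buffer

    // Commit path: commit() re-reads every buffered file via item.second->read()
    // and concatenates them into the compact part_data / part_meta objects.
    tx->commit();
}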
@@ -32,15 +32,41 @@ extern const int NOT_IMPLEMENTED;
 namespace local_engine
 {

+class TemporaryWriteBufferWrapper : public DB::WriteBufferFromFileBase
+{
+public:
+    TemporaryWriteBufferWrapper(const String & file_name_, const std::shared_ptr<DB::TemporaryDataBuffer> & data_buffer_);
+
+    void sync() override { data_buffer->nextImpl(); }
+
+    void preFinalize() override;
+
+protected:
+    void finalizeImpl() override;
+    void cancelImpl() noexcept override;
+
+private:
+    void nextImpl() override;
+
+public:
+    std::string getFileName() const override
+    {
+        return file_name;
+    }
+
+private:
+    String file_name;
+    std::shared_ptr<DB::TemporaryDataBuffer> data_buffer;
+};
+
 class CompactObjectStorageDiskTransaction: public DB::IDiskTransaction {
 public:
     static inline const String PART_DATA_FILE_NAME = "part_data.gluten";
     static inline const String PART_META_FILE_NAME = "part_meta.gluten";

-    explicit CompactObjectStorageDiskTransaction(DB::IDisk & disk_, const DB::DiskPtr tmp_)
+    explicit CompactObjectStorageDiskTransaction(DB::IDisk & disk_, const DB::TemporaryDataOnDiskScopePtr tmp_)
         : disk(disk_), tmp_data(tmp_)
     {
+        chassert(!tmp_->isRemote());
     }

     void commit() override;
@@ -170,8 +196,8 @@ class CompactObjectStorageDiskTransaction: public DB::IDiskTransaction {

 private:
     DB::IDisk & disk;
-    DB::DiskPtr tmp_data;
-    std::vector<std::pair<String, std::shared_ptr<DB::TemporaryFileOnDisk>>> files;
+    DB::TemporaryDataOnDiskScopePtr tmp_data;
+    std::vector<std::pair<String, std::shared_ptr<DB::TemporaryDataBuffer>>> files;
     String prefix_path = "";
 };
 }
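One detail worth noting in this header: the wrapper owns no buffer memory of its own. Its constructor points the WriteBufferFromFileBase base at the TemporaryDataBuffer's working memory, and nextImpl() hands the write cursor back to the owner before re-basing itself. A self-contained toy model of that proxy idiom; the Inner and Proxy types are invented, and only the cursor-handoff pattern is taken from the diff:

#include <cstddef>
#include <iostream>
#include <vector>

// Inner owns the storage and knows how to flush it; Proxy writes directly
// into Inner's memory and only synchronizes cursors on flush, mirroring how
// TemporaryWriteBufferWrapper::nextImpl() forwards position() to data_buffer.
struct Inner
{
    std::vector<char> mem = std::vector<char>(8);
    size_t pos = 0;
    void flush()
    {
        std::cout.write(mem.data(), pos);  // stand-in for spilling to disk
        pos = 0;                           // fresh working space
    }
};

struct Proxy
{
    Inner & inner;
    size_t pos = 0;
    void write(char c)
    {
        if (pos == inner.mem.size())
            next();
        inner.mem[pos++] = c;
    }
    void next()
    {
        inner.pos = pos;   // hand our cursor over, like data_buffer->position() = position()
        inner.flush();     // let the owner flush, like data_buffer->next()
        pos = inner.pos;   // re-base on the owner's new state, like BufferBase::set(...)
    }
};

int main()
{
    Inner inner;
    Proxy proxy{inner};
    for (char c : {'h', 'e', 'l', 'l', 'o', '\n'})
        proxy.write(c);
    proxy.next();  // final flush
}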