Skip to content

Commit

Permalink
[improvement](statistics)Drop expired external stats only when the ca…
Browse files Browse the repository at this point in the history
…talog is dropped. (#42244)

Drop expired external stats only when the catalog is dropped, to reduce
meta store access.
Previously, we went through all external catalogs and their DBs and tables to
check for expired stats, which could cause a lot of meta store access.
  • Loading branch information
Jibing-Li authored Oct 24, 2024
1 parent fec6e0c commit 38313fc
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -176,10 +177,9 @@ private boolean init() {

private Map<Long, DatabaseIf<? extends TableIf>> constructDbMap() {
Map<Long, DatabaseIf<? extends TableIf>> idToDb = Maps.newHashMap();
for (CatalogIf<? extends DatabaseIf<? extends TableIf>> ctl : idToCatalog.values()) {
for (DatabaseIf<? extends TableIf> db : ctl.getAllDbs()) {
idToDb.put(db.getId(), db);
}
Collection<DatabaseIf<? extends TableIf>> internalDBs = Env.getCurrentEnv().getInternalCatalog().getAllDbs();
for (DatabaseIf<? extends TableIf> db : internalDBs) {
idToDb.put(db.getId(), db);
}
return idToDb;
}
Expand Down Expand Up @@ -268,6 +268,16 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats,
expiredStats.expiredCatalog.add(catalogId);
continue;
}
// Skip checking external DBs and tables to avoid fetching too much metadata.
// Expired external table stats are removed only when the external catalog is dropped.
// TODO: Check whether the external database and table still exist. For now, we only check the catalog,
// because the column_statistics table only keeps the table id and db id,
// and the metadata cache doesn't always hold the ids of all external tables.
// So looking up an external table by id alone may fail; the db name and table name are needed instead.
// The db name and table name will have to be stored in column_statistics in the future.
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID) {
continue;
}
long dbId = statsId.dbId;
if (!idToDb.containsKey(dbId)) {
expiredStats.expiredDatabase.add(dbId);
Expand Down
76 changes: 76 additions & 0 deletions regression-test/suites/statistics/test_drop_expired_stats.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("test_drop_expired_stats") {
    // Regression test for `drop expired stats` on internal tables: rows in
    // __internal_schema.column_statistics must disappear once their table
    // (or the whole database) has been dropped, while the rows of tables that
    // still exist must be left alone.
    //
    // NOTE: continuation lines of the multi-line SQL literals are kept flush
    // left on purpose so that the SQL text sent to the server is unchanged.

    // Counts the rows stored in column_statistics for the given table id.
    def statsRowCount = { tableId ->
        def rows = sql """select * from __internal_schema.column_statistics where tbl_id = ${tableId}"""
        return rows.size()
    }

    // Start from a freshly created database and switch the global
    // enable_auto_analyze flag off (presumably so that background analysis
    // cannot change the row counts asserted below).
    sql """drop database if exists test_drop_expired_stats"""
    sql """create database test_drop_expired_stats"""
    sql """use test_drop_expired_stats"""
    sql """set global enable_auto_analyze=false"""

    // table1: five columns, so a full analyze is expected to yield five rows.
    sql """CREATE TABLE table1 (
key1 bigint NOT NULL,
key2 bigint NOT NULL,
value1 int NOT NULL,
value2 int NOT NULL,
value3 int NOT NULL
)ENGINE=OLAP
DUPLICATE KEY(`key1`, `key2`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`key1`) BUCKETS 1
PROPERTIES (
"replication_num" = "1"
)
"""

    // table2: three columns, so a full analyze is expected to yield three rows.
    sql """CREATE TABLE table2 (
key1 bigint NOT NULL,
key2 bigint NOT NULL,
value1 int NOT NULL
)ENGINE=OLAP
DUPLICATE KEY(`key1`, `key2`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`key1`) BUCKETS 1
PROPERTIES (
"replication_num" = "1"
)
"""

    // Remember the table ids now; they are still needed after the tables are gone.
    def firstTableId = getTableId("test_drop_expired_stats", "table1")
    def secondTableId = getTableId("test_drop_expired_stats", "table2")

    // Collect statistics synchronously and verify one row per column.
    sql """analyze table table1 with sync"""
    sql """analyze table table2 with sync"""
    assertEquals(5, statsRowCount(firstTableId))
    assertEquals(3, statsRowCount(secondTableId))

    // Dropping table1 makes only its stats expired; table2's stats must survive.
    sql """drop table table1"""
    sql """drop expired stats"""
    assertEquals(0, statsRowCount(firstTableId))
    assertEquals(3, statsRowCount(secondTableId))

    // Dropping the database expires the stats of every table that lived in it.
    sql """drop database if exists test_drop_expired_stats"""
    sql """drop expired stats"""
    assertEquals(0, statsRowCount(firstTableId))
    assertEquals(0, statsRowCount(secondTableId))
}

0 comments on commit 38313fc

Please sign in to comment.