Skip to content

Commit

Permalink
Add hive metastore catalog support (part 2/2) (#285)
Browse files Browse the repository at this point in the history
* fmt members

* setup basic test-infra for hms-catalog

* add license

* add hms create_namespace

* add hms get_namespace

* fix: typo

* add hms namespace_exists and drop_namespace

* add hms update_namespace

* move fns into HmsCatalog

* use `expose` in docker-compose

* add hms list_tables

* fix: clippy

* fix: cargo sort

* fix: cargo workspace

* move fns into utils + add constants

* include database name in error msg

* add pilota to cargo workspace

* add minio version

* change visibility to pub(crate); return namespace from conversion fn

* add minio version in rest-catalog docker-compose

* fix: hms test docker infrastructure

* add version to minio/mc

* fix: license header

* fix: core-site

* split utils and errors

* add fn get_default_table_location

* add fn get_metadata_location

* add docs

* add HiveSchemaBuilder

* add schema to HiveSchemaBuilder

* add convert_to_hive_table

* cargo sort

* implement table_ops without TableMetadataBuilder

* refactor: HiveSchema fn from_iceberg

* prepare table creation without metadata

* simplify HiveSchemaBuilder

* refactor: use ok_or_else()

* simplify HiveSchemaBuilder

* fix visibility of consts

* change serde metadata v2

* change default partition_specs and sort_orders

* change test

* add create table with metadata

* use FileIO::from_path

* add test_load_table

* small fixes + docs

* rename

* extract get_metadata_location from hive_table

* add integration tests

* fix: clippy

* remove whitespace

* fix: fixture names

* remove builder-prefix `with`

* capitalize error msg

* remove trait bound `Display`

* add const `OWNER`

* fix: default warehouse location

* add test-case `list_tables`

* add all primitives to test_schema

* exclude `Timestamptz` from hive conversion

* remove Self::T from schema

* remove context

* keep file_io in HmsCatalog

* use json schema repr

---------

Co-authored-by: mlanhenke <[email protected]>
  • Loading branch information
marvinlanhenke and mlanhenke authored Mar 22, 2024
1 parent 757ef4c commit 0629ad5
Show file tree
Hide file tree
Showing 7 changed files with 1,167 additions and 70 deletions.
5 changes: 4 additions & 1 deletion crates/catalog/hms/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,17 @@ keywords = ["iceberg", "hive", "catalog"]
[dependencies]
anyhow = { workspace = true }
async-trait = { workspace = true }
chrono = { workspace = true }
hive_metastore = { workspace = true }
iceberg = { workspace = true }
log = { workspace = true }
pilota = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
typed-builder = { workspace = true }
uuid = { workspace = true }
volo-thrift = { workspace = true }

[dev-dependencies]
iceberg_test_utils = { path = "../../test_utils", features = ["tests"] }
port_scanner = { workspace = true }
tokio = { workspace = true }
231 changes: 216 additions & 15 deletions crates/catalog/hms/src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,18 @@
// specific language governing permissions and limitations
// under the License.

use crate::error::from_io_error;
use crate::error::from_thrift_error;

use super::utils::*;
use async_trait::async_trait;
use hive_metastore::ThriftHiveMetastoreClient;
use hive_metastore::ThriftHiveMetastoreClientBuilder;
use hive_metastore::ThriftHiveMetastoreGetDatabaseException;
use hive_metastore::ThriftHiveMetastoreGetTableException;
use iceberg::io::FileIO;
use iceberg::spec::TableMetadata;
use iceberg::spec::TableMetadataBuilder;
use iceberg::table::Table;
use iceberg::{
Catalog, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, TableCreation,
Expand All @@ -28,6 +35,8 @@ use iceberg::{
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::net::ToSocketAddrs;
use tokio::io::AsyncReadExt;
use tokio::io::AsyncWriteExt;
use typed_builder::TypedBuilder;
use volo_thrift::ResponseError;

Expand All @@ -47,6 +56,9 @@ pub enum HmsThriftTransport {
pub struct HmsCatalogConfig {
address: String,
thrift_transport: HmsThriftTransport,
warehouse: String,
#[builder(default)]
props: HashMap<String, String>,
}

struct HmsClient(ThriftHiveMetastoreClient);
Expand All @@ -55,6 +67,7 @@ struct HmsClient(ThriftHiveMetastoreClient);
pub struct HmsCatalog {
config: HmsCatalogConfig,
client: HmsClient,
file_io: FileIO,
}

impl Debug for HmsCatalog {
Expand Down Expand Up @@ -92,11 +105,20 @@ impl HmsCatalog {
.build(),
};

let file_io = FileIO::from_path(&config.warehouse)?
.with_props(&config.props)
.build()?;

Ok(Self {
config,
client: HmsClient(client),
file_io,
})
}
/// Get the catalogs `FileIO`
pub fn file_io(&self) -> FileIO {
self.file_io.clone()
}
}

#[async_trait]
Expand Down Expand Up @@ -173,7 +195,7 @@ impl Catalog for HmsCatalog {
let db = self
.client
.0
.get_database(name.clone().into())
.get_database(name.into())
.await
.map_err(from_thrift_error)?;

Expand All @@ -197,7 +219,7 @@ impl Catalog for HmsCatalog {
async fn namespace_exists(&self, namespace: &NamespaceIdent) -> Result<bool> {
let name = validate_namespace(namespace)?;

let resp = self.client.0.get_database(name.clone().into()).await;
let resp = self.client.0.get_database(name.into()).await;

match resp {
Ok(_) => Ok(true),
Expand Down Expand Up @@ -269,13 +291,22 @@ impl Catalog for HmsCatalog {
Ok(())
}

/// Asynchronously lists all tables within a specified namespace.
///
/// # Returns
///
/// A `Result<Vec<TableIdent>>`, which is:
/// - `Ok(vec![...])` containing a vector of `TableIdent` instances, each
/// representing a table within the specified namespace.
/// - `Err(...)` if an error occurs during namespace validation or while
/// querying the database.
async fn list_tables(&self, namespace: &NamespaceIdent) -> Result<Vec<TableIdent>> {
let name = validate_namespace(namespace)?;

let tables = self
.client
.0
.get_all_tables(name.clone().into())
.get_all_tables(name.into())
.await
.map_err(from_thrift_error)?;

Expand All @@ -287,31 +318,201 @@ impl Catalog for HmsCatalog {
Ok(tables)
}

/// Creates a new table within a specified namespace using the provided
/// table creation settings.
///
/// # Returns
/// A `Result` wrapping a `Table` object representing the newly created
/// table.
///
/// # Errors
/// This function may return an error in several cases, including invalid
/// namespace identifiers, failure to determine a default storage location,
/// issues generating or writing table metadata, and errors communicating
/// with the Hive Metastore.
async fn create_table(
&self,
_namespace: &NamespaceIdent,
_creation: TableCreation,
namespace: &NamespaceIdent,
creation: TableCreation,
) -> Result<Table> {
todo!()
let db_name = validate_namespace(namespace)?;
let table_name = creation.name.clone();

let location = match &creation.location {
Some(location) => location.clone(),
None => {
let ns = self.get_namespace(namespace).await?;
get_default_table_location(&ns, &table_name, &self.config.warehouse)
}
};

let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?;
let metadata_location = create_metadata_location(&location, 0)?;

let mut file = self
.file_io
.new_output(&metadata_location)?
.writer()
.await?;
file.write_all(&serde_json::to_vec(&metadata)?).await?;
file.shutdown().await?;

let hive_table = convert_to_hive_table(
db_name.clone(),
metadata.current_schema(),
table_name.clone(),
location,
metadata_location.clone(),
metadata.properties(),
)?;

self.client
.0
.create_table(hive_table)
.await
.map_err(from_thrift_error)?;

let table = Table::builder()
.file_io(self.file_io())
.metadata_location(metadata_location)
.metadata(metadata)
.identifier(TableIdent::new(NamespaceIdent::new(db_name), table_name))
.build();

Ok(table)
}

async fn load_table(&self, _table: &TableIdent) -> Result<Table> {
todo!()
/// Loads a table from the Hive Metastore and constructs a `Table` object
/// based on its metadata.
///
/// # Returns
/// A `Result` wrapping a `Table` object that represents the loaded table.
///
/// # Errors
/// This function may return an error in several scenarios, including:
/// - Failure to validate the namespace.
/// - Failure to retrieve the table from the Hive Metastore.
/// - Absence of metadata location information in the table's properties.
/// - Issues reading or deserializing the table's metadata file.
async fn load_table(&self, table: &TableIdent) -> Result<Table> {
let db_name = validate_namespace(table.namespace())?;

let hive_table = self
.client
.0
.get_table(db_name.clone().into(), table.name.clone().into())
.await
.map_err(from_thrift_error)?;

let metadata_location = get_metadata_location(&hive_table.parameters)?;

let mut reader = self.file_io.new_input(&metadata_location)?.reader().await?;
let mut metadata_str = String::new();
reader.read_to_string(&mut metadata_str).await?;
let metadata = serde_json::from_str::<TableMetadata>(&metadata_str)?;

let table = Table::builder()
.file_io(self.file_io())
.metadata_location(metadata_location)
.metadata(metadata)
.identifier(TableIdent::new(
NamespaceIdent::new(db_name),
table.name.clone(),
))
.build();

Ok(table)
}

async fn drop_table(&self, _table: &TableIdent) -> Result<()> {
todo!()
/// Asynchronously drops a table from the database.
///
/// # Errors
/// Returns an error if:
/// - The namespace provided in `table` cannot be validated
/// or does not exist.
/// - The underlying database client encounters an error while
/// attempting to drop the table. This includes scenarios where
/// the table does not exist.
/// - Any network or communication error occurs with the database backend.
async fn drop_table(&self, table: &TableIdent) -> Result<()> {
let db_name = validate_namespace(table.namespace())?;

self.client
.0
.drop_table(db_name.into(), table.name.clone().into(), false)
.await
.map_err(from_thrift_error)?;

Ok(())
}

async fn table_exists(&self, _table: &TableIdent) -> Result<bool> {
todo!()
/// Asynchronously checks the existence of a specified table
/// in the database.
///
/// # Returns
/// - `Ok(true)` if the table exists in the database.
/// - `Ok(false)` if the table does not exist in the database.
/// - `Err(...)` if an error occurs during the process
async fn table_exists(&self, table: &TableIdent) -> Result<bool> {
let db_name = validate_namespace(table.namespace())?;
let table_name = table.name.clone();

let resp = self
.client
.0
.get_table(db_name.into(), table_name.into())
.await;

match resp {
Ok(_) => Ok(true),
Err(err) => {
if let ResponseError::UserException(ThriftHiveMetastoreGetTableException::O2(_)) =
&err
{
Ok(false)
} else {
Err(from_thrift_error(err))
}
}
}
}

async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> Result<()> {
todo!()
/// Asynchronously renames a table within the database
/// or moves it between namespaces (databases).
///
/// # Returns
/// - `Ok(())` on successful rename or move of the table.
/// - `Err(...)` if an error occurs during the process.
async fn rename_table(&self, src: &TableIdent, dest: &TableIdent) -> Result<()> {
let src_dbname = validate_namespace(src.namespace())?;
let dest_dbname = validate_namespace(dest.namespace())?;

let src_tbl_name = src.name.clone();
let dest_tbl_name = dest.name.clone();

let mut tbl = self
.client
.0
.get_table(src_dbname.clone().into(), src_tbl_name.clone().into())
.await
.map_err(from_thrift_error)?;

tbl.db_name = Some(dest_dbname.into());
tbl.table_name = Some(dest_tbl_name.into());

self.client
.0
.alter_table(src_dbname.into(), src_tbl_name.into(), tbl)
.await
.map_err(from_thrift_error)?;

Ok(())
}

async fn update_table(&self, _commit: TableCommit) -> Result<Table> {
todo!()
Err(Error::new(
ErrorKind::FeatureUnsupported,
"Updating a table is not supported yet",
))
}
}
42 changes: 42 additions & 0 deletions crates/catalog/hms/src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use anyhow::anyhow;
use iceberg::{Error, ErrorKind};
use std::fmt::Debug;
use std::io;

/// Format a thrift error into iceberg error.
pub fn from_thrift_error<T>(error: volo_thrift::error::ResponseError<T>) -> Error
where
T: Debug,
{
Error::new(
ErrorKind::Unexpected,
"Operation failed for hitting thrift error".to_string(),
)
.with_source(anyhow!("thrift error: {:?}", error))
}

/// Format an io error into iceberg error.
pub fn from_io_error(error: io::Error) -> Error {
Error::new(
ErrorKind::Unexpected,
"Operation failed for hitting io error".to_string(),
)
.with_source(error)
}
2 changes: 2 additions & 0 deletions crates/catalog/hms/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@
mod catalog;
pub use catalog::*;

mod error;
mod schema;
mod utils;
Loading

0 comments on commit 0629ad5

Please sign in to comment.