minor: fix typos in comments / structure names #13879

Merged
6 commits merged on Dec 23, 2024
Changes from 5 commits
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -73,7 +73,7 @@ version = "43.0.0"
# selectively turn them on if needed, since we can override default-features = true (from false)
# for the inherited dependency but cannot do the reverse (override from true to false).
#
-# See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
+# See for more details: https://github.com/rust-lang/cargo/issues/11329
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
2 changes: 1 addition & 1 deletion datafusion-cli/src/functions.rs
@@ -360,7 +360,7 @@ impl TableFunctionImpl for ParquetMetadataFunc {
Field::new("total_uncompressed_size", DataType::Int64, true),
]));

-// construct recordbatch from metadata
+// construct record batch from metadata
let mut filename_arr = vec![];
let mut row_group_id_arr = vec![];
let mut row_group_num_rows_arr = vec![];
2 changes: 1 addition & 1 deletion datafusion-examples/README.md
@@ -22,7 +22,7 @@
This crate includes end to end, highly commented examples of how to use
various DataFusion APIs to help you get started.

-## Prerequisites:
+## Prerequisites

Run `git submodule update --init` to init test files.

4 changes: 2 additions & 2 deletions datafusion-examples/examples/advanced_parquet_index.rs
@@ -82,7 +82,7 @@ use url::Url;
/// Specifically, this example illustrates how to:
/// 1. Use [`ParquetFileReaderFactory`] to avoid re-reading parquet metadata on each query
/// 2. Use [`PruningPredicate`] for predicate analysis
-/// 3. Pass a row group selection to [`ParuetExec`]
+/// 3. Pass a row group selection to [`ParquetExec`]
/// 4. Pass a row selection (within a row group) to [`ParquetExec`]
///
/// Note this is a *VERY* low level example for people who want to build their
@@ -211,7 +211,7 @@ async fn main() -> Result<()> {
//
// Note: in order to prune pages, the Page Index must be loaded and the
// ParquetExec will load it on demand if not present. To avoid a second IO
-// during query, this example loaded the Page Index pre-emptively by setting
+// during query, this example loaded the Page Index preemptively by setting
// `ArrowReader::with_page_index` in `IndexedFile::try_new`
provider.set_use_row_selection(true);
println!("** Select data, predicate `id = 950`");
4 changes: 2 additions & 2 deletions datafusion-examples/examples/analyzer_rule.rs
@@ -138,7 +138,7 @@ impl AnalyzerRule for RowLevelAccessControl {
fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
// use the TreeNode API to recursively walk the LogicalPlan tree
// and all of its children (inputs)
-let transfomed_plan = plan.transform(|plan| {
+let transformed_plan = plan.transform(|plan| {
// This closure is called for each LogicalPlan node
// if it is a Scan node, add a filter to remove all managers
if is_employee_table_scan(&plan) {
@@ -166,7 +166,7 @@ impl AnalyzerRule for RowLevelAccessControl {
//
// This example does not need the value of either flag, so simply
// extract the LogicalPlan "data"
-Ok(transfomed_plan.data)
+Ok(transformed_plan.data)
}

fn name(&self) -> &str {
10 changes: 5 additions & 5 deletions datafusion-examples/examples/catalog.rs
@@ -46,11 +46,11 @@ async fn main() -> Result<()> {

let ctx = SessionContext::new();
let state = ctx.state();
-let cataloglist = Arc::new(CustomCatalogProviderList::new());
+let catalog_list = Arc::new(CustomCatalogProviderList::new());

// use our custom catalog list for context. each context has a single catalog list.
// context will by default have [`MemoryCatalogProviderList`]
-ctx.register_catalog_list(cataloglist.clone());
+ctx.register_catalog_list(catalog_list.clone());

// initialize our catalog and schemas
let catalog = DirCatalog::new();
@@ -81,7 +81,7 @@ async fn main() -> Result<()> {
ctx.register_catalog("dircat", Arc::new(catalog));
{
// catalog was passed down into our custom catalog list since we override the ctx's default
-let catalogs = cataloglist.catalogs.read().unwrap();
+let catalogs = catalog_list.catalogs.read().unwrap();
assert!(catalogs.contains_key("dircat"));
};

@@ -144,8 +144,8 @@ impl DirSchema {
async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result<Arc<Self>> {
let DirSchemaOpts { ext, dir, format } = opts;
let mut tables = HashMap::new();
-let direntries = std::fs::read_dir(dir).unwrap();
-for res in direntries {
+let directories = std::fs::read_dir(dir).unwrap();
+for res in directories {

Contributor comment on this change: It is `dir_entries`
let entry = res.unwrap();
let filename = entry.file_name().to_str().unwrap().to_string();
if !filename.ends_with(ext) {
4 changes: 2 additions & 2 deletions datafusion-examples/examples/expr_api.rs
@@ -53,7 +53,7 @@ use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter;
/// 4. Simplify expressions: [`simplify_demo`]
/// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`]
/// 6. Get the types of the expressions: [`expression_type_demo`]
-/// 7. Apply type cocercion to expressions: [`type_coercion_demo`]
+/// 7. Apply type coercion to expressions: [`type_coercion_demo`]
#[tokio::main]
async fn main() -> Result<()> {
// The easiest way to do create expressions is to use the
@@ -392,7 +392,7 @@ fn type_coercion_demo() -> Result<()> {
)?;
assert!(physical_expr.evaluate(&batch).is_ok());

-// 4. Apply explict type coercion by manually rewriting the expression
+// 4. Apply explicit type coercion by manually rewriting the expression
let coerced_expr = expr
.transform(|e| {
// Only type coerces binary expressions.
2 changes: 1 addition & 1 deletion datafusion-examples/examples/function_factory.rs
@@ -36,7 +36,7 @@ use datafusion_expr::{
///
/// Apart from [FunctionFactory], this example covers
/// [ScalarUDFImpl::simplify()] which is often used at the same time, to replace
-/// a function call with another expression at rutime.
+/// a function call with another expression at runtime.
///
/// This example is rather simple and does not cover all cases required for a
/// real implementation.
2 changes: 1 addition & 1 deletion datafusion-examples/examples/memtable.rs
@@ -25,7 +25,7 @@ use std::sync::Arc;
use std::time::Duration;
use tokio::time::timeout;

-/// This example demonstrates executing a simple query against a Memtable
+/// This example demonstrates executing a simple query against a [`MemTable`]
#[tokio::main]
async fn main() -> Result<()> {
let mem_table = create_memtable()?;
2 changes: 1 addition & 1 deletion datafusion-examples/examples/optimizer_rule.rs
@@ -146,7 +146,7 @@ impl MyOptimizerRule {
// Closure called for each sub tree
match expr {
Expr::BinaryExpr(binary_expr) if is_binary_eq(&binary_expr) => {
-// destruture the expression
+// destructure the expression
let BinaryExpr { left, op: _, right } = binary_expr;
// rewrite to `my_eq(left, right)`
let udf = ScalarUDF::new_from_impl(MyEq::new());
2 changes: 1 addition & 1 deletion datafusion-examples/examples/plan_to_sql.rs
@@ -65,7 +65,7 @@ fn simple_expr_to_sql_demo() -> Result<()> {
Ok(())
}

-/// DataFusioon can remove parentheses when converting an expression to SQL.
+/// DataFusion can remove parentheses when converting an expression to SQL.
/// Note that output is intended for humans, not for other SQL engines,
/// as difference in precedence rules can cause expressions to be parsed differently.
fn simple_expr_to_pretty_sql_demo() -> Result<()> {
6 changes: 3 additions & 3 deletions datafusion-examples/examples/simple_udtf.rs
@@ -140,7 +140,7 @@ impl TableFunctionImpl for LocalCsvTableFunc {
let limit = exprs
.get(1)
.map(|expr| {
-// try to simpify the expression, so 1+2 becomes 3, for example
+// try to simplify the expression, so 1+2 becomes 3, for example
let execution_props = ExecutionProps::new();
let info = SimplifyContext::new(&execution_props);
let expr = ExprSimplifier::new(info).simplify(expr.clone())?;
@@ -173,8 +173,8 @@ fn read_csv_batches(csv_path: impl AsRef<Path>) -> Result<(SchemaRef, Vec<Record
.with_header(true)
.build(file)?;
let mut batches = vec![];
-for bacth in reader {
-batches.push(bacth?);
+for batch in reader {
+batches.push(batch?);
}
let schema = Arc::new(schema);
Ok((schema, batches))
2 changes: 1 addition & 1 deletion datafusion/common/src/column.rs
@@ -230,7 +230,7 @@ impl Column {
.collect::<Vec<_>>();
for using_col in using_columns {
let all_matched = columns.iter().all(|c| using_col.contains(c));
-// All matched fields belong to the same using column set, in orther words
+// All matched fields belong to the same using column set, in other words
// the same join clause. We simply pick the qualifier from the first match.
if all_matched {
return Ok(columns[0].clone());
4 changes: 2 additions & 2 deletions datafusion/common/src/config.rs
@@ -904,12 +904,12 @@ pub trait ConfigExtension: ExtensionOptions {
pub trait ExtensionOptions: Send + Sync + fmt::Debug + 'static {
/// Return `self` as [`Any`]
///
-/// This is needed until trait upcasting is stabilised
+/// This is needed until trait upcasting is stabilized
fn as_any(&self) -> &dyn Any;

/// Return `self` as [`Any`]
///
-/// This is needed until trait upcasting is stabilised
+/// This is needed until trait upcasting is stabilized
fn as_any_mut(&mut self) -> &mut dyn Any;

/// Return a deep clone of this [`ExtensionOptions`]
2 changes: 1 addition & 1 deletion datafusion/common/src/cse.rs
@@ -60,7 +60,7 @@ pub trait Normalizeable {
}

/// The `NormalizeEq` trait extends `Eq` and `Normalizeable` to provide a method for comparing
-/// normlized nodes in optimizations like Common Subexpression Elimination (CSE).
+/// normalized nodes in optimizations like Common Subexpression Elimination (CSE).
///
/// The `normalize_eq` method ensures that two nodes that are semantically equivalent (after normalization)
/// are considered equal in CSE optimization, even if their original forms differ.
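The idea behind `normalize_eq` in the doc comment above can be illustrated with a small self-contained sketch. The types below are hypothetical toy types, not DataFusion's actual `Normalizeable`/`NormalizeEq` traits: a commutative expression is normalized by ordering its operands, so `a + b` and `b + a` compare equal even though their original forms differ.

```rust
/// Toy commutative expression: `left + right`.
#[derive(Clone, Debug, PartialEq, Eq)]
struct Add {
    left: String,
    right: String,
}

impl Add {
    /// Canonical form: order the operands, since `a + b` is semantically `b + a`.
    fn normalize(&self) -> Add {
        if self.left <= self.right {
            self.clone()
        } else {
            Add {
                left: self.right.clone(),
                right: self.left.clone(),
            }
        }
    }

    /// Compare the normalized forms, analogous in spirit to `normalize_eq`.
    fn normalize_eq(&self, other: &Add) -> bool {
        self.normalize() == other.normalize()
    }
}

fn main() {
    let x = Add { left: "a".into(), right: "b".into() };
    let y = Add { left: "b".into(), right: "a".into() };
    assert_ne!(x, y); // different original forms
    assert!(x.normalize_eq(&y)); // but equal after normalization, as CSE requires
}
```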
2 changes: 1 addition & 1 deletion datafusion/common/src/dfschema.rs
@@ -955,7 +955,7 @@ pub trait ExprSchema: std::fmt::Debug {
/// Returns the column's optional metadata.
fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>>;

-/// Return the coulmn's datatype and nullability
+/// Return the column's datatype and nullability
fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)>;
}

2 changes: 1 addition & 1 deletion datafusion/common/src/error.rs
@@ -115,7 +115,7 @@ pub enum DataFusionError {
Execution(String),
/// [`JoinError`] during execution of the query.
///
-/// This error can unoccur for unjoined tasks, such as execution shutdown.
+/// This error can't occur for unjoined tasks, such as execution shutdown.
ExecutionJoin(JoinError),
/// Error when resources (such as memory of scratch disk space) are exhausted.
///
10 changes: 5 additions & 5 deletions datafusion/common/src/scalar/mod.rs
@@ -2216,7 +2216,7 @@ impl ScalarValue {
///
/// Errors if `self` is
/// - a decimal that fails be converted to a decimal array of size
-/// - a `Fixedsizelist` that fails to be concatenated into an array of size
+/// - a `FixedsizeList` that fails to be concatenated into an array of size
/// - a `List` that fails to be concatenated into an array of size
/// - a `Dictionary` that fails be converted to a dictionary array of size
pub fn to_array_of_size(&self, size: usize) -> Result<ArrayRef> {
@@ -2925,7 +2925,7 @@ impl ScalarValue {
/// preferred over this function if at all possible as they can be
/// vectorized and are generally much faster.
///
-/// This function has a few narrow usescases such as hash table key
+/// This function has a few narrow use cases such as hash table key
/// comparisons where comparing a single row at a time is necessary.
///
/// # Errors
@@ -4465,7 +4465,7 @@ mod tests {
Ok(())
}

-// Verifies that ScalarValue has the same behavior with compute kernal when it overflows.
+// Verifies that ScalarValue has the same behavior with compute kernel when it overflows.
fn check_scalar_add_overflow<T>(left: ScalarValue, right: ScalarValue)
where
T: ArrowNumericType,
@@ -6150,9 +6150,9 @@ mod tests {
&DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()))
);

-let newscalar = ScalarValue::try_from_array(&array, 0).unwrap();
+let new_scalar = ScalarValue::try_from_array(&array, 0).unwrap();
assert_eq!(
-newscalar.data_type(),
+new_scalar.data_type(),
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()))
);
}
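For context, the `to_array_of_size` method whose doc comment is touched earlier in this file's diff expands a single scalar into an array of repeated values. A minimal usage sketch, assuming the `datafusion-common` and `arrow` crates are available as dependencies:

```rust
use arrow::array::Array;
use datafusion_common::{Result, ScalarValue};

fn main() -> Result<()> {
    // Expand one scalar into an array containing three copies of it.
    let scalar = ScalarValue::Int64(Some(42));
    let array = scalar.to_array_of_size(3)?;
    assert_eq!(array.len(), 3);
    Ok(())
}
```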
4 changes: 2 additions & 2 deletions datafusion/common/src/tree_node.rs
@@ -996,11 +996,11 @@ impl<
/// construct a temporary container to be able to call `apply_ref_elements` on a
/// collection of tree node references. But in that case the container's temporary
/// lifetime is different to the lifetime of tree nodes that we put into it.
-/// Please find an example usecase in `Expr::apply_children` with the `Expr::Case` case.
+/// Please find an example use case in `Expr::apply_children` with the `Expr::Case` case.
///
/// Most of the cases we don't need to create a temporary container with
/// `TreeNodeRefContainer`, but we can just call `TreeNodeContainer::apply_elements`.
-/// Please find an example usecase in `Expr::apply_children` with the `Expr::GroupingSet`
+/// Please find an example use case in `Expr::apply_children` with the `Expr::GroupingSet`
/// case.
pub trait TreeNodeRefContainer<'a, T: 'a>: Sized {
/// Applies `f` to all elements of the container.
4 changes: 2 additions & 2 deletions datafusion/common/src/utils/memory.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

-//! This module provides a function to estimate the memory size of a HashTable prior to alloaction
+//! This module provides a function to estimate the memory size of a HashTable prior to allocation

use crate::{DataFusionError, Result};
use std::mem::size_of;
@@ -79,7 +79,7 @@ pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result
// For the majority of cases hashbrown overestimates the bucket quantity
// to keep ~1/8 of them empty. We take this factor into account by
// multiplying the number of elements with a fixed ratio of 8/7 (~1.14).
-// This formula leads to overallocation for small tables (< 8 elements)
+// This formula leads to over-allocation for small tables (< 8 elements)
// but should be fine overall.
num_elements
.checked_mul(8)
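As a rough, self-contained sketch of the estimate described in the comment above (the function name and exact rounding here are assumptions; DataFusion's `estimate_memory_size` may differ in detail): pad the element count by 8/7 to model hashbrown's empty-bucket ratio, round up to a power of two, then multiply by the per-entry size and add the fixed overhead.

```rust
use std::mem::size_of;

/// Estimate the memory, in bytes, used by a hash table holding
/// `num_elements` entries of type `T`, plus `fixed_size` bytes of
/// fixed overhead for the table struct itself.
fn estimate_hash_table_size<T>(num_elements: usize, fixed_size: usize) -> Option<usize> {
    // ~1/8 of buckets are kept empty, so scale the element count by 8/7
    // and round up to the next power of two (hashbrown's bucket count).
    let estimated_buckets = num_elements
        .checked_mul(8)?
        .checked_div(7)?
        .checked_next_power_of_two()?;
    estimated_buckets
        .checked_mul(size_of::<T>())?
        .checked_add(fixed_size)
}

fn main() {
    // e.g. a table expected to hold 1000 u64 keys with 64 bytes of fixed overhead
    println!("{:?}", estimate_hash_table_size::<u64>(1000, 64));
}
```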
6 changes: 3 additions & 3 deletions datafusion/common/src/utils/proxy.rs
@@ -92,12 +92,12 @@ impl<T> VecAllocExt for Vec<T> {
type T = T;

fn push_accounted(&mut self, x: Self::T, accounting: &mut usize) {
-let prev_capacty = self.capacity();
+let prev_capacity = self.capacity();
self.push(x);
let new_capacity = self.capacity();
-if new_capacity > prev_capacty {
+if new_capacity > prev_capacity {
// capacity changed, so we allocated more
-let bump_size = (new_capacity - prev_capacty) * size_of::<T>();
+let bump_size = (new_capacity - prev_capacity) * size_of::<T>();
// Note multiplication should never overflow because `push` would
// have panic'd first, but the checked_add could potentially
// overflow since accounting could be tracking additional values, and
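A self-contained sketch of the same capacity-accounting idea as a free function (the real code above is a trait method, `VecAllocExt::push_accounted`), plus a small usage loop showing the external byte counter growing only when the Vec reallocates:

```rust
use std::mem::size_of;

/// Push a value and, if the push grew the Vec's capacity, add the size of
/// the newly allocated region to an external byte counter.
fn push_accounted<T>(vec: &mut Vec<T>, x: T, accounting: &mut usize) {
    let prev_capacity = vec.capacity();
    vec.push(x);
    let new_capacity = vec.capacity();
    if new_capacity > prev_capacity {
        // capacity changed, so more memory was allocated
        let bump_size = (new_capacity - prev_capacity) * size_of::<T>();
        *accounting = accounting.checked_add(bump_size).expect("accounting overflow");
    }
}

fn main() {
    let mut v: Vec<u64> = Vec::new();
    let mut allocated_bytes = 0usize;
    for i in 0..100u64 {
        push_accounted(&mut v, i, &mut allocated_bytes);
    }
    // the sum of capacity bumps equals the bytes backing the current capacity
    assert_eq!(allocated_bytes, v.capacity() * size_of::<u64>());
    println!("tracked {allocated_bytes} bytes for capacity {}", v.capacity());
}
```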
2 changes: 1 addition & 1 deletion datafusion/core/benches/physical_plan.rs
@@ -38,7 +38,7 @@ use datafusion::physical_plan::{
use datafusion::prelude::SessionContext;
use datafusion_physical_expr_common::sort_expr::LexOrdering;

-// Initialise the operator using the provided record batches and the sort key
+// Initialize the operator using the provided record batches and the sort key
// as inputs. All record batches must have the same schema.
fn sort_preserving_merge_operator(
session_ctx: Arc<SessionContext>,
4 changes: 2 additions & 2 deletions datafusion/core/src/dataframe/mod.rs
@@ -3279,7 +3279,7 @@ mod tests {
&df_results
);

-// check that col with the same name ovwewritten
+// check that col with the same name overwritten
let df_results_overwrite = df
.clone()
.with_column("c1", col("c2") + col("c3"))?
@@ -3302,7 +3302,7 @@
&df_results_overwrite
);

-// check that col with the same name ovwewritten using same name as reference
+// check that col with the same name overwritten using same name as reference
let df_results_overwrite_self = df
.clone()
.with_column("c2", col("c2") + lit(1))?
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/default_table_source.rs
@@ -67,7 +67,7 @@ impl TableSource for DefaultTableSource {
}

/// Tests whether the table provider can make use of any or all filter expressions
-/// to optimise data retrieval.
+/// to optimize data retrieval.
fn supports_filters_pushdown(
&self,
filter: &[&Expr],
@@ -781,7 +781,7 @@ mod test {
assert_partitioned_files(expected, actual);
}

-/// Asserts that the two groups of `ParititonedFile` are the same
+/// Asserts that the two groups of [`PartitionedFile`] are the same
/// (PartitionedFile doesn't implement PartialEq)
fn assert_partitioned_files(
expected: Option<Vec<Vec<PartitionedFile>>>,
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/physical_plan/json.rs
@@ -870,7 +870,7 @@ mod tests {
)]
#[cfg(feature = "compression")]
#[tokio::test]
-async fn test_json_with_repartitioing(
+async fn test_json_with_repartitioning(
file_compression_type: FileCompressionType,
) -> Result<()> {
let config = SessionConfig::new()
4 changes: 2 additions & 2 deletions datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -333,7 +333,7 @@ impl ParquetExecBuilder {

/// Set the filter predicate when reading.
///
-/// See the "Predicate Pushdown" section of the [`ParquetExec`] documenation
+/// See the "Predicate Pushdown" section of the [`ParquetExec`] documentation
/// for more details.
pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
self.predicate = Some(predicate);
@@ -611,7 +611,7 @@ impl ParquetExec {
}

/// If enabled, the reader will read the page index
-/// This is used to optimise filter pushdown
+/// This is used to optimize filter pushdown
/// via `RowSelector` and `RowFilter` by
/// eliminating unnecessary IO and decoding
pub fn with_enable_page_index(mut self, enable_page_index: bool) -> Self {
@@ -422,7 +422,7 @@ fn would_column_prevent_pushdown(
checker.prevents_pushdown()
}

-/// Recurses through expr as a trea, finds all `column`s, and checks if any of them would prevent
+/// Recurses through expr as a tree, finds all `column`s, and checks if any of them would prevent
/// this expression from being predicate pushed down. If any of them would, this returns false.
/// Otherwise, true.
pub fn can_expr_be_pushed_down_with_schemas(
@@ -692,7 +692,7 @@ mod test {

let mut parquet_reader = parquet_reader_builder.build().expect("building reader");

-// Parquet file is small, we only need 1 recordbatch
+// Parquet file is small, we only need 1 record batch
let first_rb = parquet_reader
.next()
.expect("expected record batch")
Expand Down