Add support for ingesting CSV files #2

Merged: 21 commits, Oct 31, 2024
Changes from 16 commits
18 changes: 15 additions & 3 deletions pyproject.toml
@@ -26,16 +26,22 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"duckdb~=1.0.0",
"duckdb ~= 1.1.0",
"duckdb_engine",
"loguru",
"rich",
"pandas >= 2.2, < 3",
"polars ~= 1.11.0",
"pyarrow",
"pydantic >= 2.7, < 3",
"sqlalchemy",
"pytz",
"rich",
"sqlalchemy == 2.0.35", # 2.0.36 introduced code that duckdb_engine doesn't handle.
"tzdata",
]
[project.optional-dependencies]
dev = [
"mypy",
"pandas-stubs",
"pre-commit",
"pytest",
"pytest-cov",
@@ -47,6 +53,12 @@ Documentation = "https://github.com/NREL/chronify#readme"
Issues = "https://github.com/NREL/chronify/issues"
Source = "https://github.com/NREL/chronify"

[tool.mypy]
files = [
"src",
"tests",
]

[tool.pytest.ini_options]
pythonpath = "src"
minversion = "6.0"
90 changes: 90 additions & 0 deletions scripts/perf_tests.py
@@ -0,0 +1,90 @@
from datetime import datetime, timedelta

import duckdb
import pandas as pd
import polars as pl
from IPython import get_ipython

from sqlalchemy import Double, text
from chronify.models import ColumnDType, CsvTableSchema, TableSchema
from chronify.store import Store
from chronify.time import TimeIntervalType, TimeZone
from chronify.time_configs import DatetimeRange


GENERATOR_TIME_SERIES_FILE = "tests/data/gen.csv"


def read_duckdb(conn, name: str):
return conn.sql(f"SELECT * FROM {name}").df()


def read_pandas(store: Store, name: str):
with store.engine.begin() as conn:
query = f"select * from {name}"
return pd.read_sql(query, conn)


def read_polars(store: Store, name: str):
with store.engine.begin() as conn:
query = f"select * from {name}"
return pl.read_database(query, connection=conn).to_pandas()


def read_sqlalchemy(store: Store, name: str):
with store.engine.begin() as conn:
query = f"select * from {name}"
res = conn.execute(text(query)).fetchall()
return pd.DataFrame.from_records(res, columns=["timestamp", "generator", "value"])


def setup():
time_config = DatetimeRange(
start=datetime(year=2020, month=1, day=1),
resolution=timedelta(hours=1),
length=8784,
interval_type=TimeIntervalType.PERIOD_BEGINNING,
time_columns=["timestamp"],
time_zone=TimeZone.UTC,
)

src_schema = CsvTableSchema(
time_config=time_config,
column_dtypes=[
ColumnDType(name="gen1", dtype=Double),
ColumnDType(name="gen2", dtype=Double),
ColumnDType(name="gen3", dtype=Double),
],
value_columns=["gen1", "gen2", "gen3"],
pivoted_dimension_name="generator",
time_array_id_columns=[],
)
dst_schema = TableSchema(
name="generators",
time_config=time_config,
time_array_id_columns=["generator"],
value_column="value",
)
return src_schema, dst_schema


def run_test(engine_name: str):
store = Store(engine_name=engine_name)
src_schema, dst_schema = setup()
store.ingest_from_csv(GENERATOR_TIME_SERIES_FILE, src_schema, dst_schema)
ipython = get_ipython()
df = read_polars(store, dst_schema.name) # noqa: F841
conn = duckdb.connect(":memory:")
conn.sql("CREATE OR REPLACE TABLE perf_test AS SELECT * from df")
print(f"Run {engine_name} database with read_duckdb.")
ipython.run_line_magic("timeit", "read_duckdb(conn, 'perf_test')")
print(f"Run {engine_name} database with read_pandas.")
ipython.run_line_magic("timeit", "read_pandas(store, dst_schema.name)")
print(f"Run {engine_name} database with read_polars.")
ipython.run_line_magic("timeit", "read_polars(store, dst_schema.name)")
print(f"Run {engine_name} database with read_sqlalchemy.")
ipython.run_line_magic("timeit", "read_sqlalchemy(store, dst_schema.name)")


run_test("duckdb")
run_test("sqlite")
4 changes: 4 additions & 0 deletions src/chronify/__init__.py
@@ -0,0 +1,4 @@
import importlib.metadata as metadata


__version__ = metadata.metadata("chronify")["Version"]
3 changes: 3 additions & 0 deletions src/chronify/common.py
@@ -0,0 +1,3 @@
"""Common definitions for the package"""

VALUE_COLUMN = "value"
39 changes: 39 additions & 0 deletions src/chronify/csv_io.py
@@ -0,0 +1,39 @@
from pathlib import Path

import duckdb
from duckdb import DuckDBPyRelation

from chronify.models import CsvTableSchema, get_duckdb_type_from_sqlalchemy
from chronify.time import get_zone_info
from chronify.time_configs import DatetimeRange


def read_csv(path: Path | str, schema: CsvTableSchema, **kwargs) -> DuckDBPyRelation:
"""Read a CSV file into a DuckDB relation."""
if schema.column_dtypes:
dtypes = {x.name: get_duckdb_type_from_sqlalchemy(x.dtype) for x in schema.column_dtypes}
rel = duckdb.read_csv(str(path), dtype=dtypes, **kwargs)
else:
rel = duckdb.read_csv(str(path), **kwargs)

exprs = []
for column, dtype in zip(rel.columns, rel.types):
if dtype is duckdb.typing.TIMESTAMP:
if isinstance(schema.time_config, DatetimeRange):
if schema.time_config.time_zone is None:
msg = "time_zone cannot be None if the time zone is not part of the timestamp string"
raise ValueError(msg)
zone_info = get_zone_info(schema.time_config.time_zone)
else:
msg = f"need to add support for {type(schema.time_config)}"
raise NotImplementedError(msg)
expr = f"timezone('UTC', timezone({zone_info.key}, {column})) AS {column}"
elif dtype is duckdb.typing.TIMESTAMP_TZ:
msg = "no handling for timestamp with time zone yet"
raise NotImplementedError(msg)
# expr = f"timezone('UTC', {column}) AS {column}"
else:
expr = column
exprs.append(expr)
expr_str = ",".join(exprs)
return duckdb.sql(f"SELECT {expr_str} FROM rel")
Empty file added src/chronify/duckdb/__init__.py
Empty file.
59 changes: 59 additions & 0 deletions src/chronify/duckdb/functions.py
@@ -0,0 +1,59 @@
from collections.abc import Iterable
from datetime import datetime, timedelta
from pathlib import Path

import duckdb
from duckdb import DuckDBPyRelation


def add_datetime_column(
rel: DuckDBPyRelation,
start: datetime,
resolution: timedelta,
length: int,
time_array_id_columns: Iterable[str],
time_column: str,
timestamps: list[datetime],
) -> DuckDBPyRelation:
"""Add a datetime column to the relation."""
# TODO
raise NotImplementedError
Review comment (collaborator):
Seems like we'll need two kinds of handling: one without a tz-offset and one with.

# values = []
# columns = ",".join(rel.columns)
# return duckdb.sql(
# f"""
# SELECT
# AS {time_column}
# ,{columns}
# FROM rel
# """
# )
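
Following the reviewer's note, here is a minimal, hypothetical sketch of the tz-naive half only (the tz-offset case would need separate handling). The helper name and the generate_series approach are assumptions, not code from this PR; it reuses the imports already at the top of this module.

# Hypothetical helper: build a tz-naive timestamp relation that
# add_datetime_column could later join onto `rel`.
def make_timestamp_relation(
    start: datetime, resolution: timedelta, length: int, time_column: str
) -> DuckDBPyRelation:
    end = start + (length - 1) * resolution
    step_seconds = int(resolution.total_seconds())
    query = f"""
        SELECT ts AS {time_column}
        FROM generate_series(
            TIMESTAMP '{start}',
            TIMESTAMP '{end}',
            INTERVAL {step_seconds} SECOND
        ) AS t(ts)
    """
    return duckdb.sql(query)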


def make_write_parquet_query(table_or_view: str, file_path: Path | str) -> str:
"""Make an SQL string that can be used to write a Parquet file from a table or view."""
# TODO: Hive partitioning?
return f"""
COPY
(SELECT * FROM {table_or_view})
TO '{file_path}'
(FORMAT 'parquet');
"""


def unpivot(
rel: DuckDBPyRelation,
pivoted_columns: Iterable[str],
name_column: str,
value_column: str,
) -> DuckDBPyRelation:
pivoted_str = ",".join(pivoted_columns)

query = f"""
SELECT * FROM rel
UNPIVOT INCLUDE NULLS (
{value_column}
FOR {name_column} in ({pivoted_str})
)
"""
return duckdb.sql(query)
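
For reference, a hypothetical example of unpivot, reusing the pivoted generator columns from the perf-test schema above:

import duckdb

from chronify.duckdb.functions import unpivot

# Three pivoted value columns become (generator, value) pairs.
wide = duckdb.sql("SELECT 1.0 AS gen1, 2.0 AS gen2, 3.0 AS gen3")
long_rel = unpivot(
    wide, pivoted_columns=["gen1", "gen2", "gen3"], name_column="generator", value_column="value"
)
# long_rel has columns "generator" and "value", with one row per generator.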
8 changes: 8 additions & 0 deletions src/chronify/exceptions.py
@@ -2,5 +2,13 @@ class ChronifyExceptionBase(Exception):
"""Base class for exceptions in this package"""


class ConflictingInputsError(ChronifyExceptionBase):
"""Raised when user inputs conflict with each other."""


class InvalidTable(ChronifyExceptionBase):
"""Raised when a table does not match its schema."""


class InvalidParameter(ChronifyExceptionBase):
"""Raised when an invalid parameter is passed."""
58 changes: 58 additions & 0 deletions src/chronify/loggers.py
@@ -0,0 +1,58 @@
"""Contains logging functionality."""

import sys
from pathlib import Path
from typing import Iterable, Optional, Union

from loguru import logger


# Logger printing formats
DEFAULT_FORMAT = "<level>{level}</level>: {message}"
DEBUG_FORMAT = (
"<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
"<level>{level: <7}</level> | "
"<cyan>{name}:{line}</cyan> | "
"{message}"
)


def setup_logging(
filename: Optional[Union[str, Path]] = None,
console_level: str = "INFO",
file_level: str = "DEBUG",
mode: str = "w",
rotation: Optional[str] = "10 MB",
packages: Optional[Iterable] = None,
) -> None:
"""Configures logging to file and console.

Parameters
----------
filename
Log filename, defaults to None for no file logging.
console_level
Console logging level
file_level
File logging level
mode
Mode in which to open the file
rotation
Size at which to rotate the log file. Set to None for no rotation.
packages
Additional packages for which to enable logging
"""
logger.remove()
logger.enable("chronify")
for pkg in packages or []:
logger.enable(pkg)

logger.add(sys.stderr, level=console_level, format=DEFAULT_FORMAT)
if filename:
logger.add(
filename,
level=file_level,
mode=mode,
rotation=rotation,
format=DEBUG_FORMAT,
)
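
A hypothetical usage sketch (file name and levels chosen purely for illustration):

from loguru import logger

from chronify.loggers import setup_logging

# INFO to the console, DEBUG to a rotating log file.
setup_logging(filename="chronify.log", console_level="INFO", file_level="DEBUG")
logger.info("Logging configured")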