From c98dcc1b2340fbf3f9b5e9eed10b4b3deaba8f79 Mon Sep 17 00:00:00 2001 From: devopskx Date: Fri, 9 Feb 2024 09:16:19 +0000 Subject: [PATCH] PyKX gitlab sync --- MANIFEST.in | 1 + README.md | 14 +- conda-recipe/conda_build_config.yaml | 5 + conda-recipe/meta.yaml | 2 +- docs/api/db.md | 10 + docs/api/pykx-q-data/type_conversions.md | 6 +- docs/api/remote.md | 11 + docs/api/system.md | 3 + docs/beta-features/db-management.md | 247 ++ docs/beta-features/index.md | 52 + docs/beta-features/remote-functions.md | 132 + docs/beta-features/threading.md | 64 + docs/contributors.md | 23 + docs/examples/db-management.ipynb | 2728 ++++++++++++++ docs/examples/subscriber/archive.zip | Bin 2350 -> 2374 bytes docs/examples/subscriber/readme.md | 4 +- docs/examples/subscriber/subscriber.py | 4 +- docs/examples/subscriber/subscriber_async.py | 4 +- docs/examples/threaded_execution/archive.zip | Bin 4798 -> 3076 bytes .../threaded_execution/asyncio_threading.py | 57 + docs/examples/threaded_execution/threading.md | 96 + docs/examples/threaded_execution/threads.py | 60 + docs/extras/known_issues.md | 11 + docs/faq.md | 65 +- .../PyKX Introduction Notebook.ipynb | 1096 ++++++ docs/getting-started/installing.md | 42 +- docs/getting-started/q_magic_command.ipynb | 38 +- docs/getting-started/quickstart.md | 69 +- docs/pykx-under-q/api.md | 81 +- docs/pykx-under-q/intro.md | 20 +- docs/pykx-under-q/known_issues.md | 131 + docs/pykx-under-q/upgrade.md | 92 +- docs/release-notes/changelog.md | 447 ++- docs/release-notes/underq-changelog.md | 168 + docs/roadmap.md | 4 +- docs/troubleshooting.md | 17 +- docs/user-guide/advanced/Pandas_API.ipynb | 3265 +++++++++-------- docs/user-guide/advanced/attributes.md | 15 +- docs/user-guide/advanced/database.md | 7 + docs/user-guide/advanced/ipc.md | 6 +- docs/user-guide/advanced/pandas_breakdown.md | 86 + docs/user-guide/configuration.md | 59 +- docs/user-guide/fundamentals/creating.md | 22 +- docs/user-guide/fundamentals/indexing.md | 35 +- 
.../fundamentals/nulls_and_infinities.md | 127 +- docs/user-guide/fundamentals/querying.md | 43 +- examples/subscriber/readme.md | 4 +- examples/subscriber/subscriber.py | 4 +- mkdocs.yml | 38 +- pyproject.toml | 21 +- setup.py | 15 +- src/pykx/.gitignore | 1 + src/pykx/__init__.py | 39 +- src/pykx/_tcore.c | 604 +++ src/pykx/_wrappers.pxd | 1 - src/pykx/_wrappers.pyx | 20 +- src/pykx/cast.py | 10 + src/pykx/config.py | 54 +- src/pykx/console.py | 12 +- src/pykx/core.pxd | 1 + src/pykx/core.pyx | 368 +- src/pykx/db.py | 1120 ++++++ src/pykx/embedded_q.py | 11 +- src/pykx/extensions/dashboards.q | 129 + src/pykx/include/py.h | 2 + src/pykx/ipc.py | 48 +- src/pykx/lib/dbmaint.q | 151 + src/pykx/lib/l64/libq.so | Bin 821360 -> 821392 bytes src/pykx/lib/l64arm/libq.so | Bin 671104 -> 671104 bytes src/pykx/lib/w64/q.dll | Bin 679936 -> 680448 bytes src/pykx/lib/w64/q.lib | Bin 9848 -> 9988 bytes src/pykx/nbextension.py | 33 +- src/pykx/pandas_api/__init__.py | 5 +- src/pykx/pandas_api/pandas_conversions.py | 16 +- src/pykx/pandas_api/pandas_indexing.py | 65 +- src/pykx/pandas_api/pandas_meta.py | 20 +- src/pykx/pandas_api/pandas_reset_index.py | 72 + src/pykx/pykx.c | 54 +- src/pykx/pykx.q | 210 +- src/pykx/pykx_init.q_ | Bin 3208 -> 3223 bytes src/pykx/pykxq.c | 29 +- src/pykx/query.py | 76 +- src/pykx/register.py | 1 - src/pykx/reimporter.py | 1 + src/pykx/remote.py | 239 ++ src/pykx/system.py | 5 +- src/pykx/toq.pyx | 42 +- src/pykx/util.py | 5 +- src/pykx/wrappers.py | 115 +- tests/conftest.py | 1 + tests/qcumber_tests/callables.quke | 21 + tests/qcumber_tests/conversions.quke | 41 + tests/qcumber_tests/extensions.quke | 43 + tests/qcumber_tests/pykx.quke | 43 +- tests/qcumber_tests/reimport.quke | 19 + tests/qscripts/test_stdout_stderr.q | 27 + tests/test_cloud_edition.py | 4 + tests/test_db.py | 317 ++ tests/test_license.py | 49 + tests/test_pandas_agg.py | 14 + tests/test_pandas_api.py | 83 +- tests/test_pandas_apply.py | 38 + tests/test_pykx.py | 111 +- 
tests/test_q.py | 22 + tests/test_q_foreign.py | 25 + tests/test_query.py | 112 +- tests/test_register.py | 4 +- tests/test_reimport.py | 15 + tests/test_remote.py | 96 + tests/test_reset_index.py | 67 + tests/test_system.py | 15 +- tests/test_toq.py | 106 +- tests/test_wrappers.py | 277 +- 113 files changed, 12555 insertions(+), 2105 deletions(-) create mode 100644 conda-recipe/conda_build_config.yaml create mode 100644 docs/api/db.md create mode 100644 docs/api/remote.md create mode 100644 docs/api/system.md create mode 100644 docs/beta-features/db-management.md create mode 100644 docs/beta-features/index.md create mode 100644 docs/beta-features/remote-functions.md create mode 100644 docs/beta-features/threading.md create mode 100644 docs/contributors.md create mode 100644 docs/examples/db-management.ipynb create mode 100644 docs/examples/threaded_execution/asyncio_threading.py create mode 100644 docs/examples/threaded_execution/threading.md create mode 100644 docs/examples/threaded_execution/threads.py create mode 100644 docs/getting-started/PyKX Introduction Notebook.ipynb create mode 100644 docs/pykx-under-q/known_issues.md create mode 100644 docs/user-guide/advanced/database.md create mode 100644 docs/user-guide/advanced/pandas_breakdown.md create mode 100644 src/pykx/_tcore.c create mode 100644 src/pykx/db.py create mode 100644 src/pykx/extensions/dashboards.q create mode 100644 src/pykx/lib/dbmaint.q create mode 100644 src/pykx/pandas_api/pandas_reset_index.py create mode 100644 src/pykx/remote.py create mode 100644 tests/qcumber_tests/extensions.quke create mode 100644 tests/qcumber_tests/reimport.quke create mode 100644 tests/qscripts/test_stdout_stderr.q create mode 100644 tests/test_db.py create mode 100644 tests/test_reimport.py create mode 100644 tests/test_remote.py create mode 100644 tests/test_reset_index.py diff --git a/MANIFEST.in b/MANIFEST.in index 47efeaa..9809842 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,6 +5,7 @@ include setup.py 
graft src/pykx/include graft src/pykx/lib +graft src/pykx/extensions recursive-include src/pykx *.py *.pxd *.pyx *.c *.so *.k recursive-include tests *.py diff --git a/README.md b/README.md index 9343fd0..29610de 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PyKX +# PyKX ## Introduction @@ -92,18 +92,26 @@ KX only officially supports versions of PyKX built by KX, i.e. versions of PyKX PyKX depends on the following third-party Python packages: -- `pandas~=1.2` -- `numpy~=1.22` +- `pandas>=1.2, < 2.2.0` +- `numpy~=1.22; python_version<'3.11'` +- `numpy~=1.23.2; python_version=='3.11'` +- `pytz>=2022.1` +- `toml~=0.10.2` They are installed automatically by `pip` when PyKX is installed. PyKX also has an optional Python dependency of `pyarrow>=3.0.0`, which can be included by installing the `pyarrow` extra, e.g. `pip install pykx[pyarrow]` +When using PyKX with KX Dashboards users will be required to install `ast2json~=0.3` this can be installed using the `dashboards` extra, e.g. `pip install pykx[dashboards]` + +When using PyKX Beta features users will be required to install `dill>=0.2.0` this can be installed using the `beta` extra, e.g. `pip install pykx[beta]` + **Warning:** Trying to use the `pa` conversion methods of `pykx.K` objects or the `pykx.toq.from_arrow` method when PyArrow is not installed (or could not be imported without error) will raise a `pykx.PyArrowUnavailable` exception. `pyarrow` is supported Python 3.8-3.10 but remains in Beta for Python 3.11. #### Optional Non-Python Dependencies - `libssl` for TLS on [IPC connections](docs/api/ipc.md). +- `libpthread` on Linux/MacOS when using the `PYKX_THREADING` environment variable. 
#### Windows Dependencies diff --git a/conda-recipe/conda_build_config.yaml b/conda-recipe/conda_build_config.yaml new file mode 100644 index 0000000..87fed87 --- /dev/null +++ b/conda-recipe/conda_build_config.yaml @@ -0,0 +1,5 @@ +python: + - 3.8 + - 3.9 + - 3.10 + - 3.11 diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index bcdb2bc..a781dfc 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -27,7 +27,7 @@ requirements: run: - python - numpy>=1.22 - - pandas>=1.2 + - pandas>=1.2, <2.2.0 - pytz>=2022.1 - toml>=0.10.2 diff --git a/docs/api/db.md b/docs/api/db.md new file mode 100644 index 0000000..074a383 --- /dev/null +++ b/docs/api/db.md @@ -0,0 +1,10 @@ +# Database interaction and management functionality + +::: pykx.db + rendering: + show_root_heading: false + options: + show_root_heading: false + members_order: source + members: + - DB diff --git a/docs/api/pykx-q-data/type_conversions.md b/docs/api/pykx-q-data/type_conversions.md index 27f289e..6feb022 100644 --- a/docs/api/pykx-q-data/type_conversions.md +++ b/docs/api/pykx-q-data/type_conversions.md @@ -322,7 +322,7 @@ True Calling `.py()` on a `pykx.LongVector` will return a list of python int objects. ```Python - >>>> pykx.LongVector([1, 2.5]).py() + >>> pykx.LongVector([1, 2.5]).py() [1, 2] ``` @@ -342,7 +342,7 @@ True Calling `.py()` on a `pykx.RealAtom` will return a python float object. ```Python - >>>> pykx.RealAtom(2.5).py() + >>> pykx.RealAtom(2.5).py() 2.5 ``` @@ -383,7 +383,7 @@ True Calling `.py()` on a `pykx.FloatAtom` will return a python float object. 
```Python - >>>> pykx.FloatAtom(2.5).py() + >>> pykx.FloatAtom(2.5).py() 2.5 ``` diff --git a/docs/api/remote.md b/docs/api/remote.md new file mode 100644 index 0000000..2101520 --- /dev/null +++ b/docs/api/remote.md @@ -0,0 +1,11 @@ +# Remote Python Execution Functionality + +::: pykx.remote + rendering: + show_root_heading: false + options: + show_root_heading: false + members_order: source + members: + - session + - function diff --git a/docs/api/system.md b/docs/api/system.md new file mode 100644 index 0000000..8a9ed5f --- /dev/null +++ b/docs/api/system.md @@ -0,0 +1,3 @@ +# System Command Wrappers + +::: pykx.system diff --git a/docs/beta-features/db-management.md b/docs/beta-features/db-management.md new file mode 100644 index 0000000..5d0fdcf --- /dev/null +++ b/docs/beta-features/db-management.md @@ -0,0 +1,247 @@ +# Database Management + +!!! Warning + + This module is a Beta Feature and is subject to change. To enable this functionality for testing please follow the configuration instructions [here](../user-guide/configuration.md) setting `PYKX_BETA_FEATURES='true'` + +## Introduction + +The term Database Management as used here, refers to creating and maintaining [partitioned kdb+ databases](https://code.kx.com/q/kb/partition/). Go to [Q for Mortals](https://code.kx.com/q4m3/14_Introduction_to_Kdb+/#143-partitioned-tables) for more in-depth information about partitioned databases in kdb+. + +A kdb+ database consists of one or more tables saved on-disk, where they are split into separate folders called partitions. These partitions are most often based on a temporal field within the dataset, such as date or month. Each table within the database must follow the same partition structure. + +We recommend using partitioned databases when the volume of data being handled exceeds ~100 million records. + +## Functional walkthrough + +This walkthrough will demonstrate the following steps: + +1. Creating a database from a historical dataset. +1. 
Adding a new partition to the database. +1. Managing the on-disk database by: + 1. Renaming a table and column + 1. Creating a copy of a column to the database + 1. Applying a Python function to a column of the database + 1. Updating the data type of a column +1. Adding a new table to the most recent partition of the database. + +All integrations with the `Database Management` functionality are facilitated through use of the `pykx.DB` class. To follow along with the example outlined below you can use the [companion notebook](../examples/db-management.ipynb). This uses a more complex table but runs the same commands. For full information on the functions available you can reference the [API section](../api/db.md). + +### Creating a database + +Create a dataset containing time-series data with multiple dates, and columns of various types: + +```python +>>> import pykx as kx +>>> from datetime import date +>>> N = 100000 +>>> dataset = kx.Table(data={ +... 'date': kx.random.random(N, [date(2020, 1, 1), date(2020, 1, 2)]), +... 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']), +... 'price': kx.random.random(N, 10.0) +... }) +``` + +Initialise the `DB` class. The expected input is the file path where you intend to save the partitioned database and its associated tables. + +```python +>>> db = kx.DB(path = 'db') +``` + +Create the database using the `date` column as the partition, and add `dataset` as a table called `trade_data` within it. + +```python +>>> db.create(dataset, 'trade_data', 'date', by_field = 'sym', sym_enum = 'symcol') +Writing Database Partition 2020.01.01 to table trade_data +Writing Database Partition 2020.01.02 to table trade_data +``` + +This now exists as a table and is saved to disk. + +```python +>>> db.tables +['trade_data'] +``` + +When a table is saved, an attribute is added to the `db` class for it. 
For our newly generated table, this is `db.trade_data` + +```python +>>> db.trade_data +pykx.PartitionedTable(pykx.q(' +date sym price +------------------------- +2020.01.01 AAPL 7.055037 +2020.01.01 AAPL 3.907669 +2020.01.01 AAPL 2.20948 +2020.01.01 AAPL 7.839242 +2020.01.01 AAPL 0.8549648 +.. +') +``` + +### Adding a new partition to the database + +Once a table has been generated, you can add more partitions to the database through reuse of the `create` method. In this case we are adding the new partition `2020.01.03` to the database. + +```python +>>> N = 10000 +>>> dataset = kx.Table(data={ +... 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']), +... 'price': kx.random.random(N, 10.0) +... }) +>>> db.create(dataset, 'trade_data', date(2020, 1, 3), by_field = 'sym', sym_enum = 'symcol') +Writing Database Partition 2020.01.03 to table trade_data +``` + +### Managing the database + +This section covers updating the contents of a database. We will continue using the table created in the [Creating a database](#creating-a-database) section above. + +The name of a table can be updated using the `rename_table` method. Below, we are updating the table `trade_data` to be called `trade`. + +```python +>>> db.rename_table('trade_data', 'trade') +2023.12.08 09:54:22 renaming :/tmp/db/2020.01.01/trade_data to :/tmp/db/2020.01.01/trade +2023.12.08 09:54:22 renaming :/tmp/db/2020.01.02/trade_data to :/tmp/db/2020.01.02/trade +2023.12.08 09:54:22 renaming :/tmp/db/2020.01.03/trade_data to :/tmp/db/2020.01.03/trade +``` + +During the rename process, the attribute in the `db` class is also updated. + +```python +>>> db.trade +pykx.PartitionedTable(pykx.q(' +date sym price +------------------------- +2020.01.01 AAPL 7.055037 +2020.01.01 AAPL 3.907669 +2020.01.01 AAPL 2.20948 +2020.01.01 AAPL 7.839242 +2020.01.01 AAPL 0.8549648 +.. +') +``` + +Renaming a column in a table is achieved using the `rename_column` method. 
For example, let's update the `sym` column in the `trade` table to be called `ticker`. + +```python +>>> db.rename_column('trade', 'sym', 'ticker') +2023.12.08 10:06:27 renaming sym to ticker in `:/tmp/db/2020.01.01/trade +2023.12.08 10:06:27 renaming sym to ticker in `:/tmp/db/2020.01.02/trade +2023.12.08 10:06:27 renaming sym to ticker in `:/tmp/db/2020.01.03/trade +``` + +To safely apply a function to modify the `price` column within the database, first create a copy of the column. + +```python +>>> db.copy_column('trade', 'price', 'price_copy') +2023.12.08 10:14:54 copying price to price_copy in `:/tmp/db/2020.01.01/trade +2023.12.08 10:14:54 copying price to price_copy in `:/tmp/db/2020.01.02/trade +2023.12.08 10:14:54 copying price to price_copy in `:/tmp/db/2020.01.03/trade +``` + +You can now apply a function to the copied column without the risk of losing the original data. Below we are modifying the copied column by multiplying the contents by 2. + +```python +>>> db.apply_function('trade', 'price_copy', lambda x: 2*x) +2023.12.08 10:18:18 resaving column price_copy (type 9) in `:/tmp/db/2020.01.01/trade +2023.12.08 10:18:18 resaving column price_copy (type 9) in `:/tmp/db/2020.01.02/trade +2023.12.08 10:18:18 resaving column price_copy (type 9) in `:/tmp/db/2020.01.03/trade +>>> db.trade +pykx.PartitionedTable(pykx.q(' +date ticker price price_copy +-------------------------------------- +2020.01.01 AAPL 7.055037 14.11007 +2020.01.01 AAPL 3.907669 7.815337 +2020.01.01 AAPL 2.20948 4.418959 +2020.01.01 AAPL 7.839242 15.67848 +2020.01.01 AAPL 0.8549648 1.70993 +.. +') +``` + +Once you are happy with the new values within the `price_copy` column, you can safely delete the `price` column, then rename the `price_copy` column to be called `price`. 
+ +```python +>>> db.delete_column('trade', 'price') +2023.12.08 10:20:02 deleting column price from `:/tmp/db/2020.01.01/trade +2023.12.08 10:20:02 deleting column price from `:/tmp/db/2020.01.02/trade +2023.12.08 10:20:02 deleting column price from `:/tmp/db/2020.01.03/trade +>>> db.rename_column('trade', 'price_copy', 'price') +2023.12.08 10:06:27 renaming price_copy to price in `:/tmp/db/2020.01.01/trade +2023.12.08 10:06:27 renaming price_copy to price in `:/tmp/db/2020.01.02/trade +2023.12.08 10:06:27 renaming price_copy to price in `:/tmp/db/2020.01.03/trade +>>> db.trade +pykx.PartitionedTable(pykx.q(' +date ticker price +-------------------------- +2020.01.01 AAPL 14.11007 +2020.01.01 AAPL 7.815337 +2020.01.01 AAPL 4.418959 +2020.01.01 AAPL 15.67848 +2020.01.01 AAPL 1.70993 +.. +') +``` + +To convert the data type of a column, you can use the `set_column_type` method. Currently the `price` column is the type `FloatAtom`. We will update this to be a type `RealAtom`. + +```python +>>> db.set_column_type('trade', 'price', kx.RealAtom) +2023.12.08 10:28:28 resaving column price (type 8) in `:/tmp/db/2020.01.01/trade +2023.12.08 10:28:28 resaving column price (type 8) in `:/tmp/db/2020.01.02/trade +2023.12.08 10:28:28 resaving column price (type 8) in `:/tmp/db/2020.01.03/trade +``` + +### Adding a new table to the database + +Now that you have successfully set up one table, you may want to add a second table named `quotes`. In this example, the `quotes` table only contains data for `2020.01.03`. We follow the same method as before and create the `quotes` table using the `create` method + +```python +>>> quotes = kx.Table(data={ +... 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']), +... 'open': kx.random.random(N, 10.0), +... 'high': kx.random.random(N, 10.0), +... 'low': kx.random.random(N, 10.0), +... 'close': kx.random.random(N, 10.0) +... 
}) +>>> db.create(quotes, 'quotes', date(2020, 1, 3), by_field = 'sym', sym_enum = 'symcol') +Writing Database Partition 2020-01-03 to table quotes +``` + +As mentioned in the introduction, all tables within a database must contain the same partition structure. To ensure the new table can be accessed, the quotes table needs to exist in every partition within the database, even if there is no data for that partition. This is called backfilling data. For the partitions where the `quotes` table is missing, we use the `fill_database` method. + +```python +>>> db.fill_database() +Successfully filled missing tables to partition: :/tmp/db/2020.01.01 +Successfully filled missing tables to partition: :/tmp/db/2020.01.02 +``` + +Now that the database has resolved the missing tables within the partitions, we can view the new `quotes` table + +```python +>>> db.quotes +pykx.PartitionedTable(pykx.q(' +date sym open high low close +------------------------------------------------------- +2020.01.03 AAPL 7.456644 7.217498 5.012176 3.623649 +2020.01.03 AAPL 6.127973 0.4229592 7.450608 5.651364 +2020.01.03 AAPL 8.147475 4.459108 3.493555 5.78803 +2020.01.03 AAPL 5.812028 7.81659 5.395469 8.424176 +2020.01.03 AAPL 8.519148 1.18101 6.684017 8.376375 +.. +') +``` + +Finally, to view the amount of saved data you can count the number of rows per partition using `partition_count` + +```python +>>> db.partition_count() +pykx.Dictionary(pykx.q(' + | quotes trade +----------| ------------- +2020.01.01| 0 49859 +2020.01.02| 0 50141 +2020.01.03| 100000 100000 +')) +``` diff --git a/docs/beta-features/index.md b/docs/beta-features/index.md new file mode 100644 index 0000000..ca6fecc --- /dev/null +++ b/docs/beta-features/index.md @@ -0,0 +1,52 @@ +# Beta Features + +## What is a Beta Feature? 
+ +As used commonly within software development "Beta Features" within PyKX describe features which have completed an initial development process phase and are being released in an opt-in manner to users of PyKX wishing to test these features. These features are not intended to be for production use while in beta and are subject to change prior to release as full features. Usage of these features will not effect the default behaviour of the library outside of the scope of the new functionality being added. + +Feedback on Beta Feature development is incredibly helpful and helps to determine when these features are promoted to fully supported production features. If you run into any issues while making use of these features please raise an issue on the PyKX Github [here](https://github.com/KxSystems/pykx/issues). + +## How do I enable Beta Features? + +Within PyKX beta features are enabled through the use of a configuration/environment variable `PYKX_BETA_FEATURES`, within a Python session users can set this prior to importing PyKX as shown below, note that when enabled you will be able to see what features are in beta through access of `kx.beta_features`: + +```python +>>> import os +>>> os.environ['PYKX_BETA_FEATURES'] = 'True' +>>> import pykx as kx +>>> kx.beta_features +['Database Management', 'Remote Functions'] +``` + +Alternatively you can set beta features to be available at all times by adding `PYKX_BETA_FEATURES` to your `.pykx-config` file as outlined [here](../user-guide/configuration.md#configuration-file). An example of a configuration making use of this is as follows: + +```bash +[default] +PYKX_KEEP_LOCAL_TIMES='true' + +[beta] +PYKX_BETA_FEATURES='true' +``` + +## What Beta Features are available? 
+ +As mentioned above the list of available features to a user is contained within the `beta_features` property, for users with these features available you can get access to this information as follows within a Python session + +```python +>>> import pykx as kx +>>> kx.beta_features +['Database Management', 'Remote Functions'] +``` + +The following are the currently available beta features: + +- [Database Management](db-management.md) provides users with the ability to create, load and maintain databases and their associated tables including but not limited to: + + - Database table creation and renaming. + - Enumeration of in-memory tables against on-disk sym file. + - Column listing, addition, reordering, renaming copying, function application and deletion on-disk. + - Attribute setting and removal. + - Addition of missing tables from partitions within a database. + +- [Remote Functions](remote-functions.md) let you define functions in Python which interact directly with kdb+ data on a q process. These functions can seamlessly integrate into existing Python infrastructures and also benefit systems that use q processes over Python for performance reasons or as part of legacy applications. +- [PyKX Threading](threading.md) provides users with the ability to call into `EmbeddedQ` from multithreaded python programs and allow any thread to modify global state safely. diff --git a/docs/beta-features/remote-functions.md b/docs/beta-features/remote-functions.md new file mode 100644 index 0000000..41145f8 --- /dev/null +++ b/docs/beta-features/remote-functions.md @@ -0,0 +1,132 @@ +# Remote Function Execution + +!!! Warning + + This module is a Beta Feature and is subject to change. 
To enable this functionality for testing please follow the configuration instructions [here](../user-guide/configuration.md) setting `PYKX_BETA_FEATURES='true'` + +## Introduction + +Remote Functions let you define Python functions within your Python environment which can interact with kdb+ data on a q process. Once defined, these functions are registered to a [remote session object]() along with any Python dependencies which need to be imported. The [remote session object]() establishes and manages the remote connection to the kdb+/q server. + +To execute kdb+/q functions using PyKX, please see [PyKX under q](../pykx-under-q/intro.html) + +## Requirements and limitations + +To run this functionality, the kdb+/q server you connect to must have the ability to load PyKX under q. It is your responsibility to ensure the version and existence of Python library dependencies are correct in your kdb+/q environment at runtime. + +Users must additionally ensure that they have all Python requirements installed on the client server, in particular `dill>=0.2` is required for this functionality. + +It can be installed using the following command: + +```bash +pip install pykx[beta] +``` + +## Functional walkthrough + +This walkthrough will demonstrate the following steps: + +1. Initialize a q/kdb+ server loading PyKX under q on a specified port. +1. Import PyKX and generate a remote session object which denotes the process against which the Python functions will be executed +1. Define a number of Python functions which will be executed on the remote q/kdb+ server. + +### Initializing a q/kdb+ server with PyKX under q + +This step ensures you have a q process running with PyKX under q, as well as having a kdb+ table available to query. If you have this already, proceed to the next step. + +Ensure that you have q installed. 
If you do not have this installed please follow the guide provided [here](https://code.kx.com/q/learn/install/), retrieving your license following the instructions provided [here](https://kx.com/kdb-insights-personal-edition-license-download). + +Install PyKX under q using the following command. + +```bash +python -c "import pykx;pykx.install_into_QHOME()" +``` + +Start the q process to which you will execute your functions. + +```bash +q pykx.q -p 5050 +``` + +Create a table which you will use within your Python analytics defined below. + +```q +q)N:1000 +q)tab:([]sym:N?`AAPL`MSFT`GOOG`FDP;price:100+N?100f;size:10+N?100) +``` + +Set a requirement for users to provide a username/password if you wish to add security to your q process. + +```q +.z.pw:{[u;p]$[(u~`user)&p~`password;1b;0b]} +``` + +### Import PyKX and create a session + +Create a session object from a Python environment of your choice, which establishes and manages the remote connection to the kdb+/q server. + +```python +>>> import os +>>> os.environ['PYKX_BETA_FEATURES'] = 'true' +>>> from pykx.remote import session +>>> remote_session = session() +>>> remote_session.create(host='localhost', port=5050, username='user', password='password') +``` + +### Defining and Executing Python functions using a session + +Tag the Python functions you want to run on the remote server using the `kx.remote.function` decorator. This registers the functions on the `remote_session` object you have just created. + +=== "Single Argument Function" + + ```python + >>> from pykx.remote import function + >>> @function(remote_session) + ... def single_arg_function(x): + ... return x+10 + >>> single_arg_function(10) + pykx.LongAtom(pykx.q('20')) + ``` + +=== "Multi Argument Function" + + ```python + >>> from pykx.remote import function + >>> @function(remote_session) + ... def multi_arg_function(x, y): + ... 
return x+y + >>> multi_arg_function(10, 20) + pykx.LongAtom(pykx.q('30')) + ``` + +Add any Python libraries which need to be available when executing the function(s) you have just defined. You can achieve this in two ways: + +1. Using `session.add_library` to import required libraries before defining your function +1. Importing libraries within the body of the function being executed + +Both examples can be seen below + +=== "Library addition functionality" + + ```python + >>> remote_session.add_library('numpy', 'pykx') + >>> @function(remote_session) + ... def dependent_function(x, y, z): + ... return pykx.q.mavg(4, numpy.linspace(x, y, z)) + >>> dependent_function(0, 10, 10) + pykx.FloatVector(pykx.q('0 0.5555556 1.111111 2.222222 3...')) + ``` + +=== "Defining imports within function body" + + ```python + >>> @function(remote_session) + ... def dependent_function(x, y, z): + ... import pykx as kx + ... import numpy as np + ... return kx.q.mavg(4, np.linspace(x, y, z)) + >>> dependent_function(0, 10, 10) + pykx.FloatVector(pykx.q('0 0.5555556 1.111111 2.222222 3...')) + ``` + +While both are valid, we suggest using `add_library` as it allows for pre-checking of the libraries prior to definition of the function and will be expanded over time to include additional validation. diff --git a/docs/beta-features/threading.md b/docs/beta-features/threading.md new file mode 100644 index 0000000..eaefb4e --- /dev/null +++ b/docs/beta-features/threading.md @@ -0,0 +1,64 @@ +# Multi-Threaded Execution + +!!! Warning + + This module is a Beta Feature and is subject to change. To enable this functionality for testing please follow the configuration instructions [here](../user-guide/configuration.md) setting `PYKX_BETA_FEATURES='true'` and `PYKX_THREADING='true'`. 
+ +## Introduction + +One major limitation of `EmbeddedQ` when using python with multi-threading is that only the main +thread (the thread that imports PyKX and loads `libq`) is allowed to modify state within `EmbeddedQ`. +However if you wanted to use one of Pythons multi-threading libraries whether that is the `threading` +library or `asyncio` or any other library that allows Python to utilise multiple threads at once, +and have those threads modify state in some way; whether that be to upsert a row to a global table, +open `QConnection` instances or any other use case that requires the threads to modify state. You +would not be able to do that by default in PyKX. + +This beta feature allows these use cases to become possible by spawning a background thread that all +calls into `EmbeddedQ` will be run on. This background thread is created at the `C` level using +`libpthread` with lightweight future objects to ensure the lowest overhead possible for passing +calls onto a secondary thread. This allows multi-threaded programs to modify state within the spawned +threads safely, without losing out on performance. + + +!!! Note + + While using `PyKX Threading` it is not possible to also use the functionality within `pykx.q`, + it is also not possible to have q call back into Python. + +## How to enable + +This beta feature requires an extra opt-in step. While the overhead for offloading calls onto a secondary +thread is low, there will always be a cost to forcing a thread context switch to process a call into +`EmbeddedQ`. Therefore you will need to enable both the `PYKX_BETA_FEATURES` environment variable as +well as the `PYKX_THREADING` environment variable. + +!!! Warning + + Because using `PyKX Threading` spawns a background thread to run all queries to `EmbeddedQ`, you + must ensure that you call `kx.shutdown_thread()` at the end of your script to ensure that this + background thread is properly shutdown at the end. 
If you fail to do this the background thread will + be left running after the script is finished. The best way to ensure this always happens is to start + a main function for your script within a `try` - `finally` block. + + +```Python +import os +import asyncio +os.environ['PYKX_THREADING'] = '1' +os.environ['PYKX_BETA_FEATURES'] = '1' +import pykx as kx + +def main(): # Your scripts entry point + ... + +if __name__ == '__main__': + try: + main() + finally: + kx.shutdown_thread() # This will be called if the script completes normally or errors early +``` + +## More complete examples + +More examples showing this functionality in use can be found [here](../examples/threaded_execution/threading.md). diff --git a/docs/contributors.md b/docs/contributors.md new file mode 100644 index 0000000..e44d1d7 --- /dev/null +++ b/docs/contributors.md @@ -0,0 +1,23 @@ +# Contributors + +The aim of this page is to include a list of the contributors to our project both internal and external to KX. If you wish to contribute to the project please open a pull request to our project [here](https://github.com/KxSystems/pykx/pulls). 
+ +## Internal Development Team (Current and Past) + +- [Conor McCarthy](https://github.com/cmccarthy1) +- [Kian Shepherd](https://github.com/kshepherdkx) +- [Rian Ó Cuinneagáin](https://github.com/rianoc-kx) +- [Reuben Taylor](https://github.com/roobxyz) +- [Bruno Le Hyaric](https://github.com/bu2) +- [Will Da Silva](https://github.com/WillDaSilva) +- [Matt Maynes](https://github.com/mattmaynes) +- [Tim Thornton](https://github.com/igorii) +- Siobhán Stevenson +- Andy McDonald +- Sean Foden + +## External Contributors + +- [neutropolis](https://github.com/neutropolis) +- [nipsn](https://github.com/nipsn) +- [marcosvm13](https://github.com/marcosvm13) diff --git a/docs/examples/db-management.ipynb b/docs/examples/db-management.ipynb new file mode 100644 index 0000000..b58935d --- /dev/null +++ b/docs/examples/db-management.ipynb @@ -0,0 +1,2728 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "015ba887", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook provides a walkthrough of some of the functionality available for users looking to create and maintain large databases using PyKX.\n", + "\n", + "In particular, this notebook refers to creating and maintaining [partitioned kdb+ databases](https://code.kx.com/q/kb/partition/). Go to [Q for Mortals](https://code.kx.com/q4m3/14_Introduction_to_Kdb+/#143-partitioned-tables) for more in-depth information about partitioned databases in kdb+.\n", + "\n", + "You can download this walkthrough as a `.ipynb` notebook file using the following link.", + "\n", + "This walkthrough provides examples of the following tasks:\n", + "\n", + "1. Creating a database from a historical dataset\n", + "1. Adding a new partition to the database\n", + "1. Managing the on-disk database by:\n", + " 1. Renaming a table and column\n", + " 2. Creating a copy of a column to the database\n", + " 3. Applying a Python function to a column of the database\n", + " 4. Updating the data type of a column\n", + "1. 
Adding a new table to the most recent partition of the database\n", + "\n", + "For full information on the functions available you can reference the [API section](https://code.kx.com/pykx/api/db.html).\n", + "\n", + "---\n", + "\n", + "## Initial setup\n", + "\n", + "Import all required libraries and create a temporary directory which will be used to store the database we create for this walkthrough" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0afee62a", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['PYKX_BETA_FEATURES'] = 'true'\n", + "\n", + "import pykx as kx\n", + "from datetime import date\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "64c18054", + "metadata": {}, + "outputs": [], + "source": [ + "tempdir = tempfile.TemporaryDirectory()" + ] + }, + { + "cell_type": "markdown", + "id": "2e91160e", + "metadata": {}, + "source": [ + "Database interactions are facilitated through use of the `pykx.DB` class. All methods/attributes used in this notebook are contained within this class. \n", + "\n", + "Initialise the `DB` class to start. The expected input is the file path where you intend to save the partitioned database and its associated tables. In this case we're going to use the temporary directory we just created. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "90d9eac3", + "metadata": {}, + "outputs": [], + "source": [ + "db = kx.DB(path = tempdir.name + '/db')" + ] + }, + { + "cell_type": "markdown", + "id": "143e0886", + "metadata": {}, + "source": [ + "For details on any methods contained within this class, you can use the `help` method. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0e817132", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on method create in module pykx.db:\n", + "\n", + "create(table, table_name, partition, *, by_field=None, sym_enum=None, log=True) method of pykx.db.DB instance\n", + " Create an on-disk partitioned table within a kdb+ database from a supplied\n", + " `pykx.Table` object. Once generated this table will be accessible\n", + " as an attribute of the `DB` class or a sub attribute of `DB.table`.\n", + " \n", + " Parameters:\n", + " table: The `pykx.Table` object which is to be persisted to disk\n", + " table_name: The name with which the table will be persisted and accessible\n", + " once loaded and available as a `pykx.PartitionedTable`\n", + " partition: The name of the column which is to be used to partition the data if\n", + " supplied as a `str` or if supplied as non string object this will be used as\n", + " the partition to which all data is persisted\n", + " by_field: A field of the table to be used as a by column, this column will be\n", + " the second column in the table (the first being the virtual column determined\n", + " by the partitioning column)\n", + " sym_enum: The name of the symbol enumeration table to be associated with the table\n", + " log: Print information about status of partitioned datab\n", + " \n", + " Returns:\n", + " A `None` object on successful invocation, the database class will be\n", + " updated to contain attributes associated with the available created table\n", + " \n", + " Examples:\n", + " \n", + " Generate a partitioned table from a table containing multiple partitions\n", + " \n", + " ```python\n", + " >>> import pykx as kx\n", + " >>> db = kx.DB(path = 'newDB')\n", + " >>> N = 1000\n", + " >>> qtab = kx.Table(data = {\n", + " ... 'date': kx.q.asc(kx.random.random(N, kx.q('2020.01 2020.02 2020.03'))),\n", + " ... 
'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']),\n", + " ... 'price': kx.random.random(N, 10.0),\n", + " ... 'size': kx.random.random(N, 100)\n", + " ... })\n", + " >>> db.create(qtab, 'stocks', 'date', by_field = 'sym', sym_enum = 'symbols')\n", + " >>> db.tables\n", + " ['stocks']\n", + " >>> db.stocks\n", + " pykx.PartitionedTable(pykx.q('\n", + " month sym price size\n", + " ---------------------------\n", + " 2020.01 AAPL 7.979004 85\n", + " 2020.01 AAPL 5.931866 55\n", + " 2020.01 AAPL 5.255477 49\n", + " 2020.01 AAPL 8.15255 74\n", + " 2020.01 AAPL 4.771067 80\n", + " ..\n", + " '))\n", + " ```\n", + " \n", + " Add a table as a partition to an on-disk database, in the example below we are adding\n", + " a partition to the table generated above\n", + " \n", + " ```python\n", + " >>> import pykx as kx\n", + " >>> db = kx.DB(path = 'newDB')\n", + " >>> N = 333\n", + " >>> qtab = kx.Table(data = {\n", + " ... 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']),\n", + " ... 'price': kx.random.random(N, 10.0),\n", + " ... 'size': kx.random.random(N, 100)\n", + " ... 
})\n", + " >>> db.create(qtab, 'stocks', kx.q('2020.04'), by_field = 'sym', sym_enum = 'symbols')\n", + " >>> db.tables\n", + " ['stocks']\n", + " >>> db.stocks\n", + " pykx.PartitionedTable(pykx.q('\n", + " month sym price size\n", + " ---------------------------\n", + " 2020.01 AAPL 7.979004 85\n", + " 2020.01 AAPL 5.931866 55\n", + " 2020.01 AAPL 5.255477 49\n", + " 2020.01 AAPL 8.15255 74\n", + " 2020.01 AAPL 4.771067 80\n", + " ..\n", + " '))\n", + " ```\n", + "\n" + ] + } + ], + "source": [ + "help(db.create)" + ] + }, + { + "cell_type": "markdown", + "id": "607599f8", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "3516ab83", + "metadata": {}, + "source": [ + "## Create the sample dataset\n", + "\n", + "Create a dataset called `trades` containing time-series data spanning multiple dates, and columns of various types:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "686441cc", + "metadata": {}, + "outputs": [], + "source": [ + "N = 1000000\n", + "trades = kx.Table(data={\n", + " 'date': kx.random.random(N, [date(2020, 1, 1), date(2020, 1, 2)]),\n", + " 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']),\n", + " 'price': kx.random.random(N, 10.0),\n", + " 'size': kx.random.random(N, 1000)\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "d0529e7c", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "0738729d", + "metadata": {}, + "source": [ + "## Create the database" + ] + }, + { + "cell_type": "markdown", + "id": "0fb4659b", + "metadata": {}, + "source": [ + "Create the database using the `date` column as the partition, and add `trades` as a table called `trade_data` within it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "db8b9a04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing Database Partition 2020.01.01 to table trade_data\n", + "Writing Database Partition 2020.01.02 to table trade_data\n" + ] + } + ], + "source": [ + "db.create(trades, 'trade_data', 'date')" + ] + }, + { + "cell_type": "markdown", + "id": "ad2fa6f9", + "metadata": {}, + "source": [ + "This now exists as a table and is saved to disk." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "82796fbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['trade_data']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.tables" + ] + }, + { + "cell_type": "markdown", + "id": "c0ecec19", + "metadata": {}, + "source": [ + "When a table is saved, an attribute is added to the `db` class for it. For our newly generated table, this is `db.trade_data`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "29606b7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesympricesize
02020.01.01MSFT7.079266800
12020.01.01AAPL1.82432165
22020.01.01MSFT2.408259292
32020.01.01GOOG1.6754387
42020.01.01AAPL8.311168183
52020.01.01AAPL2.208693989
62020.01.01MSFT6.068126567
72020.01.01AAPL4.918926794
82020.01.01AAPL9.33186939
92020.01.01AAPL1.142611507
102020.01.01AAPL2.685874581
112020.01.01AAPL3.483591163
122020.01.01AAPL0.4422525466
132020.01.01MSFT7.406654976
142020.01.01MSFT2.493871171
152020.01.01AAPL9.24208828
162020.01.01MSFT0.3954522747
172020.01.01MSFT0.3441191512
182020.01.01GOOG9.662762998
192020.01.01AAPL9.601674812
202020.01.01AAPL4.969858910
212020.01.01GOOG1.048204830
222020.01.01GOOG0.9817644595
...............
9999992020.01.02GOOG1.470716636
\n", + "

1,000,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date sym price size\n", + "------------------------------\n", + "2020.01.01 MSFT 7.079266 800 \n", + "2020.01.01 AAPL 1.824321 65 \n", + "2020.01.01 MSFT 2.408259 292 \n", + "2020.01.01 GOOG 1.675438 7 \n", + "2020.01.01 AAPL 8.311168 183 \n", + "2020.01.01 AAPL 2.208693 989 \n", + "2020.01.01 MSFT 6.068126 567 \n", + "2020.01.01 AAPL 4.918926 794 \n", + "2020.01.01 AAPL 9.331869 39 \n", + "2020.01.01 AAPL 1.142611 507 \n", + "2020.01.01 AAPL 2.685874 581 \n", + "2020.01.01 AAPL 3.483591 163 \n", + "2020.01.01 AAPL 0.4422525 466 \n", + "2020.01.01 MSFT 7.406654 976 \n", + "2020.01.01 MSFT 2.493871 171 \n", + "2020.01.01 AAPL 9.242088 28 \n", + "2020.01.01 MSFT 0.3954522 747 \n", + "2020.01.01 MSFT 0.3441191 512 \n", + "2020.01.01 GOOG 9.662762 998 \n", + "2020.01.01 AAPL 9.601674 812 \n", + "..\n", + "'))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trade_data" + ] + }, + { + "cell_type": "markdown", + "id": "5ed4224e", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "889dfb46", + "metadata": {}, + "source": [ + "## Add a new partition to the database\n", + "\n", + "Once a table has been generated, you can add more partitions to the database through reuse of the `create` method. In this case we are adding the new partition `2020.01.03` to the database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7cce4947", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing Database Partition 2020-01-03 to table trade_data\n" + ] + } + ], + "source": [ + "N = 10000\n", + "new_day = kx.Table(data={\n", + " 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']),\n", + " 'price': kx.random.random(N, 10.0),\n", + " 'size': kx.random.random(N, 100)\n", + "})\n", + "db.create(new_day, 'trade_data', date(2020, 1, 3))" + ] + }, + { + "cell_type": "markdown", + "id": "e24ecc1d", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "09f0bd28", + "metadata": {}, + "source": [ + "## Manage the database\n", + "\n", + "This section covers updating the contents of a database. The examples below demonstrate a number of common tasks that would be completed regularly when updating a database.\n", + "\n", + "The name of a table can be updated using the `rename_table` method. Below, we are updating the table `trade_data` to be called `trade`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ae9d244b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:14:22 renaming :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trade_data to :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:22 renaming :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trade_data to :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:22 renaming :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trade_data to :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.rename_table('trade_data', 'trades')" + ] + }, + { + "cell_type": "markdown", + "id": "5edc2eba", + "metadata": {}, + "source": [ + "During the rename process, the attribute in the `db` class is also updated. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "00eaf253", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesympricesize
02020.01.01MSFT7.079266800
12020.01.01AAPL1.82432165
22020.01.01MSFT2.408259292
32020.01.01GOOG1.6754387
42020.01.01AAPL8.311168183
52020.01.01AAPL2.208693989
62020.01.01MSFT6.068126567
72020.01.01AAPL4.918926794
82020.01.01AAPL9.33186939
92020.01.01AAPL1.142611507
102020.01.01AAPL2.685874581
112020.01.01AAPL3.483591163
122020.01.01AAPL0.4422525466
132020.01.01MSFT7.406654976
142020.01.01MSFT2.493871171
152020.01.01AAPL9.24208828
162020.01.01MSFT0.3954522747
172020.01.01MSFT0.3441191512
182020.01.01GOOG9.662762998
192020.01.01AAPL9.601674812
202020.01.01AAPL4.969858910
212020.01.01GOOG1.048204830
222020.01.01GOOG0.9817644595
...............
10099992020.01.03AAPL9.75038799
\n", + "

1,010,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date sym price size\n", + "------------------------------\n", + "2020.01.01 MSFT 7.079266 800 \n", + "2020.01.01 AAPL 1.824321 65 \n", + "2020.01.01 MSFT 2.408259 292 \n", + "2020.01.01 GOOG 1.675438 7 \n", + "2020.01.01 AAPL 8.311168 183 \n", + "2020.01.01 AAPL 2.208693 989 \n", + "2020.01.01 MSFT 6.068126 567 \n", + "2020.01.01 AAPL 4.918926 794 \n", + "2020.01.01 AAPL 9.331869 39 \n", + "2020.01.01 AAPL 1.142611 507 \n", + "2020.01.01 AAPL 2.685874 581 \n", + "2020.01.01 AAPL 3.483591 163 \n", + "2020.01.01 AAPL 0.4422525 466 \n", + "2020.01.01 MSFT 7.406654 976 \n", + "2020.01.01 MSFT 2.493871 171 \n", + "2020.01.01 AAPL 9.242088 28 \n", + "2020.01.01 MSFT 0.3954522 747 \n", + "2020.01.01 MSFT 0.3441191 512 \n", + "2020.01.01 GOOG 9.662762 998 \n", + "2020.01.01 AAPL 9.601674 812 \n", + "..\n", + "'))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trades" + ] + }, + { + "cell_type": "markdown", + "id": "4c44fab2", + "metadata": {}, + "source": [ + "Renaming a column in a table is achieved using the `rename_column` method. For example, let's update the `sym` column in the `trade` table to be called `ticker`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1c52d0b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:14:25 renaming sym to ticker in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:25 renaming sym to ticker in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:25 renaming sym to ticker in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.rename_column('trades', 'sym', 'ticker')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b03c5c17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetickerpricesize
02020.01.01MSFT7.079266800
12020.01.01AAPL1.82432165
22020.01.01MSFT2.408259292
32020.01.01GOOG1.6754387
42020.01.01AAPL8.311168183
52020.01.01AAPL2.208693989
62020.01.01MSFT6.068126567
72020.01.01AAPL4.918926794
82020.01.01AAPL9.33186939
92020.01.01AAPL1.142611507
102020.01.01AAPL2.685874581
112020.01.01AAPL3.483591163
122020.01.01AAPL0.4422525466
132020.01.01MSFT7.406654976
142020.01.01MSFT2.493871171
152020.01.01AAPL9.24208828
162020.01.01MSFT0.3954522747
172020.01.01MSFT0.3441191512
182020.01.01GOOG9.662762998
192020.01.01AAPL9.601674812
202020.01.01AAPL4.969858910
212020.01.01GOOG1.048204830
222020.01.01GOOG0.9817644595
...............
10099992020.01.03AAPL9.75038799
\n", + "

1,010,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date ticker price size\n", + "--------------------------------\n", + "2020.01.01 MSFT 7.079266 800 \n", + "2020.01.01 AAPL 1.824321 65 \n", + "2020.01.01 MSFT 2.408259 292 \n", + "2020.01.01 GOOG 1.675438 7 \n", + "2020.01.01 AAPL 8.311168 183 \n", + "2020.01.01 AAPL 2.208693 989 \n", + "2020.01.01 MSFT 6.068126 567 \n", + "2020.01.01 AAPL 4.918926 794 \n", + "2020.01.01 AAPL 9.331869 39 \n", + "2020.01.01 AAPL 1.142611 507 \n", + "2020.01.01 AAPL 2.685874 581 \n", + "2020.01.01 AAPL 3.483591 163 \n", + "2020.01.01 AAPL 0.4422525 466 \n", + "2020.01.01 MSFT 7.406654 976 \n", + "2020.01.01 MSFT 2.493871 171 \n", + "2020.01.01 AAPL 9.242088 28 \n", + "2020.01.01 MSFT 0.3954522 747 \n", + "2020.01.01 MSFT 0.3441191 512 \n", + "2020.01.01 GOOG 9.662762 998 \n", + "2020.01.01 AAPL 9.601674 812 \n", + "..\n", + "'))" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trades" + ] + }, + { + "cell_type": "markdown", + "id": "148207eb", + "metadata": {}, + "source": [ + "To safely apply a function to modify the `price` column within the database, first create a copy of the column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f7d2f116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:14:29 copying price to price_copy in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:29 copying price to price_copy in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:29 copying price to price_copy in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.copy_column('trades', 'price', 'price_copy')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9bad2096", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetickerpricesizeprice_copy
02020.01.01MSFT7.0792668007.079266
12020.01.01AAPL1.824321651.824321
22020.01.01MSFT2.4082592922.408259
32020.01.01GOOG1.67543871.675438
42020.01.01AAPL8.3111681838.311168
52020.01.01AAPL2.2086939892.208693
62020.01.01MSFT6.0681265676.068126
72020.01.01AAPL4.9189267944.918926
82020.01.01AAPL9.331869399.331869
92020.01.01AAPL1.1426115071.142611
102020.01.01AAPL2.6858745812.685874
112020.01.01AAPL3.4835911633.483591
122020.01.01AAPL0.44225254660.4422525
132020.01.01MSFT7.4066549767.406654
142020.01.01MSFT2.4938711712.493871
152020.01.01AAPL9.242088289.242088
162020.01.01MSFT0.39545227470.3954522
172020.01.01MSFT0.34411915120.3441191
182020.01.01GOOG9.6627629989.662762
192020.01.01AAPL9.6016748129.601674
202020.01.01AAPL4.9698589104.969858
212020.01.01GOOG1.0482048301.048204
222020.01.01GOOG0.98176445950.9817644
..................
10099992020.01.03AAPL9.750387999.750387
\n", + "

1,010,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date ticker price size price_copy\n", + "-------------------------------------------\n", + "2020.01.01 MSFT 7.079266 800 7.079266 \n", + "2020.01.01 AAPL 1.824321 65 1.824321 \n", + "2020.01.01 MSFT 2.408259 292 2.408259 \n", + "2020.01.01 GOOG 1.675438 7 1.675438 \n", + "2020.01.01 AAPL 8.311168 183 8.311168 \n", + "2020.01.01 AAPL 2.208693 989 2.208693 \n", + "2020.01.01 MSFT 6.068126 567 6.068126 \n", + "2020.01.01 AAPL 4.918926 794 4.918926 \n", + "2020.01.01 AAPL 9.331869 39 9.331869 \n", + "2020.01.01 AAPL 1.142611 507 1.142611 \n", + "2020.01.01 AAPL 2.685874 581 2.685874 \n", + "2020.01.01 AAPL 3.483591 163 3.483591 \n", + "2020.01.01 AAPL 0.4422525 466 0.4422525 \n", + "2020.01.01 MSFT 7.406654 976 7.406654 \n", + "2020.01.01 MSFT 2.493871 171 2.493871 \n", + "2020.01.01 AAPL 9.242088 28 9.242088 \n", + "2020.01.01 MSFT 0.3954522 747 0.3954522 \n", + "2020.01.01 MSFT 0.3441191 512 0.3441191 \n", + "2020.01.01 GOOG 9.662762 998 9.662762 \n", + "2020.01.01 AAPL 9.601674 812 9.601674 \n", + "..\n", + "'))" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trades" + ] + }, + { + "cell_type": "markdown", + "id": "3c63e2bb", + "metadata": {}, + "source": [ + "You can now apply a function to the copied column without the risk of losing the original data. Below we are modifying the copied column by multiplying the contents by 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "483a3b48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:14:31 resaving column price_copy (type 9) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:31 resaving column price_copy (type 9) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:31 resaving column price_copy (type 9) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.apply_function('trades', 'price_copy', kx.q('{2*x}'))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e5285600", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetickerpricesizeprice_copy
02020.01.01MSFT7.07926680014.15853
12020.01.01AAPL1.824321653.648642
22020.01.01MSFT2.4082592924.816519
32020.01.01GOOG1.67543873.350875
42020.01.01AAPL8.31116818316.62234
52020.01.01AAPL2.2086939894.417385
62020.01.01MSFT6.06812656712.13625
72020.01.01AAPL4.9189267949.837851
82020.01.01AAPL9.3318693918.66374
92020.01.01AAPL1.1426115072.285222
102020.01.01AAPL2.6858745815.371748
112020.01.01AAPL3.4835911636.967183
122020.01.01AAPL0.44225254660.8845049
132020.01.01MSFT7.40665497614.81331
142020.01.01MSFT2.4938711714.987742
152020.01.01AAPL9.2420882818.48418
162020.01.01MSFT0.39545227470.7909045
172020.01.01MSFT0.34411915120.6882382
182020.01.01GOOG9.66276299819.32552
192020.01.01AAPL9.60167481219.20335
202020.01.01AAPL4.9698589109.939716
212020.01.01GOOG1.0482048302.096408
222020.01.01GOOG0.98176445951.963529
..................
10099992020.01.03AAPL9.7503879919.50077
\n", + "

1,010,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date ticker price size price_copy\n", + "-------------------------------------------\n", + "2020.01.01 MSFT 7.079266 800 14.15853 \n", + "2020.01.01 AAPL 1.824321 65 3.648642 \n", + "2020.01.01 MSFT 2.408259 292 4.816519 \n", + "2020.01.01 GOOG 1.675438 7 3.350875 \n", + "2020.01.01 AAPL 8.311168 183 16.62234 \n", + "2020.01.01 AAPL 2.208693 989 4.417385 \n", + "2020.01.01 MSFT 6.068126 567 12.13625 \n", + "2020.01.01 AAPL 4.918926 794 9.837851 \n", + "2020.01.01 AAPL 9.331869 39 18.66374 \n", + "2020.01.01 AAPL 1.142611 507 2.285222 \n", + "2020.01.01 AAPL 2.685874 581 5.371748 \n", + "2020.01.01 AAPL 3.483591 163 6.967183 \n", + "2020.01.01 AAPL 0.4422525 466 0.8845049 \n", + "2020.01.01 MSFT 7.406654 976 14.81331 \n", + "2020.01.01 MSFT 2.493871 171 4.987742 \n", + "2020.01.01 AAPL 9.242088 28 18.48418 \n", + "2020.01.01 MSFT 0.3954522 747 0.7909045 \n", + "2020.01.01 MSFT 0.3441191 512 0.6882382 \n", + "2020.01.01 GOOG 9.662762 998 19.32552 \n", + "2020.01.01 AAPL 9.601674 812 19.20335 \n", + "..\n", + "'))" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trades" + ] + }, + { + "cell_type": "markdown", + "id": "a7db5560", + "metadata": {}, + "source": [ + "Once you are happy with the new values within the `price_copy` column, you can safely delete the `price` column, then rename the `price_copy` column to be called `price`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fbb0fe94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:14:33 deleting column price from `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:33 deleting column price from `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:33 deleting column price from `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n", + "2023.12.15 16:14:33 renaming price_copy to price in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:14:33 renaming price_copy to price in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:14:33 renaming price_copy to price in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.delete_column('trades', 'price')\n", + "db.rename_column('trades', 'price_copy', 'price')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2810b08f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datetickersizeprice
02020.01.01MSFT80014.15853
12020.01.01AAPL653.648642
22020.01.01MSFT2924.816519
32020.01.01GOOG73.350875
42020.01.01AAPL18316.62234
52020.01.01AAPL9894.417385
62020.01.01MSFT56712.13625
72020.01.01AAPL7949.837851
82020.01.01AAPL3918.66374
92020.01.01AAPL5072.285222
102020.01.01AAPL5815.371748
112020.01.01AAPL1636.967183
122020.01.01AAPL4660.8845049
132020.01.01MSFT97614.81331
142020.01.01MSFT1714.987742
152020.01.01AAPL2818.48418
162020.01.01MSFT7470.7909045
172020.01.01MSFT5120.6882382
182020.01.01GOOG99819.32552
192020.01.01AAPL81219.20335
202020.01.01AAPL9109.939716
212020.01.01GOOG8302.096408
222020.01.01GOOG5951.963529
...............
10099992020.01.03AAPL9919.50077
\n", + "

1,010,000 rows × 4 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date ticker size price \n", + "--------------------------------\n", + "2020.01.01 MSFT 800 14.15853 \n", + "2020.01.01 AAPL 65 3.648642 \n", + "2020.01.01 MSFT 292 4.816519 \n", + "2020.01.01 GOOG 7 3.350875 \n", + "2020.01.01 AAPL 183 16.62234 \n", + "2020.01.01 AAPL 989 4.417385 \n", + "2020.01.01 MSFT 567 12.13625 \n", + "2020.01.01 AAPL 794 9.837851 \n", + "2020.01.01 AAPL 39 18.66374 \n", + "2020.01.01 AAPL 507 2.285222 \n", + "2020.01.01 AAPL 581 5.371748 \n", + "2020.01.01 AAPL 163 6.967183 \n", + "2020.01.01 AAPL 466 0.8845049\n", + "2020.01.01 MSFT 976 14.81331 \n", + "2020.01.01 MSFT 171 4.987742 \n", + "2020.01.01 AAPL 28 18.48418 \n", + "2020.01.01 MSFT 747 0.7909045\n", + "2020.01.01 MSFT 512 0.6882382\n", + "2020.01.01 GOOG 998 19.32552 \n", + "2020.01.01 AAPL 812 19.20335 \n", + "..\n", + "'))" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.trades" + ] + }, + { + "cell_type": "markdown", + "id": "119a373b", + "metadata": {}, + "source": [ + "To convert the data type of a column, you can use the `set_column_type` method. Before we do that, we can look at the metadata information for the table using the `meta` method. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "45f01b75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tfa
c
date\"d\"
ticker\"s\"
size\"j\"
price\"f\"
" + ], + "text/plain": [ + "pykx.KeyedTable(pykx.q('\n", + "c | t f a\n", + "------| -----\n", + "date | d \n", + "ticker| s \n", + "size | j \n", + "price | f \n", + "'))" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kx.q.meta(db.trades)" + ] + }, + { + "cell_type": "markdown", + "id": "ffad39b1", + "metadata": {}, + "source": [ + "Currently the `size` column is the type `LongAtom`. We will update this to be a type `ShortAtom`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3706ad43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:20:03 resaving column size (type 5) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:20:03 resaving column size (type 5) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:20:03 resaving column size (type 5) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.set_column_type('trades', 'size', kx.ShortAtom)" + ] + }, + { + "cell_type": "markdown", + "id": "319317bf", + "metadata": {}, + "source": [ + "Now let's apply the `grouped` attribute to the size column. For more information on attributes in kdb+, please refer to the Q for Mortals [Attributes section](https://code.kx.com/q4m3/8_Tables/#88-attributes)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "fd550ac7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023.12.15 16:20:04 resaving column ticker (type 20) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01/trades\n", + "2023.12.15 16:20:04 resaving column ticker (type 20) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02/trades\n", + "2023.12.15 16:20:04 resaving column ticker (type 20) in `:/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.03/trades\n" + ] + } + ], + "source": [ + "db.set_column_attribute('trades', 'ticker', 'grouped')" + ] + }, + { + "cell_type": "markdown", + "id": "95e9a5a9", + "metadata": {}, + "source": [ + "Let's revisit the metadata of the table to ensure they have been applied correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "debf733d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tfa
c
date\"d\"
ticker\"s\"g
size\"h\"
price\"f\"
" + ], + "text/plain": [ + "pykx.KeyedTable(pykx.q('\n", + "c | t f a\n", + "------| -----\n", + "date | d \n", + "ticker| s g\n", + "size | h \n", + "price | f \n", + "'))" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kx.q.meta(db.trades)" + ] + }, + { + "cell_type": "markdown", + "id": "e75b07ae", + "metadata": {}, + "source": [ + "## Onboarding your next table\n", + "\n", + "Now that you have successfully set up one table, you may want to add a second table. We follow the same method as before and create the `quotes` table using the `create` method. In this example, the `quotes` table only contains data for `2020.01.03`." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b04c2f77", + "metadata": {}, + "outputs": [], + "source": [ + "quotes = kx.Table(data={\n", + " 'sym': kx.random.random(N, ['AAPL', 'GOOG', 'MSFT']),\n", + " 'open': kx.random.random(N, 10.0),\n", + " 'high': kx.random.random(N, 10.0),\n", + " 'low': kx.random.random(N, 10.0),\n", + " 'close': kx.random.random(N, 10.0)\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "6914a50e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing Database Partition 2020-01-03 to table quotes\n" + ] + } + ], + "source": [ + "db.create(quotes, 'quotes', date(2020, 1, 3), by_field = 'sym')" + ] + }, + { + "cell_type": "markdown", + "id": "87670793", + "metadata": {}, + "source": [ + "All tables within a database must contain the same partition structure. To ensure the new table can be accessed, the `quotes` table needs to exist in every partition within the database, even if there is no data for that partition. This is called backfilling data. For the partitions where the `quotes` table is missing, we use the `fill_database` method. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e6f873e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully filled missing tables to partition: :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.02\n", + "Successfully filled missing tables to partition: :/var/folders/l8/t7s11kcs02x3dchm9_m48mq80000gn/T/tmp2ts68edc/db/2020.01.01\n" + ] + } + ], + "source": [ + "db.fill_database()" + ] + }, + { + "cell_type": "markdown", + "id": "e41e8589", + "metadata": {}, + "source": [ + "Now that the database has resolved the missing tables within the partitions, we can view the new `quotes` table" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b3be6075", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymopenhighlowclose
02020.01.03AAPL8.2040260.91152013.9168649.813545
12020.01.03AAPL8.0927546.0195780.085131372.825277
22020.01.03AAPL1.4250438.8817194.2854617.820761
32020.01.03AAPL7.1727363.339855.9994033.010211
42020.01.03AAPL2.9741851.5593722.763565.182052
52020.01.03AAPL3.2007597.4850887.9288136.437041
62020.01.03AAPL7.7495995.5594440.33004049.424896
72020.01.03AAPL4.8859614.6774328.2883184.366883
82020.01.03AAPL7.4128915.0821899.2140367.900838
92020.01.03AAPL6.6258479.7921396.2088189.195079
102020.01.03AAPL2.0757975.3403210.40387090.7533655
112020.01.03AAPL4.7976428.3733174.981566.299731
122020.01.03AAPL0.86887651.9676163.3495734.094004
132020.01.03AAPL2.6841430.057673528.8781742.166685
142020.01.03AAPL3.1810934.6861130.89676137.39341
152020.01.03AAPL3.6302680.45638092.890256.428857
162020.01.03AAPL7.3424699.2984047.0985091.698009
172020.01.03AAPL1.2931448.1258347.2141845.946857
182020.01.03AAPL8.0513221.4461929.4361854.824975
192020.01.03AAPL1.0187811.2994011.181810.6091787
202020.01.03AAPL4.0029094.1157725.0362111.680549
212020.01.03AAPL0.98641044.750850.51407352.468647
222020.01.03AAPL8.3885616.1704051.0671532.034476
.....................
99992020.01.03MSFT2.8328181.4661713.4575455.985203
\n", + "

10,000 rows × 6 columns

" + ], + "text/plain": [ + "pykx.PartitionedTable(pykx.q('\n", + "date sym open high low close \n", + "---------------------------------------------------------\n", + "2020.01.03 AAPL 8.204026 0.9115201 3.916864 9.813545 \n", + "2020.01.03 AAPL 8.092754 6.019578 0.08513137 2.825277 \n", + "2020.01.03 AAPL 1.425043 8.881719 4.285461 7.820761 \n", + "2020.01.03 AAPL 7.172736 3.33985 5.999403 3.010211 \n", + "2020.01.03 AAPL 2.974185 1.559372 2.76356 5.182052 \n", + "2020.01.03 AAPL 3.200759 7.485088 7.928813 6.437041 \n", + "2020.01.03 AAPL 7.749599 5.559444 0.3300404 9.424896 \n", + "2020.01.03 AAPL 4.885961 4.677432 8.288318 4.366883 \n", + "2020.01.03 AAPL 7.412891 5.082189 9.214036 7.900838 \n", + "2020.01.03 AAPL 6.625847 9.792139 6.208818 9.195079 \n", + "2020.01.03 AAPL 2.075797 5.340321 0.4038709 0.7533655\n", + "2020.01.03 AAPL 4.797642 8.373317 4.98156 6.299731 \n", + "2020.01.03 AAPL 0.8688765 1.967616 3.349573 4.094004 \n", + "2020.01.03 AAPL 2.684143 0.05767352 8.878174 2.166685 \n", + "2020.01.03 AAPL 3.181093 4.686113 0.8967613 7.39341 \n", + "2020.01.03 AAPL 3.630268 0.4563809 2.89025 6.428857 \n", + "2020.01.03 AAPL 7.342469 9.298404 7.098509 1.698009 \n", + "2020.01.03 AAPL 1.293144 8.125834 7.214184 5.946857 \n", + "2020.01.03 AAPL 8.051322 1.446192 9.436185 4.824975 \n", + "2020.01.03 AAPL 1.018781 1.299401 1.18181 0.6091787\n", + "..\n", + "'))" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.quotes" + ] + }, + { + "cell_type": "markdown", + "id": "43366fab", + "metadata": {}, + "source": [ + "Finally, to view the amount of saved data you can count the number of rows per partition using `partition_count`" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "78b45d91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quotestrades
2020.01.010500425
2020.01.020499575
2020.01.031000010000
" + ], + "text/plain": [ + "pykx.Dictionary(pykx.q('\n", + " | quotes trades\n", + "----------| -------------\n", + "2020.01.01| 0 500425\n", + "2020.01.02| 0 499575\n", + "2020.01.03| 10000 10000 \n", + "'))" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.partition_count()" + ] + }, + { + "cell_type": "markdown", + "id": "b03cfb4b", + "metadata": {}, + "source": [ + "## Cleanup temporary database created" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f3883344", + "metadata": {}, + "outputs": [], + "source": [ + "tempdir.cleanup()" + ] + }, + { + "cell_type": "markdown", + "id": "90049e04", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/subscriber/archive.zip b/docs/examples/subscriber/archive.zip index 0e8323ee0f7d6668a00b863ed6f1013be3f859d5..655bb7dcd1dd44934bd136e21e5ce648be9c8ca2 100644 GIT binary patch literal 2374 zcma);S5(v48pZ#R5PC0CgM;(|L~3XPQ3!-$sG%uHXc0tO=sJ{O8Kid=2Lu%9C4}BP z4t)$F3?L9eV^ATKkv4em!&TRsd+$2yJKs4k`)TjJekd~#m=6E|CO{~{#G#%X(YZFcbQJ>{c&6*8#3*)(UZSDyzgP z5dWG$^NcdMGqa{BlIE){uw-4_6{M`LQq_*M%{#k7H0-aEKNlu&7M%s+VtscJgzx+!(}q)P^HSQuZXb5t;Qer21<$lx3Le>3xcsUl;^p~MVOB0%do=Sz zfzvsK>*t!*@@y&a^0+6c?9&vV9F=QQFY2z2vz3|f9#4cV3mg0Tu_J|%HZ>pu+f%EA z#sQK@H4J-}PH$*Jus@*J(Eoku2cCqzk;e%=B(kXNOAXHTS!MbOFHe5%l0hX?mB#Xr z{P3CV5j}{=tiAn|uZEDyj4q+z{7N}#-v-5HHb3mc>6O%zL(*yJ0sC7s1`0tYRabT+ z88{v`QYOZIRA*8a=`0)RA9`>zr@5QMhf-Olf`xmRM>g$%_^(J?3+WD-Nm1f7qzy z965#Krr&!uawT=IoQq0l 
zN1LX@h}=y4LWwuCy%sw_&b?*Pjq;0KUPoEav?7SFtR5MyT#FCoQ!2XDVHqK98_ z>FJCgE2Y~hNV>KTiIo2si8mICPXf)+V9Eh38cgQCnTo`t>M$0?1OntwAIYYS0Tf#k z!m14&w4X2~3o<93Mj89sbt|e1cg5Zab8{iV+yjo7A#3zno z?MrywSFwTIZ^iCu4!sTzI_e2|m*LvASHzy9M43YP!JxW&Fe8obQY>^|_}0LZtK=h3 znzdhl>CFa(`Su63y@M z!!F?zXJc^G-{J{X@fY%|&SHu!iWy!kw{5Lze(zhVZRbtq6W2e<(mo`KJ%E-a z@{SHZ+NNNIh4ua^(Z^)URkxkY|<_#K-*$fT-T=E>*jA@A<^$jZBn!`I?+TyqO z%7<9T=9jW5x-F;0zEm;C>G#Wn&WB;ZPP8jjF5(y=0HBzUYSwt~)gVfff*lw?Z9|fKw_dB^pX024vCGkO9>+ek)OCP|GwqbfyZTcBUcZgB#y-~WdK zl%mE^JNjD-LR{F( zv0Qd#VaK*}?8CY<-s|fNN@Tl=@Jw2~oa=Xw;55Gvw{8`t+HEKgy>sF-BD}de<`xq~rVUq<0x%XjM33ru=?6KL zn#Pm8E6*5rHp^L5Q*SWkBNlEbFi0+HnYRz#d1LrF!Q0gMayy+$oABJDqSntI%`!TY z5$?&6bV3$#n=cN^@#rIk{XJCm*4s-lVPm(l(rpv12iDTf@_0Nt9)yl228-S6<}=!1 zM48diLqY$94T2{)0RVI-QylyC^$kQY{2L;E0Ei#uzOUDB;&o*v8UiWXTpLMwT#?CMu*M9AircVHgZEj1f&FjEW;mb|RO?l09p~ zWLHR8D-#`iB%BjUS<0F2eeONgx%aux_xXH3-|s)~zh3Y6^TwEgz#;$u@B&+f>h=Ow zYYDxa0FcfN0DOQHK*76tknrjx4?9~v0K_N(;hFM=_#i4jzyW>>0)T&B^>!Bi;~E0Z zOZL!bLMnrBi=klBkSL4PDYWvEQ&K~qwp^nYW1oERSmmcBG`3;HLzX4P4{h7q+ui*M z;l=r2g=tB)Uog0Q$9G-uE*GCeT6@I*s^G7VdDK78P3N&dG+*6Uw1b_C8YZENCgR4k2w4TT*K;py=D$ZkPC^!@ztx6 zKs9wDz86&+HV5s6pY6l(sU%9j96nN1N6FGUd{9Kj-afRp@k8mLZ%){#pa0XQB9}q5 z`oh}nQV%fRB6SQbUsUd!BBUGF_0Sni*pi7K-Rp~vQ-V=Xp6#$cFMFsYc#LwiFZjgR@|M3K~H1F1K7Bh|(r z3Y7sx=L9lIHmnIjKb^bpufP~f%kvTFwMkg9CP-PSujcI2h1bG|R%tYwj4iuusn^B< zE1e@qHB^Mt4P4t)j+eF0^>bVmh7PoHdRVHJibJ|us3*qw-RG1R|1woj5adgM7>+-(vsfr;hv_nVOQJ+f)_#6&(C)QmTws4An z&%(^UoERfrQGVURS{sz(;ChxWef{LA!uDYGKF_8Pe((He4T?(==w8{`sSWuzQ{3*f zI67+SqI#TG659v(b+?As8SYUB#fc%ly+mrGZUp_yB;(=Q#@g3D+@k90B{g7tco-m>TZC+40GiD^rclTmpvxvMkG1?VA)=(g&!mN1@% z8+Mz1b}f<5c2q0_V~N%DP|^4@!7(dF+=)#T2kK}o4|iQUX)lV-&5AlwD#!o9cH$#% zycV}4Ps3t>>y!u)n)wppNc`d=D}sd(1e=CT$V&0~;^f4;)T+4+{z5D6Ork+v+(aqO z(MLN1H2HfQqx`xxYJ4Ex@A{ZVHKJX5zu9O1zGMtS`4`tSd;(zEo>7`qwBUY>GcP_t zg^*7lZn4v=F&M}y%7vv^dUl5k~Rs-le-r zm6TS`ZL@sd0~I@wJiXGFQa>u!ObyWfA=duYk|y~%PW|3mo4_8qEn8SO&tVq{y^Q$h zv08!FnKm$FMg0ET&EC_~v-+eZWS4PL6))ZRHnSCiSn<}F%e{U^OCiem*pf{7nwIuG 
zLj66kjLsYQ8*qhc+1F~F4^nrwU7|xCCF0gBhk{;HdI@>jYfb6Iwx*3ehS*WLykAB$ z42IQ*WAOnqm)9i!A~y%+W{*TTdqLRbnt%ZS_5-;tZh@Em-2csQrnVve+x(R0X*TV# z<7|GnRk$}?B}7w1LQFOFmkIf)!OXg7>uWRS!qpolNpre{#nwykQ1|Gf=yx+7p4$Es z*-F1*XE|BxeSE3W88+#Qas@kCKBO8|exKhC*ygD2kr$f`kHEH5RFevJfUAMAa}h+r!g9BXF^nC-=ztW5fy~}iq5k5cWSX_B`x`U;jtwSdM^nO zT=q$~iq#%-zHK*da*^yKYinSXseo7u2ue z%G#+PQOmPn%hKH)*O>EHNhYU{+WCv!e50?S2{B#qsk3kFJWzZe5$oOAS@2B5=fWdU zVnLLUz!$yu@m@t;7TDd9DWMiu6Yk@Oifw+?H#@f;BP3Axbnd(_qqayoyT;xRw$e8* zS^IW+bp*uRw2PMLIeM*M!C&_R7ku)Vd2@I1v_U{yErMP%0Z3d(AoUMTQfpqP=qEdA zqMIY)8bmhnU+p?n!qda9l#T|iquU(4y3d-H8Y|C6^%k0QX}MU8&%bC)=yJU&zYW5e zad66j{#QK^XV(S*;9yTd&Kq_9;u&kTry^G?o!rhQ`uZL)IatGh{TRu{6jKQTDNvBs7?^mnAc{ z#DuaZTUp`^4U#20cg{J_bMNUq_uPA*-}C(~fBfG6-uLrbTd=Sm0{{RV0MSs>%Q>ks z$s8;IfFBzGa10<0@VpV~@8xqH5rhgvdV2f#|8g1=YJZ6nz!J6m2Kn3E4B-YavCcET z{Jv#e>h>R33%xbue<%`o+x-!j$-|TAJ{ePiO(0ZC#0M>qY4+gh9x5gBTN(iY z4=g#Imf4*ykSjyH8T8e9AxGE`oL!6 zI*;U7g`7DG8ZSHMXYEqd56(zj$XGvT&}&0xJ`0Xb=DXnh-s4D(1JK07s053*RxGQk z^hn_B(Y_2k%0Vf&$-LZv$eMk%RV{`tlpnd7D9IBeJGuXCc8fj3Myy^X-$>Nq1&?KX zi7>DDr=lN0OUAMaw?iU$fECX`bR&&{f|Go>A!??8VKq-sw6sDWX316z5$Ngpxy;@^ zm7^UWTBQ|rV-8T-Kd^3qmk;I=* z;TU8Iz3Te5o`{7?75k*L#~U^`g3&DI=E5Se2l4w7hzRl%MG3sMI(E71bCJ(C5lqxT zwK+-WTH=I6A(D)FB&Sx8?j(XeLk^hseUvIp{ReJUR&1%D84N%-G}TZyb1VDwp~W7) zF)y<{UY9)2H1=h!7;-FBH=!1!n^4?DT{>7eU5GK5nYxX;6B-@16w4K$=+vqDK7so7 zA9~|eXAvVCC!?V)(@Ke%VEubX%y%~{BL}*tLB>{K%EZ_6YvAN?TmDQHtwE2TVSnKE zgv^caTa!$c3*l$k-Is!BhH0crZLqw?4@@iA9UCaQk0tz8-<>~LHE z$B%06Yv={o74w@BikeJNBN}$2EEPdk_I7>>(^-kR$=B(nDq^n!jOH zIb82ao0=)Xm>Pq$l~x-6TJcR*X;di6Gtz;TgxZh-CejVl6(*_07qat=^ zvBrG39mL5IKheeE?%W}yYBTO4;m=o-B~;N?(k zDW*pO<-7Qy%*AIen%h%VWB8kQQ_6j&rv_)L()^k_O`H-|?SA0koCyP_*7jjA%0SDg zS=~+(7za7!&Sk@7!ht}Mf}aggcv2TN6sA-w5OT8q2^kX!v(HNr;su|lgm0d!fbD@S z17LR?k}Qz@Hu-c-TbZ3@KTq$aN>YQ|IoVv4FF{wNub4&e@i44iXug|smq5&}9@krK zSsF{y(Y$gW9hi6SXn0_kr24See3@F%?sv4}Gg6elv2^?JTCZ8Wgy_o1kb{Bdf{X!C znuC8X!<0ivEk$_pMHBR-y^RL%f*^9D_F0NfB9KXwG_K;^9oAmCqQ3Hy(4lkpTTUYN 
zik7z+5oh9jM%mZ1FQ6irT68>&N@x^iJSOgy)VQ<8x1Zc07$!TTnr7bc_>#6zM4W3) z)Pjk^#gz*P&^lQQx5iI6=5b;=qNZCtp|otvp;A#o4R?_SGZm}%ftnU;8yP?iI5mt7 zhnhbSSY;04Vy+$bsXC{(II%ve?9iTDoSluy9lY<>pPkz;NPfzVHK+Mqv@3;IX?E=1 ze_wfkFy%XS8~73+yu$2Zz>~ss_BPw4MHz1$&o5^Z75uQ~FO;$8^Y7VVHHe+S9M?X- z*F9IkqPrR2MRVXNU1U}k_(7!{#5|NtstIs~>^zxGzIO?NsQDLRudk}Ug@3x24 zYTlmv(N-~9n3KD9CH#7j0A-PL-MMAM5U9WJr*$AXWAABo)K6?eV&2O!am_9PtDH~4 z50pz~H5bQ_6o9X>x*_h0<05=8+4DpPPhT2ZMP%q|&s6PWYoSNh(|Xi7#FvI(QWS~T z?izt6?eZzxpJ$6;z}*D#ehtQdIM%?4-U_CnI|VttR_dmt^E8DGO6e*gsH(mqa;wAU z1r^;P_lBM)>Wa`Yb+kZ!?Eq?HbLEzN>|vhxt=}uGwLso}KhsOzra>>qIn~ki za8`6ZFXH6^w!%Ydp8J{C=`9k|cRJ;Xq?zNXMHm&(;1HF#pvH!$Y|Z4|<&D6`oY-Vr zCp&KS?XDTON4nlO#mqQqizB|JAE1I^sc!|=K|e=DRLR~WRYp`$7y#z_S5(~ipV)<8 zA6MgPq&f1Av}z8ET1L4Ih(bTsxK!oOk1cY?N|&5TM+L~i($cS*9PTQ1Zab6Pa8+{9UA^D2n`+%y<#40YE|J0?$ebHI57I;05-jpFR0`;Mc@dS<)SR`!%`C?g z1^j_`clrlYf@y7b*nkL~y|5b*WYBnvCVf=Tvu&~IBi$*&Q5N`adC)|ByP4^Y2`3J) zzlhd97e!uHj_230p3XjUd??cgj7EEf$wy1}tre`bh$l`9up&m8#mX*tM=9lxDZMuJ zNtETj3T}t6Kc6}hr3LEZQr@KFb%paB3eh#JWdFYCD9r3&ueoKDjTkYio6DEbnnS}( z@S%@efl>GDDcX3x3OZ;q)t6`jpS$(EuB-^pD*9CjdQobMi&$wgR?<-P8rH2n+Rz>% zVEE=~HHIh`W@#0}U~o5GQUNCxg580^+!|)3Fczq+a` zOV@HzJ(?t1J2h!W9TL6DM!G$CS4+8_rC*$Wv9{HLwmQ0AcI*Nn!XdKCZub~0-p_3A zsXpMF;wK_&Y~i0b!gUP0vA=4Yd0Y(cG%)RRwNz^|H~*?c5}Ig(IQzA7Xn+@0iJ?NO z93N5AW`h%Y7^ORclvvOC;7#LQ&s+N^pYIGD%!p+jNs`1=P;mTXY-*j(ldslGvV z@J6g&{3d9wu>Y&(`y>4QqV86+3ZNbDW$?|udSA@}p0RSe(JdOJ8_MaOt@Q()km-hQt y<)69#XJrS5#eZ%A|DWQ|@$hHGC`PyVL-CKW_&YryBUk_c7RK6RoU2^Fz5WS>k$OG= literal 4798 zcmaJ_1z1#D*B(+*y1N-bL_tDwfI(V%5F{jq5*T1;5J4EYw9?%m-JoxUKf*TvPxS0LK^=vZ26c?1`LHjJ&9RnQLTZWoHF9GUA1~gG1+W+W7F~cJ@i% zZef}dFdnlqXT*JjpzGXA!C&W;h{yu)a z7&eENgF@2ctB(oLqajJ9VX9al8E2a$`0xc)x99(zAU-&E^Uc(kQywU)@Gq(|= zJEz5^oU_b!h*l5XmvI)JS#2WfAuzBQ=W$H*)EOIy-z9UCh1WX|@16xl1M%lS#tQ0( z3NdiC{A*K`u=3M$A>_|DW8CFym`ER@yUbXm3}l>hYfaL>O7m&DgxX6^l}^S@)_OZ; z>n9dPnXCKQWA@&&#q`154vvR*r`^WTBGalD*e9s1jFdX+l|IfDWs4&oR#$$8O46Fz ze7MsP8Cl1ev^57Ti-6YU7lW8@h>FqW>1Tb30OJU>sbyJ+6-98L_B=>4AZ@$dwVi>P 
z?}U$dQXIqHk#<6dAoh@HcElf)AT`)Rpd?*tv;-+IvG~7hQh!;CZ%YlXV2Dn%YY|5NnQ9MJa$%u@j>2j>B;IgUuX79%KyMR$eDk{fubV0g8rM z`z=RpVxGr??^fFEo?F8nWCr~ZWvMn*7q(-P2OJdb4i8x$JEV)Pt<2>1kx2bMN2f>K`Igm zhSwN0Iu~f~FuxSL8wbJ*?X-`I>zu#5%`31J+(J*~aP@)sz@#r&?Zr@M;)Z%<_<}hT zqtHELN9mO)*@C$pK*S}}A_@WQ^HNUShxs_qo>*D7_iY3Z_4Eqoi?eWP4P&UkEOFji z7ke~>tCKvMonfN=LRbatDqr|4%(GU13}co>e1xhI1&BbTH%sf55|V4Pa+ntAg6p;R z!g?44p=9+d5VL-pZ00yrxi(C)rxsthD`dDbHn^y6SnO!icD8bfqz?rau8^IF!-p7( zUoCEw!fAC{&2DR%02!#PFt0Y;C(B#XH6&a~&G2QL(Slp}gz`DC^D zxl0@CJBS+H0)wNCk2wxB^sV3`y~Q>X3QFhSSVz%jfu{;21|Emg!3F}+GTEcDK>?u0 zC96Z&cx%TO-R!i?A_YbymFSum7bfRr8=D7NsxwBCy#4AT^8AhAK@H`Ba(M-#l5ftd z1%;jW)Kr_)rB-ElXo^QE4cCfyvjX8n3!VZO3Fqw>cl+WcoibGL*@5ZYl|p1adEO^L zN)=*G=QcxPHCn7CX4PY@m(n4uZrG@*(rQ{u*7fuRYVqYXA7He~?XgF#x{eCmYHHw8 zVK#EJFluRR8OjMk6?EWZ-EC)Q=)7OFl#_Mw%MpxK%k;32ob7L-I*Rgg(f1dRLJTz_ zQFr@BE22mCZ5F`$DDe?6*KC{dIO=k*@?~D5c#NGyr*r>~PtB`UaQ>v=SJbcELX%J@y*7$>UfU1z z#o?Ly@KYh_3ksoKCi?f|?KERv2u5-P+h_FNWEtUaiX}~>JOZOe;$sQ4+A=D05er}+ zOxba>t%XdW}FGx^J=AqL)e_ZFG-*_2kgkZP#45(%Tw zVNiU)n$=jv(b|@k_6rqQR21dEJgXNRs2U8NiN94t;m1w%dIU=YjCBGU_~Q_5M2iq}Q%?gITk&YotvahBNFD2E zmbK1J>e(j-s8uqeXph4^HQFXnl1GV&(%)o705#GslN`0+M83&&!Yy4<&Xsa;Ln?<` zD+B|txf?fc*@ZRbOe&ALv}zFb1?aL$UWZr#kCf92+ly2xQH4H3*)h5B zb(gi{P{^+L*%^&QZ^BM#*`t>zJ4+UN-SuXBcVWz$uZSMD&36jh(49Sr@4#tW}X!#0a z>A@fpCz2d6lRu-G?Vi~h+@*wk-H~$oR^A2qiwbcJ!=xgL($VRDi+5mKJQ=*A*RoPn za96|F)d1b&B(c|IhmZ-k*$r!bC*Fd!Aqd zLjm8X8r~V}%q=8)PM~26J-std&yf>rUGE^6xF##fo2Y!&(@h(5BB0WF-Xd|@61YA~ zNnJ1WXxGNJ%Dx=vs~tXK!&=)&`<9TJfexB6R9en%z;h7)pe&Nnj|M{w4dkwSiu!4w z;8yorfmMMOeqFyF-~*b?Lv>XGU!_l0oNI>W|hf6|JZnQ|{#mkAnvFY+xyC`)A8CDBmJSLaI2wQ5W=f(--2Dlq?k$OqCW4 zRIoV5P@JvcZ917OTk5w`?f#?%x~Uu_2|$ z1d{G#NV@-!=H_Nb=5FSu&TuPxyDREz|4n^PVn&eh+ZVcqgalX)B=WoDq667?RJO+` zW_AzyrxfbS>NBlbx>EZR_$}t}y;2u0p9@v&R8RW}wzkphS=^vBVfZ%1;y;k_)JxrK zi?DM;Zj0!t()SrO`qdV>D%Q#2X`?z&WLbpXXRk_|TQlYNn&c`>WQnY@iC#rk&3yX7<2dP~aDP<8M*^wcu?a4s-j{?bdHP zf;|;(Y`)9Y>q5QXNAguK+rHM)+ZXASZsM;+L5@~!zP>_^bN^4e)P(;W#qCT1QySI2<6G))#7+h{`RBn5@r+olX6p> 
zB)WWgvCONq;{TCW1`bXi;CD)NJ~}2(@dRFwzNkPGY2tq*ZS3Z&vXWP&gRo z246tlqX~ax5bJ;Fh92>d+YR+$Ty3-Khv|+ZpTVV*RXR6^ar)%pwds2t90iN`Sg@@74i7rvD^9mc8SCV(mqF0gm z z$@UmnEcY^2_ibupUu9tSx5=o^8@3M1ZB^J(DWAxbx!ZMMS#!T(dvL;?8gXMA+rF>A z!$);sSuxFOJ-1H68<-}Qih;Z*suBDd#%$NA|W-v3d1e>r;xZb^_^*fj9n1f~ z|DAZc#xKMAhm`uKTfZwp*Jf$B|5B3v3GuspyM{PIo;v-es$H!l2D0c~sXS~z0&=Yg I2(NzqABI2uEdT%j diff --git a/docs/examples/threaded_execution/asyncio_threading.py b/docs/examples/threaded_execution/asyncio_threading.py new file mode 100644 index 0000000..9d96f06 --- /dev/null +++ b/docs/examples/threaded_execution/asyncio_threading.py @@ -0,0 +1,57 @@ +import os +import asyncio +os.environ['PYKX_THREADING'] = '1' +os.environ['PYKX_BETA_FEATURES'] = '1' + +import pykx as kx + + +table = kx.q('table: ([] a: 10?10; b: 10?10)') + + +def assert_result(res): + # assert message from q process has the correct schema to be appended to the table + return type(res) is kx.LongVector and len(res) == 2 + + +async def upsert_threaded(q, calls): + counter = calls + while True: + result = await q.poll_recv_async() + if assert_result(result): + kx.q.upsert('table', result) + result = None + counter -= 1 + if counter <= 0: + break + + +async def main(): + N = 20 + calls = 1000 + conns = [await kx.RawQConnection(port=5001, event_loop=asyncio.get_event_loop()) for _ in range(N)] # noqa + main_q_con = kx.SyncQConnection(port=5001) + print('===== Initial Table =====') + print(kx.q('table')) + print('===== Initial Table =====') + # Set the variable py_server on the q process pointing towards this processes IPC connection + # We use neg to ensure the messages are sent async so no reply is expected from this process + [await conns[i](f'py_server{i}: neg .z.w') for i in range(N)] + query = 'send_data: {' + for i in range(N): + query += f'py_server{i}[2?100];' + query = query[:-1] + '}' + + await conns[0](query) + + tasks = [asyncio.create_task(upsert_threaded(conns[i], calls)) for i in range(N)] + main_q_con(f'do[{calls}; 
send_data[]]', wait=False)
+    [await t for t in tasks]
+    print(kx.q('table'))
+
+
+if __name__ == '__main__':
+    try:
+        asyncio.run(main())
+    finally:
+        kx.shutdown_thread()
diff --git a/docs/examples/threaded_execution/threading.md b/docs/examples/threaded_execution/threading.md
new file mode 100644
index 0000000..c3dbef4
--- /dev/null
+++ b/docs/examples/threaded_execution/threading.md
@@ -0,0 +1,96 @@
+# PyKX Calling into q from multiple threads
+
+The purpose of this example is to provide a quickstart for setting up a python process using `PyKX`
+to call into `EmbeddedQ` from multiple threads.
+
+To follow along with this example please feel free to download this
+zip archive that contains a copy of the python scripts and this
+writeup.
+
+## Quickstart
+
+This example creates a python process that creates multiple tasks/threads that subscribe to a `q`
+process over IPC and upon receiving a new row upsert it to a local table. There are 2 scripts
+included: `asyncio_threading.py` and `threads.py`, the first uses asyncio tasks running on
+separate threads and the second example uses the python `threading` library directly to spawn
+threads.
+
+
+### Running the example
+
+```bash
+$ python asyncio_threading.py
+// or
+$ python threads.py
+```
+
+### Outcome
+
+The initial table will be printed upon starting the program, once all the threads/tasks have
+upserted all of the rows they have received to the table the final table will be printed.
+
+```
+$ python asyncio_threading.py
+===== Initial Table =====
+a b
+---
+4 8
+9 1
+2 9
+7 5
+0 4
+1 6
+9 6
+2 1
+1 8
+8 5
+===== Initial Table =====
+a b
+-----
+4 8
+9 1
+2 9
+7 5
+0 4
+1 6
+9 6
+2 1
+1 8
+8 5
+7 63
+11 13
+80 89
+43 50
+96 35
+35 83
+28 31
+96 12
+83 16
+77 33
+..
+```
+
+### Important Note on usage
+
+Since using `PYKX_THREADING` creates a background thread to run the calls into `q`, the
+background thread must be shut down when finished. 
The easiest way to ensure this is done is by using +a `try` - `finally` block around the entrypoint to your script. This will ensure that even in the +event of an error the background thread will still be shutdown correctly so python can exit. + +``` +import os +os.environ['PYKX_THREADING'] = '1' +os.environ['PYKX_BETA_FEATURES'] = '1' +import pykx as kx + +def main(): + ... + + +if __name__ == '__main__': + try: + main() + finally: + # Must shutdown the background thread to properly exit + kx.shutdown_thread() +``` diff --git a/docs/examples/threaded_execution/threads.py b/docs/examples/threaded_execution/threads.py new file mode 100644 index 0000000..f93689e --- /dev/null +++ b/docs/examples/threaded_execution/threads.py @@ -0,0 +1,60 @@ +import os +import asyncio +from threading import Thread +os.environ['PYKX_THREADING'] = '1' +os.environ['PYKX_BETA_FEATURES'] = '1' + +import pykx as kx + + +table = kx.q('table: ([] a: 10?10; b: 10?10)') + + +def assert_result(res): + # assert message from q process has the correct schema to be appended to the table + return type(res) is kx.LongVector and len(res) == 2 + + +def upsert_threaded(q, calls): + counter = calls + while True: + result = q.poll_recv() + if result is not None and assert_result(result): + kx.q.upsert('table', result) + result = None + counter -= 1 + if counter <= 0: + break + return 0 + + +async def main(): + N = 20 + calls = 1000 + conns = [await kx.RawQConnection(port=5001, event_loop=asyncio.get_event_loop()) for _ in range(N)] # noqa + main_q_con = kx.SyncQConnection(port=5001) + print('===== Initial Table =====') + print(kx.q('table')) + print('===== Initial Table =====') + # Set the variable py_server on the q process pointing towards this processes IPC connection + # We use neg to ensure the messages are sent async so no reply is expected from this process + [await conns[i](f'py_server{i}: neg .z.w') for i in range(N)] + query = 'send_data: {' + for i in range(N): + query += 
f'py_server{i}[2?100];' + query = query[:-1] + '}' + + await conns[0](query) + + tasks = [Thread(target=upsert_threaded, args=[conns[i], calls]) for i in range(N)] + [t.start() for t in tasks] + main_q_con(f'do[{calls}; send_data[]]', wait=False) + [t.join() for t in tasks] + print(kx.q('table')) + + +if __name__ == '__main__': + try: + asyncio.run(main()) + finally: + kx.shutdown_thread() diff --git a/docs/extras/known_issues.md b/docs/extras/known_issues.md index c60cbf6..b16d656 100644 --- a/docs/extras/known_issues.md +++ b/docs/extras/known_issues.md @@ -9,3 +9,14 @@ use a `MonthVector` or a `DayVector`. In the scenario that it is not possible to determine the expected type a warning will be raised and the `DayVector` type will be used as a default. +- `None` and `pykx.Identity(pykx.q('::'))` do not pass through to single argument Python functions set under q. See [here](../pykx-under-q/known_issues.md#default-parameter). + + ``` + >>> def func(n=2): + ... return n + ... + >>> kx.q('func', None) + pykx.LongAtom(pykx.q('2')) + >>> kx.q('func', kx.q('::')) + pykx.LongAtom(pykx.q('2')) + ``` diff --git a/docs/faq.md b/docs/faq.md index 977c223..b639e74 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -1,4 +1,9 @@ -# FAQ +# FAQs + +## Known Issues + +* [PyKX known issues](extras/known_issues.md) +* [PyKX under q known issues](pykx-under-q/known_issues.md) ## How to work around the `'cores` licensing error? @@ -11,22 +16,52 @@ This error indicates your license is limited to a given number of cores but PyKX tried to use more cores than the license allows. 
- On Linux you can use `taskset` to limit the number of cores used by the python process and likewise PyKX and EmbeddedQ: -``` -# Example to limit python to the 4 first cores on a 8 cores CPU -$ taskset -c 0-3 python -``` + + ```bash + # Example to limit python to the 4 first cores on a 8 cores CPU + $ taskset -c 0-3 python + ``` - You can also do this in python before importing PyKX (Linux only): -``` ->>> import os ->>> os.sched_setaffinity(0, [0, 1, 2, 3]) ->>> import pykx as kx ->>> kx.q('til 10') -pykx.LongVector(pykx.q('0 1 2 3 4 5 6 7 8 9')) -``` + + ```bash + >>> import os + >>> os.sched_setaffinity(0, [0, 1, 2, 3]) + >>> import pykx as kx + >>> kx.q('til 10') + pykx.LongVector(pykx.q('0 1 2 3 4 5 6 7 8 9')) + ``` - On Windows you can use the `start` command with its `/affinity` argument (see: `> help start`): + + ```bat + > start /affinity f python + ``` + + (above, `0xf = 00001111b`, so the python process will only use the four cores for which the mask bits are equal to 1) + +## How does PyKX determine the license that is used? + +The following outlines the paths searched for when loading PyKX + +1. Search for `kx.lic`, `kc.lic` and `k4.lic` license files in this order within the following locations + 1. Current working directory + 1. Location defined by environment variable `QLIC` if set + 1. Location defined by environment variable `QHOME` if set +2. If a license is not found use the following environment variables (if they are set) to install and make use of a license + 1. `KDB_LICENSE_B64` pointing to a base64 encoded version of a `kc.lic` license + 1. `KDB_K4LICENSE_B64` pointing to a base64 encoded version of a `k4.lic` license +3. If a license has not been located according to the above search you will be guided to install a license following a prompt based license installation walkthrough. + +## Can I use PyKX in a subprocess? + +Yes, however doing so requires some considerations. 
To ensure that PyKX is initialized in a clean environment it is suggested that the creation of subprocesses reliant on PyKX should be done within a code block making use of the `kx.PyKXReimport` functionality as follows: + +```python +import pykx as kx +import subprocess +with kx.PyKXReimport(): + subprocess.Popen(['python', 'file.py']) # Run Python with a file that imports PyKX ``` -> start /affinity f python -``` -(above, 0xf = 00001111b, so the python process will only use the four cores for which the mask bits are equal to 1) + +Failure to use this functionality can result in segmentation faults as noted in the troubleshooting guide [here](troubleshooting.md). For more information on the `PyKXReimport` functionality see its API documentation [here](api/reimporting.md). diff --git a/docs/getting-started/PyKX Introduction Notebook.ipynb b/docs/getting-started/PyKX Introduction Notebook.ipynb new file mode 100644 index 0000000..1623b7e --- /dev/null +++ b/docs/getting-started/PyKX Introduction Notebook.ipynb @@ -0,0 +1,1096 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyKX Introduction Notebook\n", + "\n", + "The purpose of this notebook is to provide an introduction to the capabilities and functionality made available to you with PyKX.\n", + "\n", + "To follow along please download this notebook using the following 'link.'\n", + "\n", + "This Notebook is broken into the following sections\n", + "\n", + "1. [How to import PyKX](#How-to-import-Pykx)\n", + "1. [The basic data structures of PyKX](#The-basic-data-structures-of-PyKX)\n", + "1. [Accessing and creating PyKX objects](#Accessing-and-creating-PyKX-objects)\n", + "1. 
[Running analytics on objects in PyKX](#Running-analytics-on-objects-in-PyKX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Welcome to PyKX!\n", + "\n", + "PyKX is a Python library built and maintained for interfacing seamlessly with the worlds fastest time-series database technology kdb+ and it's underlying vector programming language q.\n", + "\n", + "It's aim is to provide you and all Python data-engineers and data-scientist with an interface to efficiently apply analytics on large volumes of on-disk and in-memory data, in a fraction of the time of competitor libraries.\n", + "\n", + "## How to import PyKX\n", + "\n", + "To access PyKX and it's functions import it in your Python code as follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": ["hide_code"] + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['PYKX_Q_LOADED_MARKER'] = '' # Only used here for running Notebook under mkdocs-jupyter during document generation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + }, + "outputs": [], + "source": [ + "import pykx as kx\n", + "kx.q.system.console_size = [10, 80]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The shortening of the import name to `kx` is done for readability of code that uses PyKX and is the intended standard for the library. As such we recommend that you always use `import pykx as kx` when using the library.\n", + "\n", + "Below we load additional libraries used through this notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The basic data structures of PyKX\n", + "\n", + "Central to your interaction with PyKX are the various data types that are supported by the library, fundamentally PyKX is built atop a fully featured functional programming language `q` which provides small footprint data structures that can be used in analytic calculations and the creation of highly performant databases. The types we show below are generated from Python equivalent types but as you will see through this notebook \n", + "\n", + "In this section we will describe the basic elements which you will come in contact with as you traverse the library and explain why/how they are different.\n", + "\n", + "### PyKX Atomic Types\n", + "\n", + "In PyKX an atom denotes a single irreducible value of a specific type, for example you may come across `pykx.FloatAtom` or `pykx.DateAtom` objects generated as follows which may have been generated as follows from an equivalent Pythonic representation. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.FloatAtom(1.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "kx.DateAtom(date(2020, 1, 1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyKX Vector Types\n", + "\n", + "Similar to atoms, vectors are a data structure composed of a collection of multiple elements of a single specified type. 
These objects in PyKX along with lists described below form the basis for the majority of the other important data structures that you will encounter including dictionaries and tables.\n", + "\n", + "Typed vector objects provide significant benefits when it comes to the applications of analytics over Python lists for example. Similar to Numpy, PyKX gains from the underlying speed of it's analytic engine when operating on these strictly typed objects.\n", + "\n", + "Vector type objects are always 1-D and as such are/can be indexed along a single axis.\n", + "\n", + "In the following example we are creating PyKX vectors from common Python equivalent `numpy` and `pandas` objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.IntVector(np.array([1, 2, 3, 4], dtype=np.int32))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.toq(pd.Series([1, 2, 3, 4]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyKX Lists\n", + "\n", + "A `List` in PyKX can loosely be described as an untyped vector object. Unlike vectors which are optimised for the performance of analytics, lists are more commonly used for storing reference information or matrix data.\n", + "\n", + "Unlike vector objects which are by definition 1-D in shape, lists can be ragged N-Dimensional objects. This makes them useful for the storage of some complex data structures but limits their performance when dealing with data-access/data modification tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.List([[1, 2, 3], [1.0, 1.1, 1.2], ['a', 'b', 'c']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyKX Dictionaries\n", + "\n", + "A dictionary in PyKX is defined as a mapping between a direct key-value mapping, the list of keys and values to which they are associated must have the same count. While it can be considered as a key-value pair, it is physically stored as a pair of lists." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(kx.Dictionary({'x': [1, 2, 3], 'x1': np.array([1, 2, 3])}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyKX Tables\n", + "\n", + "Tables in PyKX are a first-class typed entity which live in memory. They can be fundamentally described as a collection of named columns implemented as a dictionary. This mapping construct means that tables in PyKX are column-oriented which makes analytic operations on specified columns much faster than would be the case for a relational database equivalent.\n", + "\n", + "Tables in PyKX come in many forms but the key table types are as follows\n", + "\n", + "- `pykx.Table` \n", + "- `pykx.KeyedTable`\n", + "- `pykx.SplayedTable`\n", + "- `pykx.PartitionedTable`\n", + "\n", + "In this section we will deal only with the first two of these which constitute specifically the in-memory data table types. 
As will be discussed in later sections `Splayed` and `Partitioned` tables are memory-mapped on-disk data structures, these are derivations of the `pykx.Table` and `pykx.KeyedTable` type objects.\n", + "\n", + "#### `pykx.Table`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(kx.Table([[1, 2, 'a'], [2, 3, 'b'], [3, 4, 'c']], columns = ['col1', 'col2', 'col3']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(kx.Table(data = {'col1': [1, 2, 3], 'col2': [2 , 3, 4], 'col3': ['a', 'b', 'c']}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `pykx.KeyedTable`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.Table(data = {'x': [1, 2, 3], 'x1': [2, 3, 4], 'x2': ['a', 'b', 'c']}).set_index(['x'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other Data Types\n", + "\n", + "The above types outline the majority of the important type structures in PyKX but there are many others which you will encounter as you use the library, below we have outlined some of the important ones that you will run into through the rest of this notebook.\n", + "\n", + "#### `pykx.Lambda`\n", + "\n", + "A `pykx.Lambda` is the most basic kind of function within PyKX. They take between 0 and 8 parameters and are the building blocks for most analytics written by users when interacting with data from PyKX." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pykx_lambda = kx.q('{x+y}')\n", + "type(pykx_lambda)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pykx_lambda(1, 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `pykx.Projection`\n", + "\n", + "Similar to [functools.partial](https://docs.python.org/3/library/functools.html#functools.partial), functions in PyKX can have some of their parameters fixed in advance, resulting in a new function, which is called a projection. When this projection is called, the fixed parameters are no longer required, and cannot be provided.\n", + "\n", + "If the original function had `n` total parameters, and it had `m` provided, the result would be a function (projection) that requires a user to input `n-m` parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "projection = kx.q('{x+y}')(1)\n", + "projection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "projection(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Accessing and creating PyKX objects\n", + "\n", + "Now that we have seen some of the PyKX object types that you will encounter, practically speaking how will they be created in real-world scenarios?\n", + "\n", + "### Creating PyKX objects from Pythonic data types\n", + "\n", + "One of the most common ways that PyKX data is generated is through conversions from equivalent Pythonic data types. 
PyKX natively supports conversions to and from the following common Python data formats.\n", + "\n", + "- Python\n", + "- Numpy\n", + "- Pandas\n", + "- PyArrow\n", + "\n", + "In each of the above cases generation of PyKX objects is facilitated through the use of the `kx.toq` PyKX function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydict = {'a': [1, 2, 3], 'b': ['a', 'b', 'c'], 'c': 2}\n", + "kx.toq(pydict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nparray = np.array([1, 2, 3, 4], dtype = np.int32)\n", + "kx.toq(nparray)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdframe = pd.DataFrame(data = {'a':[1, 2, 3], 'b': ['a', 'b', 'c']})\n", + "kx.toq(pdframe)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Random data generation\n", + "\n", + "PyKX provides users with a module for the creation of random data of user specified PyKX types or their equivalent Python types. 
The creation of random data is useful in prototyping analytics and is used extensively within our documentation when creating test examples.\n", + "\n", + "As a first example you can generate a list of 1,000,000 random floating point values between 0 and 1 as follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.random(1000000, 1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If instead you wish to choose values randomly from a list, this can be facilitated by using the list as the second argument to your function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.random(5, [kx.LongAtom(1), ['a', 'b', 'c'], np.array([1.1, 1.2, 1.3])])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Random data does not only come in 1-Dimensional forms however and modifications to the first argument to be a list allow you to create multi-Dimensional PyKX Lists. The below examples are additionally using a PyKX trick where nulls/infinities can be used to generate random data across the full allowable range" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.random([2, 5], kx.GUIDAtom.null)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.random([2, 3, 4], kx.IntAtom.inf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, users can set the seed for the random data generation explicitly allowing users to have consistency over the generated objects. 
This can be completed globally or for individual function calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.seed(10)\n", + "kx.random.random(10, 2.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.random.random(10, 2.0, seed = 10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Running q code to generate data\n", + "\n", + "As mentioned in the introduction PyKX provides an entrypoint to the vector programming language q, as such users of PyKX can execute q code directly via PyKX within a Python session. This is facilitated through use of calls to `kx.q`.\n", + "\n", + "Create some q data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q('0 1 2 3 4')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q('([idx:desc til 5]col1:til 5;col2:5?1f;col3:5?`2)')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apply arguments to a user specified function `x+y`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q('{x+y}', kx.LongAtom(1), kx.LongAtom(2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read data from a CSV file\n", + "\n", + "A lot of data that you run into for data analysis tasks comes in the form of CSV files, PyKX similar to Pandas provides a CSV reader called via `kx.q.read.csv`, in the following cell we will create a CSV to be read in using PyKX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "with open('pykx.csv', 'w', newline='') as file:\n", + " writer = csv.writer(file)\n", + " field = [\"name\", \"age\", \"height\", 
\"country\"]\n", + " \n", + " writer.writerow(field)\n", + " writer.writerow([\"Oladele Damilola\", \"40\", \"180.0\", \"Nigeria\"])\n", + " writer.writerow([\"Alina Hricko\", \"23\", \"179.2\", \"Ukraine\"])\n", + " writer.writerow([\"Isabel Walter\", \"50\", \"179.5\", \"United Kingdom\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.read.csv('pykx.csv', types = {'age': kx.LongAtom, 'country': kx.SymbolAtom})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.remove('pykx.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Querying external Processes via IPC\n", + "\n", + "One of the most common usage patterns in organisations with access to data in kdb+/q you will encounter is to query this data from an external server process infrastructure. In the example below we assume that you have q installed in addition to PyKX, see [here](https://kx.com/kdb-insights-personal-edition-license-download/) to install q alongside the license access for PyKX.\n", + "\n", + "First we set up a q/kdb+ server setting it on port 5050 and populating it with some data in the form of a table `tab`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import time\n", + "\n", + "try:\n", + " proc = subprocess.Popen(\n", + " ('q', '-p', '5000'),\n", + " stdin=subprocess.PIPE,\n", + " stdout=subprocess.DEVNULL,\n", + " stderr=subprocess.DEVNULL,\n", + " )\n", + " time.sleep(2)\n", + "except:\n", + " raise kx.QError('Unable to create q process on port 5000')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once a q process is available you can establish a connection to it for synchronous query execution as follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "conn = kx.SyncQConnection(port = 5000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now run q commands against the q server" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn('tab:([]col1:100?`a`b`c;col2:100?1f;col3:100?0Ng)')\n", + "conn('select from tab where col1=`a')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or use the PyKX query API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.qsql.select('tab', where=['col1=`a', 'col2<0.3'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or use PyKX's context interface to run SQL server side if it's available to you" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn('\\l s.k_')\n", + "conn.sql('SELECT * FROM tab where col2>=0.5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally the q server used for this demonstration can be shut down" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "proc.stdin.close()\n", + "proc.kill()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running analytics on objects in PyKX\n", + "\n", + "Like many Python libraries including Numpy and Pandas PyKX provides a number of ways that it's data can be used with analytics defined internal to the library and which you have self generated.\n", + "\n", + "### Using in-built methods on PyKX Vectors\n", + "\n", + "When you are interacting with PyKX Vectors you may wish to gain insights into these objects through the application of basic analytics such as calculation of the 
`mean`/`median`/`mode` of the vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "q_vector = kx.random.random(1000, 10.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "q_vector.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "q_vector.max()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above is useful for basic analysis but will not be sufficient for more bespoke analytics on these vectors, to allow you more control over the analytics run you can also use the `apply` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def bespoke_function(x, y):\n", + " return x*y\n", + "\n", + "q_vector.apply(bespoke_function, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using in-built methods on PyKX Tables\n", + "\n", + "In addition to the vector processing capabilities of PyKX your ability to operate on Tabular structures is also important. 
Highlighted in greater depth within the Pandas-Like API documentation [here](../user-guide/advanced/Pandas_API.ipynb) these methods allow you to apply functions and gain insights into your data in a way that is familiar.\n", + "\n", + "In the below example you will use combinations of the most commonly used elements of this Table API operating on the following table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "N = 1000000\n", + "example_table = kx.Table(data = {\n", + " 'sym' : kx.random.random(N, ['a', 'b', 'c']),\n", + " 'col1' : kx.random.random(N, 10.0),\n", + " 'col2' : kx.random.random(N, 20)\n", + " }\n", + ")\n", + "example_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can search for and filter data within your tables using `loc` similarly to how this is achieved by Pandas as follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_table.loc[example_table['sym'] == 'a']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This behavior also is incorporated when retrieving data from a table through the `__get__` method as you can see here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_table[example_table['sym'] == 'b']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can additionally set the index columns of the table, when dealing with PyKX tables this converts the table from a `pykx.Table` object to a `pykx.KeyedTable` object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_table.set_index('sym')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additional to basic data manipulation such as index setting you also get access to analytic 
capabilities such as the application of basic data manipulation operations such as `mean` and `median` as demonstrated here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('mean:')\n", + "print(example_table.mean(numeric_only = True))\n", + "\n", + "print('median:')\n", + "print(example_table.median(numeric_only = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can make use of the `groupby` method which groups the PyKX tabular data which can then be used for analytic application.\n", + "\n", + "In your first example let's start by grouping the dataset based on the `sym` column and then calculating the `mean` for each column based on their `sym`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "example_table.groupby('sym').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an extension to the above groupby you can now consider a more complex example which is making use of `numpy` to run some calculations on the PyKX data, you will see later that this can be simplified further in this specific use-case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def apply_func(x):\n", + " nparray = x.np()\n", + " return np.sqrt(nparray).mean()\n", + "\n", + "example_table.groupby('sym').apply(apply_func)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time-series specific joining of data can be completed using `merge_asof` joins. 
In this example a number of tables with temporal information namely a `trades` and `quotes` table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trades = kx.Table(data={\n", + " \"time\": [\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", + " ],\n", + " \"ticker\": [\n", + " \"GOOG\",\n", + " \"MSFT\",\n", + " \"MSFT\",\n", + " \"MSFT\",\n", + " \"GOOG\",\n", + " \"AAPL\",\n", + " \"GOOG\",\n", + " \"MSFT\"\n", + " ],\n", + " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],\n", + " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", + "})\n", + "quotes = kx.Table(data={\n", + " \"time\": [\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", + " ],\n", + " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", + " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", + " \"quantity\": [75, 155, 100, 100, 100]\n", + "})\n", + "\n", + "print('trades:')\n", + "display(trades)\n", + "print('quotes:')\n", + "display(quotes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When applying the asof join you can additionally used named arguments to ensure that it is possible to make a distinction between the tables that the columns originate. 
In this case suffixing with `_trades` and `_quotes`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trades.merge_asof(quotes, on='time', suffixes=('_trades', '_quotes'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using PyKX/q native functions\n", + "\n", + "While use of the Pandas-Like API and methods provided off PyKX Vectors provides an effective method of applying analytics on PyKX data the most efficient and performant way you can run analytics on your data is through the use of the PyKX/q primitives which are available through the `kx.q` module.\n", + "\n", + "These include functionality for the calculation of moving averages, application of asof/window joins, column reversal etc. A full list of the available functions and some examples of their usage can be found [here](../api/pykx-execution/q.md).\n", + "\n", + "Here are a few examples of usage of how you can use these functions, broken into sections for convenience\n", + "\n", + "#### Mathematical functions\n", + "\n", + "##### mavg\n", + "\n", + "Calculate a series of average values across a list using a rolling window" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.mavg(10, kx.random.random(10000, 2.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### cor\n", + "\n", + "Calculate the correlation between two lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.cor([1, 2, 3], [2, 3, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.cor(kx.random.random(100, 1.0), kx.random.random(100, 1.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### prds\n", + "\n", + "Calculate the cumulative product across a supplied list" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.prds([1, 2, 3, 4, 5])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Iteration functions\n", + "\n", + "##### each\n", + "\n", + "Supplied both as a standalone primitive and as a method for PyKX Lambdas `each` allows you to pass individual elements of a PyKX object to a function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.each(kx.q('{prd x}'), kx.random.random([5, 5], 10.0, seed=10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q('{prd x}').each(kx.random.random([5, 5], 10.0, seed=10))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Table functions\n", + "\n", + "##### meta\n", + "\n", + "Retrieval of metadata information about a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qtab = kx.Table(data = {\n", + " 'x' : kx.random.random(1000, ['a', 'b', 'c']).grouped(),\n", + " 'y' : kx.random.random(1000, 1.0),\n", + " 'z' : kx.random.random(1000, kx.TimestampAtom.inf)\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.meta(qtab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### xasc\n", + "\n", + "Sort the contents of a specified column in ascending order" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kx.q.xasc('z', qtab)" + ] + } + ], + "metadata": { + "file_extension": ".py()", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
"nbconvert_exporter": "python",
Set an environment variable `KDB_LICENSE_B64` on your computer with the value copied in step 2 (instructions for setting environment variables on PyKX supported operating systems can be found [here](https://chlee.co/how-to-setup-environment-variables-for-windows-mac-and-linux/)).
- [Pandas](https://pypi.org/project/pandas) is used by PyKX when converting PyKX data to Pandas Series/DataFrame equivalent objects; additionally, when converting data to PyArrow data formats as supported by the optional dependencies below, Pandas is used as an intermediary data format.
+The following provides a breakdown of how these libraries are used within PyKX + +- [PyArrow](https://pypi.org/project/pyarrow) is used by PyKX for the conversion of PyKX object to and from their PyArrow equivalent table/array objects. +- [find-libpython](https://pypi.org/project/find-libpython) can be used by developers using PyKX to source the `libpython.{so|dll|dylib}` file required by [PyKX under q](../pykx-under-q/intro.md). + ### Optional Non-Python Dependencies - `libssl` for TLS on [IPC connections](../api/ipc.md). +- `libpthread` on Linux/MacOS when using the `PYKX_THREADING` environment variable. ### Windows Dependencies diff --git a/docs/getting-started/q_magic_command.ipynb b/docs/getting-started/q_magic_command.ipynb index 9411f67..aa2c079 100644 --- a/docs/getting-started/q_magic_command.ipynb +++ b/docs/getting-started/q_magic_command.ipynb @@ -1,5 +1,17 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": ["hide_code"] + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['PYKX_Q_LOADED_MARKER'] = '' # Only used here for running Notebook under mkdocs-jupyter during document generation.\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,8 +19,6 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "os.environ['PYKX_Q_LOADED_MARKER'] = '' # Only used here for running Notebook under mkdocs-jupyter during document generation.\n", "import pykx as kx" ] }, @@ -61,12 +71,24 @@ ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`print` is the default method called on returned objects. 
To use `display` you can instead pass `--display`" - ] - }, + "cell_type": "markdown", + "id": "89ec26e4", + "metadata": {}, + "source": [ + "#### Execution options\n", + "\n", + "Execution options can also be included after `%%q`.\n", + "\n", + "Here is the list of currently supported execution options.\n", + "\n", + "```\n", + "--debug: prints the q backtrace before raising a QError\n", + " if the cell errors\n", + "--display: calls display rather than the default print\n", + " on returned objects\n", + "```\n" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index 233b9f4..aebc930 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -50,26 +50,6 @@ x x1 ')) ``` -### Creation of PyKX objects using q - -Generation of PyKX objects using q can be completed through calling `kx.q` - -```python ->>> kx.q('10 20 30') -pykx.LongVector(pykx.q('10 20 30')) - ->>> kx.q('([]5?1f;5?`4;5?0Ng)') -pykx.Table(pykx.q(' -x x1 x2 ---------------------------------------------------- -0.439081 ncej 8c6b8b64-6815-6084-0a3e-178401251b68 -0.5759051 jogn 5ae7962d-49f2-404d-5aec-f7c8abbae288 -0.5919004 ciha 5a580fb6-656b-5e69-d445-417ebfe71994 -0.8481567 hkpb ddb87915-b672-2c32-a6cf-296061671e9d -0.389056 aeaj 580d8c87-e557-0db1-3a19-cb3a44d623b1 -')) -``` - ### Creation of PyKX objects from Python data types Generation of PyKX objects from Python, Numpy, Pandas and PyArrow objects can be completed as follows using the `kx.toq` method. 
@@ -113,6 +93,26 @@ col1 col2 ')) ``` +### Creation of PyKX objects using q + +Generation of PyKX objects using q can be completed through calling `kx.q` + +```python +>>> kx.q('10 20 30') +pykx.LongVector(pykx.q('10 20 30')) + +>>> kx.q('([]5?1f;5?`4;5?0Ng)') +pykx.Table(pykx.q(' +x x1 x2 +--------------------------------------------------- +0.439081 ncej 8c6b8b64-6815-6084-0a3e-178401251b68 +0.5759051 jogn 5ae7962d-49f2-404d-5aec-f7c8abbae288 +0.5919004 ciha 5a580fb6-656b-5e69-d445-417ebfe71994 +0.8481567 hkpb ddb87915-b672-2c32-a6cf-296061671e9d +0.389056 aeaj 580d8c87-e557-0db1-3a19-cb3a44d623b1 +')) +``` + ## Interacting with PyKX Objects PyKX objects can be interacted with in a variety of ways, through indexing using Pythonic syntax, passing PyKX objects to q/numpy functions, querying via SQL/qSQL syntax or through the use of q functionality via the context interface. Each of these is described in more depth throughout this documentation but examples of each are provided here @@ -121,7 +121,7 @@ PyKX objects can be interacted with in a variety of ways, through indexing using * Create a PyKX list and interact with the list using indexing and slices. 
```python - >>> qarray = kx.toq([random() for _ in range(10)], kx.FloatVector) + >>> qarray = kx.random.random(10, 1.0) >>> qarray pykx.FloatVector(pykx.q('0.391543 0.08123546 0.9367503 0.2782122 0.2392341 0.1508133 0.1567317 0.9785 ..')) >>> qarray[1] @@ -143,11 +143,12 @@ PyKX objects can be interacted with in a variety of ways, through indexing using * Create a PyKX table and manipulate using Pythonic syntax ```python + >>> N = 100 >>> qtable = kx.Table( data={ - 'x': [random() for _ in range(100)], - 'x1': [random() * 5 for _ in range(100)], - 'x2': [['a', 'b', 'c'][randint(0, 2)] for _ in range(100)] + 'x': kx.random.random(N, 1.0), + 'x1': 5 * kx.random.random(N, 1.0), + 'x2': kx.random.random(N, ['a', 'b', 'c']) } ) >>> qtable @@ -216,16 +217,16 @@ PyKX objects can be interacted with in a variety of ways, through indexing using * Pass a PyKX array objects to a Numpy functions ```python - >>> qarray1 = kx.toq([random() for _ in range(10)], kx.FloatVector) + >>> qarray1 = kx.random.random(10, 1.0) >>> qarray1 pykx.FloatVector(pykx.q('0.7880561 0.9677446 0.9325539 0.6501981 0.4837422 0.5338642 0.5156039 0.31358..')) - >>> qarray2 = kx.toq([random() for _ in range(10)], kx.FloatVector) + >>> qarray2 = kx.random.random(10, 1.0) >>> qarray2 pykx.FloatVector(pykx.q('0.04164985 0.6417901 0.1608836 0.691249 0.4832847 0.6339534 0.4614883 0.06373..')) >>> np.max(qarray1) 0.9677445779088885 - >>> np.sum(kx.toq([randint(0, 10) for _ in range(10)])) + >>> np.sum(kx.random.random(10, 10)) 43 >>> np.add(qarray1, qarray2) pykx.FloatVector(pykx.q('0.8297059 1.609535 1.093438 1.341447 0.9670269 1.167818 0.9770923 0.3773123 1..')) @@ -234,11 +235,12 @@ PyKX objects can be interacted with in a variety of ways, through indexing using * Query using SQL/qSQL ```python + >>> N = 100 >>> qtable = kx.Table( data={ - 'x': [['a', 'b', 'c'][randint(0, 2)] for _ in range(100)] - 'x1': [random() for _ in range(100)], - 'x2': [random() * 5 for _ in range(100)], + 'x': kx.random.random(N, 
['a', 'b', 'c']),
A projection which is used to indicate that once the q object is passed to Python for evaluation it should be treated as a default object.
Additionally it converts q tables to Pandas DataFrames + +```q +// Denote that a q object once passed to Python should be managed as a default object +// in this case a q list is converted to numpy +q).pykx.todefault til 10 +enlist[`..numpy;;][0 1 2 3 4 5 6 7 8 9] + +// Pass a q list to Python treating the Python object as PyKX default +q).pykx.print .pykx.eval["lambda x: type(x)"] .pykx.todefault (til 10;til 10) + + +// Pass a q Table to Python by default treating the Python table as a Pandas DataFrame +q).pykx.print .pykx.eval["lambda x: type(x)"] .pykx.todefault ([]til 10;til 10) + +``` + ## `.pykx.tok` diff --git a/docs/pykx-under-q/intro.md b/docs/pykx-under-q/intro.md index 94e4dee..2a6568d 100644 --- a/docs/pykx-under-q/intro.md +++ b/docs/pykx-under-q/intro.md @@ -165,13 +165,13 @@ Foreign objects can be stored in variables just like any other q datatype, or as Foreign objects cannot be directly operated on in q. Instead, Python objects are typically represented as PyKX objects, which wrap the underlying foreign objects. This provides the ability to get and set attributes, index, call or convert the underlying foreign object to a q object. -Use `.pykx.wrap` to create an PyKX object from a foreign object. +Use `.pykx.wrap` to create a PyKX object from a foreign object. ```q q)x foreign q)p:.pykx.wrap x -q)p /how an PyKX object looks +q)p /how a PyKX object looks {[f;x].pykx.util.pykx[f;x]}[foreign]enlist ``` @@ -187,7 +187,7 @@ function | argument | example ### Converting data -Given `obj`, an PyKX object representing Python data, we can get the underlying data (as foreign or q) using +Given `obj`, a PyKX object representing Python data, we can get the underlying data (as foreign or q) using ```q obj`. / get data as foreign @@ -214,7 +214,7 @@ There is one important exception to this. 
When calling Python functions, methods ### Getting attributes and properties -Given `obj`, an PyKX object representing a Python object, we can get an attribute or property directly using +Given `obj`, a PyKX object representing a Python object, we can get an attribute or property directly using ```q obj`:attr / equivalent to obj.attr in Python @@ -248,7 +248,7 @@ q)obj[`:y]` ### Setting attributes and properties -Given `obj`, an PyKX object representing a Python object, we can set an attribute or property directly using +Given `obj`, a PyKX object representing a Python object, we can set an attribute or property directly using ```q obj[:;`:attr;val] / equivalent to obj.attr=val in Python @@ -271,7 +271,7 @@ q)obj[`:y]` ### Indexing -Given `lst`, an PyKX object representing an indexable container object in Python, we can access the element at index `i` using +Given `lst`, a PyKX object representing an indexable container object in Python, we can access the element at index `i` using ```q lst[@;i] / equivalent to lst[i] in Python @@ -307,7 +307,7 @@ q)lst` ### Getting methods -Given `obj`, an PyKX object representing a Python object, we can access a method directly using +Given `obj`, a PyKX object representing a Python object, we can access a method directly using ```q obj`:method / equivalent to obj.method in Python @@ -334,7 +334,7 @@ Using the function API, PyKX objects can be called directly (returning PyKX obje Users explicitly specify the return type as q or foreign, the default is as a PyKX object. 
-Given `func`, an `PyKX` object representing a callable Python function or method, we can carry out the following operations: +Given `func`, a `PyKX` object representing a callable Python function or method, we can carry out the following operations: ```q func / func is callable by default (returning PyKX) @@ -432,7 +432,7 @@ q)oarg:.pykx.eval"10" q)oarg` 10 q)ofunc:.pykx.eval["lambda x:2+x";<] -q)ofunc[1]` +q)ofunc[1] 3 q)ofunc oarg 12 @@ -691,7 +691,7 @@ func(None) #call with argument None !!! warning "PyKX function called with `::` calls Python with no arguments" - Although `::` in q corresponds to `None` in Python, if an PyKX function is called with `::` as its only argument, the corresponding Python function will be called with _no_ arguments. + Although `::` in q corresponds to `None` in Python, if a PyKX function is called with `::` as its only argument, the corresponding Python function will be called with _no_ arguments. To call a Python function with `None` as its sole argument, retrieve `None` as a foreign object in q and pass that as the argument. diff --git a/docs/pykx-under-q/known_issues.md b/docs/pykx-under-q/known_issues.md new file mode 100644 index 0000000..d463fcb --- /dev/null +++ b/docs/pykx-under-q/known_issues.md @@ -0,0 +1,131 @@ +# PyKX under q known issues + +PyKX aims to make q and Python operate as seamlessly as possible together. +However due to differences in the languages there are some special cases to be aware of when using the interface. + +## Passing special values to PyKX objects + +PyKX under q uses certain special values to control how objects are returned/converted. When you need to pass these special values as parameters some specific steps must be followed. + +### Return control values `<`, `>`, and `*` + +Using the [PyKX function API](intro.md#pykx-function-api), PyKX objects can be called directly (returning PyKX objects) or declared callable returning q or `foreign` data. 
Given `func`, an object representing a callable Python function or method, we can carry out the following operations:
i.e.
This limitation means that a user of embedPy must handle their own data conversions when handling Pandas or PyArrow objects. - -PyKX natively supports data conversions from q to Python, Numpy, Pandas and PyArrow and as such can support workflows which previously required users to manually control these conversions, for example: - -```q -q).pykx.print .pykx.eval["lambda x:type(x)"] .pykx.topd ([]10?1f) - -``` - ## Functionality mapping The following table describes the function mapping from PyKX to embedPy for various elements of the supported functionality within embedPy, where a mapping supported this will be explicitly noted. Where workarounds exist these are additionally noted. @@ -70,3 +58,83 @@ The following table describes the function mapping from PyKX to embedPy for vari | Convert a q object to a Python foreign object | Unsupported | `.p.q2py` | | Create a Python closure using a q function | Unsupported | `.p.closure` | | Create a Python generator using a q function | Unsupported | `.p.generator` | + +## PyKX under q benefits over embedPy + +PyKX under q provides a number of key functional benefits over embedPy alone when considering the generation of workloads that integrate Python and q code. The following are the key functional/feature updates which provide differentiation between the two libraries + +1. Flexibility in supported data formats and conversions +2. Python code interoperability +3. Access to PyKX in it's Python first modality + +### Flexibility in supported data formats and conversions + +EmbedPy contains a fundamental limitation with respect to the data formats that are supported when converting between q and Python. Namely that all q objects when passed to Python functions use the analogous Python/NumPy representation. This limitation means that a user of embedPy who require data to be in a Pandas/PyArrow format need to handle these conversions manually. 
While these solutions provide a method of integrating your Python code, they are not intuitive to a user versed in both Python and q.
+ +### Access to PyKX in its Python first modality + +Following on from the Python interoperability section above, access to PyKX itself as a Python module provides significant flexibility to users when developing analytics for use within a q session. + +With embedPy when q/kdb+ data is passed to Python for the purposes of completing "Python first" analysis there is a requirement that that analysis fully uses Python libraries that are available to a user and cannot get performance benefits from having access to q/kdb+. + +Take for example a case where a user wishes to run a Python function which queries a table available in their q process using SQL and calculates the mean value for all numeric columns. + +```q +q)tab:([]100?`a`b`c;100?1f;100?1f;100?0Ng) +q).pykx.console[] +>>> import pykx as kx +>>> def pyfunction(x): +... qtab = kx.q.sql('SELECT * from tab where x=$1', x) +... return qtab.mean(numeric_only=True) +>>> quit() +q)pyfunc:.pykx.get[`pyfunction;<] +q)pyfunc `a +x1| 0.5592623 +x2| 0.486176 +``` diff --git a/docs/release-notes/changelog.md b/docs/release-notes/changelog.md index 53073e8..59d61e6 100644 --- a/docs/release-notes/changelog.md +++ b/docs/release-notes/changelog.md @@ -4,15 +4,426 @@ The changelog presented here outlines changes to PyKX when operating within a Python environment specifically, if you require changelogs associated with PyKX operating under a q environment see [here](./underq-changelog.md). -## PyKX 2.2.0 +!!! Warning + + Currently PyKX is not compatible with Pandas 2.2.0 or above as it introduced breaking changes which cause data to be cast to the incorrect type. + +## PyKX 2.3.1 #### Release Date -2023-11-09 +2024-02-07 + +### Fixes and Improvements + +- Python functions saved to q would error if passed `''` or `'.'`. These now pass without issue. + + === "Behavior prior to change" + + ```python + >>> def func(n=2): + ... return n + ... 
+ >>> kx.q['func']= func + >>> kx.q('func', '') + Traceback (most recent call last): + File "", line 1, in + File "/home/rocuinneagain/.local/lib/python3.10/site-packages/pykx/embedded_q.py", line 227, in __call__ + return factory(result, False) + File "pykx/_wrappers.pyx", line 493, in pykx._wrappers._factory + File "pykx/_wrappers.pyx", line 486, in pykx._wrappers.factory + pykx.exceptions.QError: Provided foreign object is not a Python object + >>> kx.q('func', '.') + Traceback (most recent call last): + File "", line 1, in + File "/home/rocuinneagain/.local/lib/python3.10/site-packages/pykx/embedded_q.py", line 227, in __call__ + return factory(result, False) + File "pykx/_wrappers.pyx", line 493, in pykx._wrappers._factory + File "pykx/_wrappers.pyx", line 486, in pykx._wrappers.factory + pykx.exceptions.QError: rank + ``` + + === "Behavior post change" + + ```python + >>> def func(n=2): + ... return n + ... + >>> kx.q['func']= func + >>> kx.q('func', '') + pykx.SymbolAtom(pykx.q('`')) + >>> kx.q('func', '.') + pykx.SymbolAtom(pykx.q('`.')) + ``` + +- Changed `Table.rename()` to ignore any `columns` values that are of the wrong type instead of throwing an unhelpful error. + + === "Behavior prior to change" + + ```python + >>> key_tab.rename({0:'PolicyID'}, axis = 1) + ValueError('nyi') + ``` + + === "Behavior post change" + + ```python + >>> key_tab.rename({0:'PolicyID'}, axis = 1) + pykx.KeyedTable(pykx.q(' + idx| x y + ---| --- + 0 | 0 3 + 1 | 1 2 + 2 | 2 1 + ')) + ``` + +- Improved upon the quality of `Table.rename()` error messages and documentation on the function. +- PyKX would error with `_get_config_value() missing 1 required positional argument: 'default'` on import if a license was not found since 2.3.0. Now correctly opens the license walkthrough. +- Pandas 2.2.0 introduced breaking changes which effect PyKX. PyKX dependencies have been updated to `pandas>=1.2, < 2.2.0` until these are resolved. 
Data casting behavior leads to an unexpected datatype being returned: + + === "Behavior with Pandas <2.2.0" + + ```python + >>> pd.Series([1, pd.NA, 3], dtype=pd.Int64Dtype()).to_numpy() + array([1, , 3], dtype=object) + + >>> kx.K(pd.Series([1, pd.NA, 3], dtype=pd.Int64Dtype())) + pykx.LongVector(pykx.q('1 0N 3')) + ``` + + === "Behavior with Pandas >=2.2.0" + + ```python + >>> pd.Series([1, pd.NA, 3], dtype=pd.Int64Dtype()).to_numpy() + array([ 1., nan, 3.]) + + >>> kx.K(pd.Series([1, pd.NA, 3], dtype=pd.Int64Dtype())) + pykx.FloatVector(pykx.q('1 -9.223372e+18 3')) + ``` + +- `df.select_dtypes()` updated to now accept `kx.*Atom` values for `include`/`exclude` params. Use of `kx.CharVector` will return error. +- To align with other areas of PyKX the `upsert` and `insert` methods for PyKX tables and keyed tables now support the keyword argument `inplace`, this change will deprecate usage of `replace_self` with the next major release of PyKX. + +### Beta Features + +- Addition of the concept of `Remote Function` execution to PyKX, this allows users, from a Python session to define Python functions which will be executed on a remote q/kdb+ server running PyKX under q. The intention with this feature is to allow onboarding of Python first operations within existing or q/kdb+ first infrastructures + + ```python + >>> from pykx.remote import function, session + >>> remote_session = session() + >>> remote_session.create('localhost', 5050) + >>> @function(remote_session) + ... def func(x): + ... return x+1 + >>> func(2) # Functionality run on q server + pykx.LongAtom(pykx.q('3')) + >>> remote_session.clear() + ``` + +## PyKX 2.3.0 + +#### Release Date + +2024-01-22 + +### Additions + +- PyKX now supports the use of `KDB_LICENSE_B64` or `KDB_K4LICENSE_B64` configuration values/environment variables to define the content of a `kc.lic` or `k4.lic` license respectively if no license is found on initial usage of PyKX. 
+- Shortcut provided for access to current date, time and timestamp information using `'today'` and `'now'`. + + ```python + >>> kx.DateAtom('today') + pykx.DateAtom(pykx.q('2024.01.05')) + >>> kx.TimeAtom('now') + pykx.TimeAtom(pykx.q('16:15:32.724')) + >>> kx.TimestampAtom('now') + pykx.TimestampAtom(pykx.q('2024.01.05T16:15:42.926631000')) + ``` + +- Addition of support for `inplace` updates of PyKX tables modified using qsql select/update/delete operations on in-memory data. Application of `inplace` modifications is not supported for direct application on Partitioned/Splayed tables. + + ```python + >>> N = 1000 + >>> qtab = kx.Table(data={'x': kx.random.random(N, 1.0, seed=10)}) + >>> qtab + pykx.Table(pykx.q(' + x + ----------- + 0.0891041 + 0.8345194 + 0.3621949 + 0.999934 + 0.3837986 + .. + ')) + >>> kx.q.qsql.select(qtab, where = ['x>0.5'], inplace=True) + pykx.Table(pykx.q(' + x + ----------- + 0.8345194 + 0.999934 + 0.8619188 + 0.7517286 + 0.6348263 + .. + ')) + >>> qtab + pykx.Table(pykx.q(' + x + ----------- + 0.8345194 + 0.999934 + 0.8619188 + 0.7517286 + 0.6348263 + .. + ')) + ``` + +- Addition of `reset_index`, `add_suffix`, `add_prefix`, `count`, `skew` and `std` functionality to Pandas Like API + - See [here](../user-guide/advanced/Pandas_API.ipynb) for details of supported keyword arguments, limitations and examples. +- `%%q` Jupyter Notebook magic adds `--debug` option which prints the q backtrace if the cell execution fails. +- Release 2.3.0 adds to PyKX the concept of Beta features, these features are available to users through setting the configuration/environment variable `PYKX_BETA_FEATURES`. For more information on Beta features see further documentation [here](../beta-features/index.md) + +### Fixes and Improvements + +- `%%q` Jupyter Notebook magic now returns all outputs up to and including an error when thrown. Previously only the error was returned. +- `%%q` Jupyter Notebook magic ignores accidental whitespace in execution options. 
Below example no longer fails with `Received unknown argument` error: + + ```python + %%q --port 5000 + ``` + +- In cases where PyKX IPC sockets read data from unexpected publishers it could raise an `IndexError`. PyKX will now provide a more verbose error indicating that an unexpected message has been received, the bytes processed and requests a reproducible example to be provided if possible. +- Update to table column retrieval logic to error when a user attempts to access a non-existent column with a queried table. + + === "Behavior prior to change" + + ```python + >>> tab = kx.Table(data = {'a': [1, 2, 3]}) + >>> tab['c'] + pykx.LongVector(pykx.q('`long$()')) + ``` + + === "Behavior post change" + + ```python + >>> tab = kx.Table(data = {'a': [1, 2, 3]}) + >>> tab['c'] + .. + QError: Attempted to retrieve inaccessible column: c + ``` + +- Improved error message for conversion failures. +- Fixes an issue where a user would receive a length error when attempting to apply `min`, `max`, `prod` and `sum` functions on `pykx.KeyedTable` objects. + +### Beta Features + +- Database Management functionality has been added for the creation, loading and maintenance of PyKX Partitioned Databases. A full worked example of this functionality can be found [here](../examples/db-management.ipynb) along with full API documentation which includes examples of each function [here](../api/db.md). The API includes but is not limited to the following: + + - Database table creation and renaming. + - Enumeration of in-memory tables against on-disk sym file. + - Column listing, addition, reordering, renaming copying, function application and deletion on-disk. + - Attribute setting and removal. + - Addition of missing tables from partitions within a database. + +- Added `PYKX_THREADING` environment variable that allows [multithreaded programs](../beta-features/threading.md) to modify state when calling into python on secondary threads. 
Note: This behaviour is only supported on Linux / MacOS. + + !!! Note + + When using `PYKX_THREADING` you must ensure you call `kx.shutdown_thread()` at the end of the script to ensure the background thread is properly closed. + +## PyKX 2.2.3 + +#### Release Date + +2024-01-11 + +### Fixes and Improvements + +- PyKX now raises an error appropriately when failing to locate `msvcr100.dll` when loading on Windows. +- Config values now default to `False` when not set rather than `None`. +- Resolved issue where both `PYKX_NO_SIGNAL` and `PYKX_NO_SIGINT` needed to be set to take effect. Now correctly accepts either. +- Reduced signal handling list to only `SIGINT` and `SIGTERM`. The inclusion of `SIGSEGV` since 2.2.1 could cause segfaults with compressed enum files. +- Updated q libraries to 2024.01.09 + +!!! Note + + PyKX 2.2.3 is currently not available for Mac x86 for all Python versions, additionally it is unavailable for Mac ARM on Python 3.7. Updated builds will be provided once available. + +## PyKX 2.2.2 !!! Warning - PyKX 2.2.0 presently does not include a Python 3.11 release for MacOS x86 and Linux x86 architectures, this will be rectified in an upcoming patch release. + Please skip this release and use 2.2.3 or newer. This is due to potential segfaults when reading compressed files. + +#### Release Date + +2023-12-12 + +### Fixes and Improvements + +- Conversions between `UUID` and `pykx.GUID` types could produce invalid results under various conditions in both licensed and unlicensed mode. +- A regression in 2.2.1 resulted in `SIGINT` signals being incorrectly treated as `SIGTERM` style signals, PyKX now resets all signals overwritten by PyKX to their values prior to import. +- Indexing regression in 2.2.1 causing hangs for certain inputs such as `tbl[::-1]` has been resolved. + +## PyKX 2.2.1 + +!!! Warning + + Please skip this release and use 2.2.3 or newer. This is due to potential segfaults when reading compressed files. 
+ +#### Release Date + +2023-11-30 + +### Fixes and Improvements + +- Some messages to `stdout` were not being captured when redirecting. Now all are captured. +- Deprecation of internally used environment variable `UNDER_PYTHON` which has been replaced by `PYKX_UNDER_PYTHON` to align with other internally used environment variables. +- Fix `Unknown default conversion type` error when `PYKX_DEFAULT_CONVERSION` is set to `k` +- Numpy dependency for Python 3.11 corrected to `numpy~=1.23.2` +- `pykx.q.qsql.select` and `pykx.q.qsql.exec` statements no longer use `get` calls for table retrieval unnecessarily when operating locally or via IPC. +- Null integral values in table keys will no longer convert the underlying vectors to floats when converting from a `pykx.KeyedTable` to `pandas.DataFrame` + + === "Behaviour prior to change" + + ```python + >>> kx.q('`col1 xkey ([] col1: (1j; 2j; 0Nj); col2:(1j; 2j; 0Nj); col3:`a`b`c)').pd() + col2 col3 + col1 + 1.0 1 a + 2.0 2 b + 0.0 -- c + ``` + + === "Behaviour post change" + + ```python + >>> kx.q('`col1 xkey ([] col1: (1j; 2j; 0Nj); col2:(1j; 2j; 0Nj); col3:`a`b`c)').pd() + col2 col3 + col1 + 1 1 a + 2 2 b + -- -- c + ``` + + !!! Warning + + For multi-keyed PyKX tables converted to Pandas the appropriate round-trip behaviour is supported however due to limitations in Pandas displaying of these as masked arrays is not supported as below + + ```python + >>> kx.q('`col1`col2 xkey ([] col1: (1j; 2j; 0Nj); col2:(1j; 2j; 0Nj); col3:`a`b`c)').pd() + col3 + col1 col2 + 1 1 a + 2 2 b + -9223372036854775808 -9223372036854775808 c + ``` + +- Fix to issue where providing `SIGTERM` signals to Python processes running PyKX would not result in the Python process being terminated. +- Addition of deprecation warning for environmental configuration option `PYKX_NO_SIGINT` which is to be replaced by `PYKX_NO_SIGNAL`. 
This is used when users require no signal handling logic overwrites and now covers `SIGTERM`, `SIGINT`, `SIGABRT` signals amongst others. +- Use of `pykx.q.system.variables` no longer prepends leading `.` to supplied string allowing users to get the variables associated with dictionary like namespaces. + + === "Behaviour prior to change" + + ```python + >>> kx.q('.test.a:1;.test.b:2') + >>> kx.q('test.c:3;test.d:4') + >>> kx.q.system.variables('.test') + pykx.SymbolVector(pykx.q('`s#`a`b')) + >>> kx.q.system.variables('test') + pykx.SymbolVector(pykx.q('`s#`a`b')) + ``` + + === "Behaviour post change" + + ```python + >>> kx.q('.test.a:1;.test.b:2') + >>> kx.q('test.c:3;test.d:4') + >>> kx.q.system.variables('.test') + pykx.SymbolVector(pykx.q('`s#`a`b')) + >>> kx.q.system.variables('test') + pykx.SymbolVector(pykx.q('`s#`c`d')) + ``` + +- q dictionaries with tables as keys were being incorrectly wrapped as `pykx.KeyedTable`. Now corrected to `pykx.Dictionary`: + + === "Behavior prior to change" + + ```python + >>> type(pykx.q('([] a:1 2 3;b:2 3 4)!enlist each 1 2 3')) + + ``` + + === "Behavior post change" + + ```python + >>> type(pykx.q('([] a:1 2 3;b:2 3 4)!enlist each 1 2 3')) + + ``` +- Added consistent conversion of `datetime.time` objects + + === "Behavior prior to change" + + ```q + q).pykx.pyexec"from datetime import time" + q).pykx.eval["time(11, 34, 56)"]` + foreign + ``` + + ```python + >>> kx.toq(time(11, 34, 56)) + Traceback (most recent call last): + File "", line 1, in + File "pykx/toq.pyx", line 2641, in pykx.toq.ToqModule.__call__ + File "pykx/toq.pyx", line 270, in pykx.toq._default_converter + TypeError: Cannot convert 'datetime.time(11, 34, 56)' to K object + ``` + + === "Behavior post change" + + ```q + q).pykx.pyexec"from datetime import time" + q).pykx.eval["time(11, 34, 56)"]` + 0D11:34:56.000000000 + ``` + + ```python + >>> kx.toq(time(11, 34, 56)) + pykx.TimespanAtom(pykx.q('0D11:34:56.000000000')) + ``` + +- Fixed null value for 
`TimestampVector` returning `NoneType` instead of `pykx.wrappers.TimestampAtom` for `.py()` method + + === "Before Null Change" + + ```python + >>> for x in kx.q('0Np,.z.p').py(): + ... print(type (x)) + + + ``` + + === "After Null Change" + + ```python + >>> for x in kx.q('0Np,.z.p').py(): + ... print(type (x)) + + + ``` + +### Upgrade considerations + +- If dependent on the environment variable `UNDER_PYTHON` please upgrade your code to use `PYKX_UNDER_PYTHON` + +## PyKX 2.2.0 + +#### Release Date + +2023-11-09 ### Additions @@ -204,6 +615,32 @@ - Addition of `poll_recv_async` to `RawQConnection` objects to support asynchronous polling. +- Addition of negative slicing to `list` , `vector` and `table` objects + + ```python + >>> import pykx as kx + >>> qlist = kx.q('("a";2;3.3;`four)') + >>> qlist[-3:] + pykx.List(pykx.q(' + 2 + 3.3 + `four + ')) + + >>> vector = kx.q('til 5') + >>> vector[:-1] + pykx.LongVector(pykx.q('0 1 2 3')) + + >>> table = kx.q('([] a:1 2 3; b:4 5 6; c:7 8 9)') + >>> table[-2:] + pykx.Table(pykx.q(' + a b c + ----- + 2 5 8 + 3 6 9 + ')) + ``` + ### Fixes and Improvements - Fix to allow users to use Python functions when operating on a `pykx.GroupbyTable` with an `apply` function @@ -399,7 +836,7 @@ the following reads a CSV file and specifies the types of the three columns name ``` - Notebooks will HTML print tables and dictionaries through the addition of `_repr_html_`. Previous `q` style output is still available using `print`. -- Added [`serialize` and `deserialize`](../api/serialize.html) as base methods to assist with the serialization of `K` objects for manual use over IPC. +- Added [`serialize` and `deserialize`](../api/serialize.md) as base methods to assist with the serialization of `K` objects for manual use over IPC. - Added support for `pandas` version `2.0`. !!! Warning "Pandas 2.0 has deprecated the `datetime64[D/M]` types." 
@@ -674,7 +1111,7 @@ the following reads a CSV file and specifies the types of the three columns name ### Additions - Added `to_local_folder` kwarg to `install_into_QHOME` to enable use of `pykx.q` without write access to `QHOME`. -- Added [an example](../examples/threaded_execution/README.md) that shows how to use `EmbeddedQ` in a multithreaded context where the threads need to modify global state. +- Added [an example](../examples/threaded_execution/threading.md) that shows how to use `EmbeddedQ` in a multithreaded context where the threads need to modify global state. - Added [PYKX_NO_SIGINT](../user-guide/configuration.md#environment-variables) environment variable. ### Fixes and Improvements diff --git a/docs/release-notes/underq-changelog.md b/docs/release-notes/underq-changelog.md index c30b87f..f248ca3 100644 --- a/docs/release-notes/underq-changelog.md +++ b/docs/release-notes/underq-changelog.md @@ -6,8 +6,172 @@ This changelog provides updates from PyKX 2.0.0 and above, for information relat The changelog presented here outlines changes to PyKX when operating within a q environment specifically, if you require changelogs associated with PyKX operating within a Python environment see [here](./changelog.md). +## PyKX 2.3.1 + +#### Release Date + +2024-02-07 + +### Fixes and Improvements + +- `.pykx.eval` is now variadic allowing an optional second parameter to be passed to define return type. Previously would error with `rank`. 
+ + === "Behavior prior to change" + + ```q + q).pykx.eval["lambda x: x";<] 7 + 'rank + [0] .pykx.eval["lambda x: x";<] 7 + ``` + + === "Behavior post change" + + ```q + q).pykx.eval["lambda x: x";<] 7 + 7 + ``` + +- Wraps which have a return type assigned using `<` or `>` are now considered wraps and can be unwrapped: + + === "Behavior prior to change" + + ```q + q).pykx.util.isw .pykx.eval["lambda x: x"][<] + 0b + q).pykx.unwrap .pykx.eval["lambda x: x"][<] + {$[type[x]in 104 105 112h;util.foreignToq unwrap x;x]}.[code[foreign]]`.pykx.util.parseArgsenlist + ``` + + === "Behavior post change" + + ```q + q).pykx.util.isw .pykx.eval["lambda x: x"][<] + 1b + q).pykx.unwrap .pykx.eval["lambda x: x"][<] + foreign + ``` + +- `.pykx.qcallable` and `.pykx.pycallable` can now convert wraps which already have return types assigned: + + === "Behavior prior to change" + + ```q + q).pykx.qcallable[.pykx.eval["lambda x: x"][<]]` + 'Could not convert provided function to callable with q return + q).pykx.print .pykx.pycallable[.pykx.eval["lambda x: x"][>]] + 'Could not convert provided function to callable with Python return + ``` + + === "Behavior post change" + + ```q + q).pykx.qcallable[.pykx.eval["lambda x: x"][<]]`test + `test + q).pykx.print .pykx.wrap .pykx.pycallable[.pykx.eval["lambda x: x"][>]]`test + test + ``` + +## PyKX 2.3.0 + +#### Release Date + +2024-01-22 + +### Fixes and Improvements + +- A bug was fixed when using `.pykx.console`, it is now possible to access python variables set using the console with `.pykx.(eval|pyexec|pyeval)` functions. 
+ + === "Behavior prior to change" + + ```q + q) .pykx.console[] + >>> a = 10 + >>> quit() + q) .pykx.eval["a"]` + 'NameError("name 'a' is not defined") + [1] /.../q/pykx.q:968: .pykx.eval:{wrap pyeval x} + ``` + + === "Behavior post change" + + ```q + q) .pykx.console[] + >>> a = 10 + >>> quit() + q) .pykx.eval["a"]` + 10 + ``` + +## PyKX 2.2.2 + +#### Release Date + +2023-12-07 + +### Fixes and Improvements + +- When loaded in a q process loading `pykx.q` would not allow `Ctrl+C` (SIGINT) interruption. + +## PyKX 2.2.1 + +#### Release Date + +2023-11-30 + +### Fixes and Improvements + +- `.pykx.print` was using `repr` representation for some objects. Now consistently calls `print`. +- `.pykx.safeReimport` now resets environment variables correctly before throwing any error raised by the function supplied to it. +- Wrapped Python objects being supplied as arguments to functions were being converted according to `.pykx.util.defaultConv`. Now are left untouched: + + === "Behavior prior to change" + + ```q + q)\l pykx.q + q)np:.pykx.import `numpy; + q)r:np[`:random.rand;1;2]; + q).pykx.print r + array([[0.03720163, 0.72012121]]) + q).pykx.print .pykx.eval["lambda x: x"] r + array([array([0.03720163, 0.72012121])], dtype=object) + q).pykx.setdefault"py" + q).pykx.print .pykx.eval["lambda x: x"] r + [[0.037201634310417564, 0.7201212148535847]] + ``` + + === "Behavior post change" + + ```q + q).pykx.print r + array([[0.59110368, 0.52612429]]) + q).pykx.print .pykx.eval["lambda x: x"] r + array([[0.59110368, 0.52612429]]) + q).pykx.setdefault"py" + q).pykx.print .pykx.eval["lambda x: x"] r + array([[0.59110368, 0.52612429]]) + ``` +- q hsym will convert correctly to `pathlib.PosixPath` rather than `str`: + + === "Behavior prior to change" + + ```q + q).pykx.eval["lambda x: print(type(x))"] `:/path/to/somewhere; + + ``` + + === "Behavior post change" + + ```q + q).pykx.eval["lambda x: print(type(x))"] `:/path/to/somewhere; + + ``` + ## PyKX 2.2.0 +#### Release Date + 
+2023-11-09 + ### Additions - Addition of `PYKX_EXECUTABLE` environment/configuration variable to allow control of which Python executable is used under q. @@ -63,6 +227,10 @@ This changelog provides updates from PyKX 2.0.0 and above, for information relat ## PyKX 2.1.0 +#### Release Date + +2023-10-09 + ### Fixes and Improvements - Update to default conversion logic for q objects passed to PyKX functions to more closely match embedPy based conversion expectations.For version <=2.0 conversions of KX lists would produce N Dimensional Numpy arrays of singular type. This results in issues when applying to many analytic libraries which rely on lists of lists rather than singular N Dimensional arrays. Additionally q tables and keyed tables would be converted to Numpy recarrays, these are now converted to Pandas DataFrames. To maintain previous behavior please set the following environment variable `PYKX_DEFAULT_CONVERSION="np"`. diff --git a/docs/roadmap.md b/docs/roadmap.md index 65e3496..45e84b3 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -16,7 +16,7 @@ If you need a feature that's not included in this list please let us know by rai - Database management functionality allowing for Pythonic persistence and management of on-disk kdb+ Databases (Beta) - Improvements to multi-threaded PyKX efficiency, reducing per-call overhead for running PyKX on separate threads -- Configurable initialisation logic in the absense of a license. Thus allowing users who have their own workflows for license access to modify the instructions for their users. +- Configurable initialisation logic in the absence of a license. Thus allowing users who have their own workflows for license access to modify the instructions for their users. 
- Addition of `cast` keyword when inserting/upserting data into a table reducing mismatch issues ## Future @@ -26,4 +26,4 @@ If you need a feature that's not included in this list please let us know by rai - Continued additions of Pandas-like functionality on PyKX Table objects - Performance improvements through enhanced usage of Cython - Real-time/Streaming functionality utilities -- Data pre-processing and statitics modules for operation on PyKX tables and vector objects +- Data pre-processing and statistics modules for operation on PyKX tables and vector objects diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c7118c7..7aaf644 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -22,7 +22,7 @@ Once you have access to your license you can install the license following the w ### Initialization failing with a 'embedq' error -Failure to initialize PyKX while raising an error `embedq` indicates that the license you are attempting to use for PyKX in [licensed modality](modes.md) does not have the sufficient feature flags necessary to run PyKX. To access a license which does allow for running PyKX in this modality please following the instructions [here](#accessing-a-license-valid-for-pykx) to get a new license with appropriate feature flags. +Failure to initialize PyKX while raising an error `embedq` indicates that the license you are attempting to use for PyKX in [licensed modality](user-guide/advanced/modes.md) does not have the sufficient feature flags necessary to run PyKX. To access a license which does allow for running PyKX in this modality please follow the instructions [here](#accessing-a-license-valid-for-pykx) to get a new license with appropriate feature flags. ### Initialization failing with a 'kc.lic' error @@ -156,3 +156,18 @@ The following section outlines how a user can get access to a verbose set of env which q: /usr/local/anaconda3/bin/q q info: ``` + +## Issues running PyKX in a subprocess? 
+ +Internally PyKX makes use of a number of variables/environment variables which are persisted within the Python/q process which imports PyKX. Due to how Python subprocesses work with respect to inheriting environment variables, users who attempt to spawn a subprocess dependent on PyKX will run into a Segmentation Fault. + +To avoid this, subprocesses should be spawned while making use of the `kx.PyKXReimport` functionality as follows: + +```python +import pykx as kx +import subprocess +with kx.PyKXReimport(): + subprocess.Popen(['python', 'file.py']) # Run Python with a file that imports PyKX +``` + +For more information on the `PyKXReimport` functionality see its API documentation [here](api/reimporting.md). diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index 56c8d38..ee56ed9 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2d0c8656", + "id": "dfa26ef1", "metadata": {}, "source": [ "# Pandas API\n", @@ -12,24 +12,37 @@ "\n", "This demonstration will outline the following\n", "\n", - "1. [Metadata properties](#metadata)\n", - "2. [Indexing operations](#indexing)\n", - "3. [Reindexing operations](#reindexing)\n", - "4. [Merging tables](#merging)\n", - "5. [Computations](#computations)\n", - "6. [Setting Indexes](#setting-indexes)" + "1. [Constructing Tables](#Constructing-Tables)\n", + "2. [Metadata](#Metadata)\n", + "3. [Querying and Data Interrogation](#Querying-and-Data-Interrogation)\n", + "4. [Data Joins/Merging](#Data-Joins/Merging)\n", + "5. [Analytic Functionality](#Analytic-functionality)\n", + "6. 
[Data Preprocessing](#Data-Preprocessing)" ] }, { "cell_type": "code", "execution_count": null, - "id": "17f28b87", - "metadata": {}, + "id": "5b2f27e1", + "metadata": { + "tags": [ + "hide_code" + ] + }, "outputs": [], "source": [ "import os\n", "os.environ['IGNORE_QHOME'] = '1' # Ignore symlinking PyKX q libraries to QHOME \n", - "os.environ['PYKX_Q_LOADED_MARKER'] = '' # Only used here for running Notebook under mkdocs-jupyter during document generation.\n", + "os.environ['PYKX_Q_LOADED_MARKER'] = '' # Only used here for running Notebook under mkdocs-jupyter during document generation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "356b337c", + "metadata": {}, + "outputs": [], + "source": [ "import pykx as kx\n", "import numpy as np\n", "import pandas as pd\n", @@ -38,7 +51,7 @@ }, { "cell_type": "markdown", - "id": "774122a0", + "id": "b5c9b878", "metadata": {}, "source": [ "## Constructing Tables" @@ -46,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "0fd8910c", + "id": "15884a6f", "metadata": {}, "source": [ "### Table\n", @@ -75,7 +88,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a748c27", + "id": "a3d8e590", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +97,7 @@ }, { "cell_type": "markdown", - "id": "231a5e28", + "id": "1967dbd6", "metadata": {}, "source": [ "Create a Table from an array like object." @@ -93,7 +106,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e43d716", + "id": "b8c67d04", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +115,7 @@ }, { "cell_type": "markdown", - "id": "1e426cda", + "id": "b59c678b", "metadata": {}, "source": [ "Create a Table from an array like object and provide names for the columns to use." 
@@ -111,7 +124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b3c2edf", + "id": "6469f77e", "metadata": {}, "outputs": [], "source": [ @@ -120,7 +133,7 @@ }, { "cell_type": "markdown", - "id": "be094191", + "id": "a3074cc5", "metadata": {}, "source": [ "### Keyed Table\n", @@ -150,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d93e73d3", + "id": "03162ab2", "metadata": {}, "outputs": [], "source": [ @@ -159,7 +172,7 @@ }, { "cell_type": "markdown", - "id": "119c2e1f", + "id": "eda04de8", "metadata": {}, "source": [ "Create a keyed table from a list of rows." @@ -168,7 +181,7 @@ { "cell_type": "code", "execution_count": null, - "id": "959fcd3d", + "id": "de9fcc81", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "9d83854e", + "id": "ab5393c3", "metadata": {}, "source": [ "Create a keyed table from a list of rows and provide names for the resulting columns." @@ -186,7 +199,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4b2c6989", + "id": "576e4254", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +208,7 @@ }, { "cell_type": "markdown", - "id": "356b29d8", + "id": "cca4e246", "metadata": {}, "source": [ "Create a keyed table with a specified index column." 
@@ -204,7 +217,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acbe339c", + "id": "a29d1521", "metadata": {}, "outputs": [], "source": [ @@ -213,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "95a04686", + "id": "73bf284f", "metadata": {}, "source": [ "## Metadata" @@ -222,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a52fdc82", + "id": "4b363f07", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +246,7 @@ }, { "cell_type": "markdown", - "id": "280baf05", + "id": "40155b78", "metadata": {}, "source": [ "### Table.columns\n", @@ -244,7 +257,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2ee3fad", + "id": "e8a0395e", "metadata": {}, "outputs": [], "source": [ @@ -253,7 +266,7 @@ }, { "cell_type": "markdown", - "id": "40da029e", + "id": "13516f56", "metadata": {}, "source": [ "### Table.dtypes\n", @@ -264,7 +277,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70bd32d2", + "id": "5a312477", "metadata": {}, "outputs": [], "source": [ @@ -273,7 +286,7 @@ }, { "cell_type": "markdown", - "id": "00e49e84", + "id": "10124c07", "metadata": {}, "source": [ "### Table.empty\n", @@ -284,7 +297,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9dc49e08", + "id": "751fc442", "metadata": {}, "outputs": [], "source": [ @@ -293,7 +306,7 @@ }, { "cell_type": "markdown", - "id": "c00e46ef", + "id": "c973fb82", "metadata": {}, "source": [ "### Table.ndim\n", @@ -304,7 +317,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db113636", + "id": "ee6b55a0", "metadata": {}, "outputs": [], "source": [ @@ -313,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "5ea4b315", + "id": "07ac8e54", "metadata": {}, "source": [ "### Table.shape\n", @@ -324,7 +337,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78125654", + "id": "8d6f890c", "metadata": {}, "outputs": [], "source": [ @@ -333,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "1e3f85a5", + "id": "654129cc", "metadata": {}, 
"source": [ "### Table.size\n", @@ -344,10 +357,8 @@ { "cell_type": "code", "execution_count": null, - "id": "c77c5bc7", - "metadata": { - "scrolled": false - }, + "id": "0e621250", + "metadata": {}, "outputs": [], "source": [ "tab.size" @@ -355,473 +366,312 @@ }, { "cell_type": "markdown", - "id": "2be2ece3", + "id": "8e210a91", "metadata": {}, "source": [ - "### Table.mean()\n", - "\n", - "```\n", - "Table.mean(axis=0, numeric_only=False)\n", - "```\n", - "\n", - "Get the mean of values across the requested axis.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :--: | :--------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate mean across 0 is columns, 1 is rows. | 0 |\n", - "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :--------: | :--------------------------------------------------------------------------------------------- |\n", - "| Dictionary | The mean across each row / column with the key corresponding to the row number or column name. 
|" + "## Querying and Data Interrogation" ] }, { - "cell_type": "markdown", - "id": "cb8c5ef8", + "cell_type": "code", + "execution_count": null, + "id": "603d5534", "metadata": {}, + "outputs": [], "source": [ - "**Examples:**\n", - "\n", - "Calculate the mean across the columns of a table" + "# The examples in this section will use this example table filled with random data\n", + "kx.q('N: 1000')\n", + "tab = kx.q('([] x: til N; y: N?`AAPL`GOOG`MSFT; z: N?500f; w: N?1000; v: N?(0N 0 50 100 200 250))')\n", + "tab" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "0c3e5d76", + "cell_type": "markdown", + "id": "9bd3dada", "metadata": {}, - "outputs": [], "source": [ - "tab = kx.Table(data=\n", - " {\n", - " 'a': [1, 2, 2, 4],\n", - " 'b': [1, 2, 6, 7],\n", - " 'c': [7, 8, 9, 10],\n", - " 'd': [7, 11, 14, 14]\n", - " }\n", - ")\n", - "tab" + "### Table.all()\n", + "\n", + "```\n", + "Table.all(axis=0, bool_only=False, skipna=True)\n", + "```\n", + "\n", + "Returns whether or not all values across the given axis have a `truthy` value.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate `all` across 0 is columns, 1 is rows. | 0 |\n", + "| bool_only | bool | Only use columns of the table that are boolean types. | False |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `all` on that column / row. 
|" ] }, { "cell_type": "code", "execution_count": null, - "id": "9986a550", + "id": "95aa447d", "metadata": {}, "outputs": [], "source": [ - "tab.mean()" + "tab.all()" ] }, { "cell_type": "markdown", - "id": "24ac0b99", + "id": "4ac12eb0", "metadata": {}, "source": [ - "Calculate the mean across the rows of a table" + "### Table.any()\n", + "\n", + "```\n", + "Table.any(axis=0, bool_only=False, skipna=True)\n", + "```\n", + "\n", + "Returns whether or not any values across the given axis have a `truthy` value.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate `any` across 0 is columns, 1 is rows. | 0 |\n", + "| bool_only | bool | Only use columns of the table that are boolean types. | False |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `any` on that column / row. 
|" ] }, { "cell_type": "code", "execution_count": null, - "id": "41f6f669", + "id": "a43aabc4", "metadata": {}, "outputs": [], "source": [ - "tab.mean(axis=1)" + "tab.any()" ] }, { "cell_type": "markdown", - "id": "7bf853c5", + "id": "81a8e19f", "metadata": {}, "source": [ - "### Table.median()\n", + "### Table.at[]\n", "\n", "```\n", - "Table.median(axis=0, numeric_only=False)\n", + "Table.at[row, col]\n", "```\n", "\n", - "Get the median of values across the requested axis.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :--: | :----------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate median across 0 is columns, 1 is rows. | 0 |\n", - "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", + "Access a single value for a row / column pair.\n", "\n", - "**Returns:**\n", + "Similar to `loc[]`, in that both provide label-based lookups. Use `at` if you only need to get or set a single value.\n", "\n", - "| Type | Description |\n", - "| :--------: | :----------------------------------------------------------------------------------------------- |\n", - "| Dictionary | The median across each row / column with the key corresponding to the row number or column name. |" + "The `at` property can be used for both assignment and retrieval of values at a given row and column." ] }, { "cell_type": "markdown", - "id": "98da458a", + "id": "44a37aff", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Calculate the median across the columns of a table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bff5ac07", - "metadata": {}, - "outputs": [], - "source": [ - "tab = kx.Table(data=\n", - " {\n", - " 'a': [1, 2, 2, 4],\n", - " 'b': [1, 2, 6, 7],\n", - " 'c': [7, 8, 9, 10],\n", - " 'd': [7, 11, 14, 14]\n", - " }\n", - ")\n", - "tab" + "Get the value of the `z` column in the 997th row." 
] }, { "cell_type": "code", "execution_count": null, - "id": "579c8b33", + "id": "618fe622", "metadata": {}, "outputs": [], "source": [ - "tab.median()" + "tab.at[997, 'z']" ] }, { "cell_type": "markdown", - "id": "f6698350", + "id": "23203909", "metadata": {}, "source": [ - "Calculate the median across the rows of a table" + "Reassign the value of the `z` column in the 997th row to `3.14159`." ] }, { "cell_type": "code", "execution_count": null, - "id": "5664bd93", - "metadata": { - "scrolled": false - }, + "id": "978d991d", + "metadata": {}, "outputs": [], "source": [ - "tab.median(axis=1)" + "tab.at[997, 'z'] = 3.14159\n", + "tab.at[997, 'z']" ] }, { "cell_type": "markdown", - "id": "33af56bb", + "id": "3d62cbbc", "metadata": {}, "source": [ - "### Table.mode()\n", + "### Table.get()\n", "\n", "```\n", - "Table.mode(axis=0, numeric_only=False, dropna=True)\n", + "Table.get(key, default=None)\n", "```\n", "\n", - "Get the mode of values across the requested axis.\n", + "Get a column or columns from a table by key, if the key does not exist return the default value.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :--: | :------------------------------------------------------------ | :-----: |\n", - "| axis | int | The axis to calculate mode across 0 is columns, 1 is rows. | 0 |\n", - "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", - "| dropna | bool | Remove null values from the data before calculating the mode. | True |\n", + "| Name | Type | Description | Default |\n", + "| :-----: | :--------------------: | :------------------------------------------------------ | :--------: |\n", + "| key | Union[str, list[str]] | The column name or list of names to get from the table. | _required_ |\n", + "| default | int | The default value if the key is not found. 
| None |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :--------: | :------------------------------------------------------------------------------------------------ |\n", - "| Table | The mode across each row / column with the column corresponding to the row number or column name. |" + "| Type | Description |\n", + "| :---------------: | :------------------------------------------------------------------- |\n", + "| Union[Table, Any] | A table containing only the columns requested or the default value. |" ] }, { "cell_type": "markdown", - "id": "4201c9af", + "id": "00c06637", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Calculate the mode across the columns of a table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4bfe36c", - "metadata": {}, - "outputs": [], - "source": [ - "tab = kx.Table(data=\n", - " {\n", - " 'a': [1, 2, 2, 4],\n", - " 'b': [1, 2, 6, 7],\n", - " 'c': [7, 8, 9, 10],\n", - " 'd': [7, 11, 14, 14]\n", - " }\n", - ")\n", - "tab" + "Get the `y` column from the table." ] }, { "cell_type": "code", "execution_count": null, - "id": "e1a7eeb1", + "id": "f950cc1e", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "tab.mode()" + "tab.get('y')" ] }, { "cell_type": "markdown", - "id": "6a47af49", + "id": "78608b1c", "metadata": {}, "source": [ - "Calculate the median across the rows of a table" + "Get the `y` and `z` columns from the table." ] }, { "cell_type": "code", "execution_count": null, - "id": "130081ce", + "id": "02d4d586", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ - "tab.mode(axis=1)" + "tab.get(['y', 'z'])" ] }, { "cell_type": "markdown", - "id": "29dffe0d", + "id": "2a2186aa", "metadata": {}, "source": [ - "Calculate the mode across columns and keep null values." + "Attempt to get the `q` column from the table and receive none as that column does not exist." 
] }, { "cell_type": "code", "execution_count": null, - "id": "53a8251a", - "metadata": { - "scrolled": true - }, + "id": "a88ef7dc", + "metadata": {}, "outputs": [], "source": [ - "tab = kx.Table(data=\n", - " {\n", - " 'x': [0, 1, 2, 3, 4, 5, 6, 7, np.NaN, np.NaN],\n", - " 'y': [10, 11, 12, 13, 14, 15, 16, 17, 18, np.NaN],\n", - " 'z': ['a', 'b', 'c', 'd', 'd', 'e', 'e', 'f', 'g', 'h']\n", - " }\n", - ")\n", - "tab" + "print(tab.get('q'))" + ] + }, + { + "cell_type": "markdown", + "id": "ea3dc01a", + "metadata": {}, + "source": [ + "Attempt to get the `q` column from the table and receive the default value `not found` as that column does not exist." ] }, { "cell_type": "code", "execution_count": null, - "id": "f8558148", + "id": "2f3abc92", "metadata": {}, "outputs": [], "source": [ - "tab.mode(dropna=False)" + "tab.get('q', 'not found')" ] }, { "cell_type": "markdown", - "id": "f5c66579", + "id": "b2195cfe", "metadata": {}, "source": [ - "### Table.std()\n", + "### Table.head()\n", "\n", "```\n", - "Table.std(axis=0, skipna=True, numeric_only=False, ddof=0)\n", + "Table.head(n=5)\n", "```\n", "\n", - "Return sample standard deviation over requested axis. Normalized by N-1 by default. This can be changed using the ddof argument.\n", - "\n", + "Get the first n rows from a table.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate the sum across 0 is columns, 1 is rows. | 0 |\n", - "| skipna | bool | not yet implemented | True |\n", - "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", - "| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. 
| 1 |\n", + "| Name | Type | Description | Default |\n", + "| :--: | :--: | :---------------------------- | :-----: |\n", + "| n | int | The number of rows to return. | 5 |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :----------------: | :------------------------------------------------------------------- |\n", - "| Table | The std across each row / column with the key corresponding to the row number or column name. |" + "| Type | Description |\n", + "| :---: | :------------------------------- |\n", + "| Table | The first `n` rows of the table. |" ] }, { "cell_type": "markdown", - "id": "c2767afd", + "id": "18a0ca1e", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Calculate the std across the columns of a table" + "Return the first 5 rows of the table." ] }, { "cell_type": "code", "execution_count": null, - "id": "87b94fd0", + "id": "5120ce1c", "metadata": {}, "outputs": [], "source": [ - "tab = kx.Table(data=\n", - " {\n", - " 'a': [1, 2, 2, 4],\n", - " 'b': [1, 2, 6, 7],\n", - " 'c': [7, 8, 9, 10],\n", - " 'd': [7, 11, 14, 14]\n", - " }\n", - ")\n", - "tab" + "tab.head()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "3e54d557", - "metadata": {}, - "outputs": [], - "source": [ - "tab.std()" - ] - }, - { - "cell_type": "markdown", - "id": "14950833", - "metadata": {}, - "source": [ - "Calculate the std across the rows of a table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f19161ed", - "metadata": {}, - "outputs": [], - "source": [ - "tab.std(axis=1)" - ] - }, - { - "cell_type": "markdown", - "id": "a8ea5a38", - "metadata": {}, - "source": [ - "Calculate std accross columns with ddof=0:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6361dcb7", - "metadata": {}, - "outputs": [], - "source": [ - "tab.std(ddof=0)" - ] - }, - { - "cell_type": "markdown", - "id": "7e2813b4", - "metadata": {}, - "source": [ - "## Indexing" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "77ab64ab", - "metadata": {}, - "outputs": [], - "source": [ - "# The examples in this section will use this example table filled with random data\n", - "kx.q('N: 1000')\n", - "tab = kx.q('([] x: til N; y: N?`AAPL`GOOG`MSFT; z: N?500f; w: N?1000; v: N?(0N 0 50 100 200 250))')\n", - "tab" - ] - }, - { - "cell_type": "markdown", - "id": "69313988", - "metadata": {}, - "source": [ - "### Table.head()\n", - "\n", - "```\n", - "Table.head(n=5)\n", - "```\n", - "\n", - "Get the first n rows from a table.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :--: | :--: | :---------------------------- | :-----: |\n", - "| n | int | The number of rows to return. | 5 |\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :---: | :------------------------------- |\n", - "| Table | The first `n` rows of the table. |" - ] - }, - { - "cell_type": "markdown", - "id": "edf33458", - "metadata": {}, - "source": [ - "**Examples:**\n", - "\n", - "Return the first 5 rows of the table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "916fcf4d", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "tab.head()" - ] - }, - { - "cell_type": "markdown", - "id": "cb58279a", + "cell_type": "markdown", + "id": "08f158a8", "metadata": {}, "source": [ "Return the first 10 rows of the table." 
@@ -830,7 +680,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bf32db40", + "id": "de9c2842", "metadata": {}, "outputs": [], "source": [ @@ -839,233 +689,134 @@ }, { "cell_type": "markdown", - "id": "a5c4a5e9", - "metadata": {}, - "source": [ - "### Table.tail()\n", - "\n", - "```\n", - "Table.tail(n=5)\n", - "```\n", - "\n", - "Get the last n rows from a table.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :--: | :--: | :---------------------------- | :-----: |\n", - "| n | int | The number of rows to return. | 5 |\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :---: | :------------------------------- |\n", - "| Table | The last `n` rows of the table. |" - ] - }, - { - "cell_type": "markdown", - "id": "4e3fee46", - "metadata": {}, - "source": [ - "**Examples:**\n", - "\n", - "Return the last 5 rows of the table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0d34e0b", - "metadata": {}, - "outputs": [], - "source": [ - "tab.tail()" - ] - }, - { - "cell_type": "markdown", - "id": "e223e705", - "metadata": {}, - "source": [ - "Return the last 10 rows of the table." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4edae0c3", - "metadata": {}, - "outputs": [], - "source": [ - "tab.tail(10)" - ] - }, - { - "cell_type": "markdown", - "id": "c87325f8", + "id": "d1c370e4", "metadata": {}, "source": [ - "### Table.get()\n", + "### Table.iloc[]\n", "\n", "```\n", - "Table.get(key, default=None)\n", + "Table.iloc[:, :]\n", "```\n", "\n", - "Get a column or columns from a table by key, if the key does not exist return the default value.\n", + "Purely integer-location based indexing for selection by position.\n", "\n", - "**Parameters:**\n", + "`iloc` is primarily integer position based (from 0 to length-1 of the axis), but may also be used with a BooleanVector.\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :--------------------: | :------------------------------------------------------ | :--------: |\n", - "| key | Union[str, list[str]] | The column name or list of names to get from the table. | _required_ |\n", - "| default | int | The default value if the key is not found. | None |\n", + "Allowed inputs are:\n", + "- An integer, e.g. 5.\n", + "- A list or array of integers, e.g. [4, 3, 0].\n", + "- A slice object with ints, e.g. 1:7.\n", + "- A BooleanVector.\n", + "- A callable function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). This is useful in method chains, when you don’t have a reference to the calling object, but would like to base your selection on some value.\n", + "- A tuple of row and column indexes. The tuple elements consist of one of the above inputs, e.g. (0, 1).\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---------------: | :------------------------------------------------------------------- |\n", - "| Union[Table, Any] | A table contatining only the columns requested or the default value. 
|" + "| Type | Description |\n", + "| :---: | :----------------------------------------------------- |\n", + "| Table | A table containing only the columns / rows requested. |" ] }, { "cell_type": "markdown", - "id": "7c96cd34", + "id": "07e31d96", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Get the `y` column from the table." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f64d914", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "tab.get('y')" - ] - }, - { - "cell_type": "markdown", - "id": "88ee5698", - "metadata": {}, - "source": [ - "Get the `y` and `z` columns from the table." + "Get the second row from a table." ] }, { "cell_type": "code", "execution_count": null, - "id": "daef6ce6", + "id": "f8108853", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "tab.get(['y', 'z'])" + "tab.iloc[1]" ] }, { "cell_type": "markdown", - "id": "26a53f6d", + "id": "30c429f4", "metadata": {}, "source": [ - "Attempt to get the `q` column from the table and recieve none as that column does not exist." + "Get the first 5 rows from a table." ] }, { "cell_type": "code", "execution_count": null, - "id": "3856084d", + "id": "2f817967", "metadata": {}, "outputs": [], "source": [ - "print(tab.get('q'))" + "tab.iloc[:5]" ] }, { "cell_type": "markdown", - "id": "91932d32", + "id": "2eb41e47", "metadata": {}, "source": [ - "Attempt to get the `q` column from the table and recieve the default value `not found` as that column does not exist." + "Get all rows of the table where the `y` column is equal to `AAPL`." 
] }, { "cell_type": "code", "execution_count": null, - "id": "7d2a2bcf", - "metadata": {}, + "id": "69e14007", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "tab.get('q', 'not found')" - ] - }, - { - "cell_type": "markdown", - "id": "9e831e14", - "metadata": {}, - "source": [ - "### Table.at[]\n", - "\n", - "```\n", - "Table.at[row, col]\n", - "```\n", - "\n", - "Access a single value for a row / column pair.\n", - "\n", - "Similar to `loc[]`, in that both provide label-based lookups. Use `at` if you only need to get or set a single value.\n", - "\n", - "The `at` property can be used for both assignment and retrieval of values at a given row and column." + "tab.iloc[tab['y'] == 'AAPL']" ] }, { "cell_type": "markdown", - "id": "97519657", + "id": "7861f193", "metadata": {}, "source": [ - "**Examples:**\n", - "\n", - "Get the value of the `z` column in the 997th row." + "Get all rows of the table where the `y` column is equal to `AAPL`, and only return the `y`, `z` and `w` columns." ] }, { "cell_type": "code", "execution_count": null, - "id": "9cd275bf", + "id": "323cc0f8", "metadata": {}, "outputs": [], "source": [ - "tab.at[997, 'z']" + "tab.iloc[tab['y'] == 'AAPL', ['y', 'z', 'w']]" ] }, { "cell_type": "markdown", - "id": "1fd39083", + "id": "9de566f3", "metadata": {}, "source": [ - "Reassign the value of the `z` column in the 997th row to `3.14159`." + "Replace all null values in the column `v` with the value `-100`." 
] }, { "cell_type": "code", "execution_count": null, - "id": "814fa8e0", + "id": "be66947d", "metadata": {}, "outputs": [], "source": [ - "tab.at[997, 'z'] = 3.14159\n", - "tab.at[997, 'z']" + "tab.iloc[tab['v'] == kx.q('0N'), 'v'] = -100\n", + "tab" ] }, { "cell_type": "markdown", - "id": "7815e8c3", + "id": "ed37aa73", "metadata": {}, "source": [ "### Table.loc[]\n", @@ -1096,12 +847,12 @@ "\n", "| Type | Description |\n", "| :---: | :----------------------------------------------------- |\n", - "| Table | A table contatining only the columns / rows requested. |" + "| Table | A table containing only the columns / rows requested. |" ] }, { "cell_type": "markdown", - "id": "5ee06186", + "id": "c68e21f1", "metadata": {}, "source": [ "**Examples:**\n", @@ -1112,7 +863,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12fc6807", + "id": "e46092cc", "metadata": { "scrolled": true }, @@ -1123,7 +874,7 @@ }, { "cell_type": "markdown", - "id": "97206dd7", + "id": "9e136f10", "metadata": {}, "source": [ "Get all rows of the table where the value in the `z` column is greater than `250.0`" @@ -1132,7 +883,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6c9add0", + "id": "52d2f0fe", "metadata": {}, "outputs": [], "source": [ @@ -1141,7 +892,7 @@ }, { "cell_type": "markdown", - "id": "a32aca6b", + "id": "52c058a6", "metadata": {}, "source": [ "Replace all null values in the column `v` with the value `-100`." @@ -1150,7 +901,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c1ad3a23", + "id": "960f1933", "metadata": { "scrolled": true }, @@ -1162,7 +913,7 @@ }, { "cell_type": "markdown", - "id": "447b9fd2", + "id": "9b262eca", "metadata": {}, "source": [ "Replace all locations in column `v` where the value is `-100` with a null." 
@@ -1171,7 +922,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31ea02c9", + "id": "f4c974c7", "metadata": {}, "outputs": [], "source": [ @@ -1181,7 +932,7 @@ }, { "cell_type": "markdown", - "id": "ac4c5e4b", + "id": "ddc94e12", "metadata": {}, "source": [ "Usage of the `loc` functionality under the hood additionally allows users to set columns within a table for single or multiple columns. Data passed for this can be q/Python." @@ -1190,7 +941,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f378ba4a", + "id": "f9d06838", "metadata": {}, "outputs": [], "source": [ @@ -1200,7 +951,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f2936b9", + "id": "1505d9bb", "metadata": {}, "outputs": [], "source": [ @@ -1209,2120 +960,2688 @@ }, { "cell_type": "markdown", - "id": "a3368987", + "id": "05765a04", "metadata": {}, "source": [ - "### Table.iloc[]\n", + "### Table.sample()\n", "\n", "```\n", - "Table.iloc[:, :]\n", + "Table.sample(n, frac, replace, weights, random_state, axis, ignore_index)\n", "```\n", "\n", - "Purely integer-location based indexing for selection by position.\n", + "Sample random data from the table.\n", "\n", - "`iloc` is primarily integer position based (from 0 to length-1 of the axis), but may also be used with a BooleanVector.\n", + "**Parameters:**\n", "\n", - "Allowed inputs are:\n", - "- An integer, e.g. 5.\n", - "- A list or array of integers, e.g. [4, 3, 0].\n", - "- A slice object with ints, e.g. 1:7.\n", - "- A BooleanVector.\n", - "- A callable function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). This is useful in method chains, when you don’t have a reference to the calling object, but would like to base your selection on some value.\n", - "- A tuple of row and column indexes. The tuple elements consist of one of the above inputs, e.g. 
(0, 1).\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :---: | :----------------------------------------------------------------- | :-----: |\n", + "| n | int | Number of rows to return. Cannot be used with `frac`. Default is 1 if `frac` is None. | None |\n", + "| frac | float | Fraction of the rows to return. Cannot be used with `n`. | None |\n", + "| replace | bool | Whether or not it should be possible to sample the same row twice. | False |\n", + "| weights | None | Not yet implemented. | None |\n", + "| random_state | None | Not yet implemented. | None |\n", + "| axis | None | Not yet implemented. | None |\n", + "| ignore_index | bool | Not yet implemented. | False |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------- |\n", - "| Table | A table contatining only the columns / rows requested. |" - ] - }, - { - "cell_type": "markdown", - "id": "0ef4d8cf", - "metadata": {}, - "source": [ - "**Examples:**\n", - "\n", - "Get the second row from a table." + "| Type | Description |\n", + "| :---: | :----------------------------------------------------------------- |\n", + "| Table | A table containing the randomly sampled rows. |" ] }, { "cell_type": "code", "execution_count": null, - "id": "683ab48b", + "id": "8b4a10be", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "tab.iloc[1]" + "# The examples in this section will use this example table filled with random data\n", + "kx.q('N: 1000')\n", + "tab = kx.q('([] x: til N; y: N?`AAPL`GOOG`MSFT; z: N?500f; w: N?1000; v: N?(0N 0 50 100 200 250))')\n", + "tab.head()" ] }, { "cell_type": "markdown", - "id": "e71bebdb", + "id": "970c8ea4", "metadata": {}, "source": [ - "Get the first 5 rows from a table." + "**Examples:**\n", + "\n", + "Sample 10 Rows."
] }, { "cell_type": "code", "execution_count": null, - "id": "a13730fd", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "tab.iloc[:5]" - ] - }, - { - "cell_type": "markdown", - "id": "60f892e0", + "id": "9dde77b1", "metadata": {}, - "source": [ - "Get all rows of the table where the `y` column is equal to `AAPL`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7afdf65", - "metadata": { - "scrolled": true - }, "outputs": [], "source": [ - "tab.iloc[tab['y'] == 'AAPL']" + "tab.sample(n=10)" ] }, { "cell_type": "markdown", - "id": "8b3b9279", + "id": "1d14afe9", "metadata": {}, "source": [ - "Get all rows of the table where the `y` column is equal to `AAPL`, and only return the `y`, `z` and `w` columns." + "Sample 10% of the rows." ] }, { "cell_type": "code", "execution_count": null, - "id": "a0d9f08d", + "id": "32772c46", "metadata": {}, "outputs": [], "source": [ - "tab.iloc[tab['y'] == 'AAPL', ['y', 'z', 'w']]" + "tab.sample(frac=0.1)" ] }, { "cell_type": "markdown", - "id": "045bc156", + "id": "82a7a79d", "metadata": {}, "source": [ - "Replace all null values in the column `v` with the value `-100`." + "Sample 10% of the rows and allow the same row to be sampled twice." 
] }, { "cell_type": "code", "execution_count": null, - "id": "7e21c163", + "id": "4c96839b", "metadata": {}, "outputs": [], "source": [ - "tab.iloc[tab['v'] == kx.q('0N'), 'v'] = -100\n", - "tab" + "tab.sample(frac=0.1, replace=True)" ] }, { "cell_type": "markdown", - "id": "76021266", - "metadata": {}, + "id": "82b501a6", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ - "### Table.pop()\n", + "### Table.select_dtypes()\n", "\n", "```\n", - "Table.pop(item)\n", + "Table.select_dtypes(include=None, exclude=None)\n", "```\n", "\n", - "Remove a column or columns from a table by column name and return the column after it has been removed.\n", + "Return a subset of the DataFrame’s columns based on the column dtypes.\n", "\n", - "**Parameters:**\n", + "Allowed inputs for `include`/`exclude` are:\n", + "- A single dtype or string.\n", + "- A list of dtypes or strings.\n", + "- Inputs given for `include` and `exclude` cannot overlap.\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :--------------------: | :------------------------------------------------------ | :--------: |\n", - "| item | Union[str, list[str]] | The column name or list of names to pop from the table. | _required_ |\n", + "The dtype `kx.CharVector` will return an error. Use `kx.CharAtom` for a column of single chars.\n", + "Both `kx.*Atom` and `kx.*Vector` will be taken to mean a column containing a single item per row of type `*`. `kx.List` will include/exclude any columns containing mixed list data (including string columns).\n", + " \n", + "**Parameters:**\n", "\n", + "| Name | Type | Description | Default |\n", + "| :-----: | :--------------: | :----------------------------------------------: | :-----: |\n", + "| include | Union[List, str] | A selection of dtypes or strings to be included. | None |\n", + "| exclude | Union[List, str] | A selection of dtypes or strings to be excluded. 
| None |\n", + " \n", + "At least one of these parameters must be supplied.\n", + " \n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table contatining only the columns removed from the input table. |" + "| Type | Description |\n", + "| :-------: | :----------------------------------------------------------------------------------------------: |\n", + "| Dataframe | The subset of the frame including the dtypes in `include` and excluding the dtypes in `exclude`. |" ] }, { "cell_type": "markdown", - "id": "e5fdfbd3", + "id": "0570165c", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Remove the `v` column from the table and return it." + "The examples in the section will use the example table." ] }, { "cell_type": "code", "execution_count": null, - "id": "7a960191", - "metadata": { - "scrolled": true - }, + "id": "74ade8d1", + "metadata": {}, "outputs": [], "source": [ - "display(tab.head())\n", - "print('\\n\\nPop the `v` column out of the table')\n", - "display(tab.pop(\"v\"))\n", - "print('\\n\\nUpdated Table')\n", - "display(tab.head())" + "df = kx.q('([] c1:`a`b`c; c2:1 2 3h; c3:1 2 3j; c4:1 2 3i)')" ] }, { "cell_type": "markdown", - "id": "35062560", + "id": "b889d7c7", "metadata": {}, "source": [ - "Remove the `z` and `w` columns from the table and return them." 
+ "Exclude columns containing symbols" ] }, { "cell_type": "code", "execution_count": null, - "id": "a46189b2", - "metadata": { - "scrolled": false - }, + "id": "e8a792da", + "metadata": {}, "outputs": [], "source": [ - "display(tab.head())\n", - "print('\\n\\nPop the `z` and `w` columns out of the table')\n", - "display(tab.pop([\"z\", \"w\"]).head())\n", - "print('\\n\\nUpdated Table')\n", - "display(tab.head())" + "df.select_dtypes(exclude = kx.SymbolVector)" ] }, { "cell_type": "markdown", - "id": "f71b6917", + "id": "c87f28c4", "metadata": {}, "source": [ - "## Reindexing" + "Include a list of column types" ] }, { "cell_type": "code", "execution_count": null, - "id": "a2b1a198", - "metadata": { - "scrolled": true - }, + "id": "ac2af334", + "metadata": {}, "outputs": [], "source": [ - "# The examples in this section will use this example table filled with random data\n", - "kx.q('N: 1000')\n", - "tab = kx.q('([] x: til N; y: N?`AAPL`GOOG`MSFT; z: N?500f; w: N?1000; v: N?(0N 0 50 100 200 250))')\n", - "tab.head()" + "df.select_dtypes(include = [kx.ShortVector, kx.LongVector])" ] }, { "cell_type": "markdown", - "id": "f5a7ac0e", + "id": "ede98735", "metadata": {}, "source": [ - "### Table.drop()\n", + "### Table.tail()\n", "\n", "```\n", - "Table.drop(item, axis=0)\n", + "Table.tail(n=5)\n", "```\n", "\n", - "Remove either columns or rows from a table and return the resulting Table object.\n", + "Get the last n rows from a table.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :--------------------: | :---------------------------------------------------------- | :--------: |\n", - "| item | Union[str, list[str]] | The column name(s) or row number(s) to drop from the table. | _required_ |\n", - "| axis | int | The column name or list of names to pop from the table. 
| 0 |\n", + "| Name | Type | Description | Default |\n", + "| :--: | :--: | :---------------------------- | :-----: |\n", + "| n | int | The number of rows to return. | 5 |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table with the given column(s) / row(s) removed. |" + "| Type | Description |\n", + "| :---: | :------------------------------- |\n", + "| Table | The last `n` rows of the table. |" ] }, { "cell_type": "markdown", - "id": "008a2e74", + "id": "a7b6bd44", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Drop rows from a table." + "Return the last 5 rows of the table." ] }, { "cell_type": "code", "execution_count": null, - "id": "0f74d3f2", + "id": "d1f5f644", "metadata": {}, "outputs": [], "source": [ - "tab.drop([0, 2, 4, 6, 8, 10]).head()" + "tab.tail()" ] }, { "cell_type": "markdown", - "id": "cb4e82aa", + "id": "181a4d86", "metadata": {}, "source": [ - "Drop columns from a table." + "Return the last 10 rows of the table." 
] }, { "cell_type": "code", "execution_count": null, - "id": "57ad6a64", + "id": "c8a0bb7b", "metadata": {}, "outputs": [], "source": [ - "tab.drop('y', axis=1).head()" + "tab.tail(10)" ] }, { "cell_type": "markdown", - "id": "90db87b0", + "id": "29b0e773", "metadata": {}, "source": [ - "### Table.drop_duplicates()\n", + "## Data Joins/Merging" + ] + }, + { + "cell_type": "markdown", + "id": "666a7621", + "metadata": {}, + "source": [ + "### Table.merge()\n", "\n", "```\n", - "Table.drop_duplicates()\n", + "Table.merge(\n", + " right,\n", + " how='inner',\n", + " on=None,\n", + " left_on=None,\n", + " right_on=None,\n", + " left_index=False,\n", + " right_index=False,\n", + " sort=False,\n", + " suffixes=('_x', '_y'),\n", + " copy=True,\n", + " validate=None,\n", + " q_join=False\n", + ")\n", "```\n", "\n", - "Remove either columns or rows from a table and return the resulting Table object.\n", + "Merge Table or KeyedTable objects with a database-style join.\n", + "\n", + "The join is done on columns or keys. If joining columns on columns, the Table key will be ignored. Otherwise if joining keys on keys or keys on a column or columns, the index will be passed on. When performing a cross merge, no column specifications to merge on are allowed.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :---------: | :--: | :-------------------------------------------------------------------------------- | :-----: |\n", + "| right | Union[Table/KeyedTable] | The object to merge with. | _required_ |\n", + "| how | str | The type of join to be used. One of {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}. | ‘inner’ |\n", + "| on | str | The column name to join on. | None |\n", + "| left_on | str | The column name in the left table to join on. | None |\n", + "| right_on | str | The column name in the right table to join on. | None |\n", + "| left_index | bool | Use the index of the left Table. 
| False |\n", + "| right_index | bool | Use the index of the right Table. | False |\n", + "| sort | bool | Sort the join keys of the resulting table. | False |\n", + "| suffixes | Tuple(str, str) | The suffixes to append to overlapping column names in the left and right tables respectively. | ('\_x', '\_y') |\n", + "| copy | bool | If False avoid copies and modify the input table. | True |\n", + "| validate | str | If specified checks if merge matches specified type.
- “one_to_one” or “1:1”: check if merge keys are unique in both left and right datasets.
- “one_to_many” or “1:m”: check if merge keys are unique in left dataset.
- “many_to_one” or “m:1”: check if merge keys are unique in right dataset.
- “many_to_many” or “m:m”: allowed, but does not result in checks.
| None |\n", + "| q_join | bool | If True perform native q joins instead of the pandas SQL like joins. More documentation around these joins can be found [here.](https://code.kx.com/q/basics/joins/) | False |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :--------------------------------------- |\n", - "| Table | A table with all duplicate rows removed. |" + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------ |\n", + "| Table / KeyedTable | The resulting table-like object after the join has been performed. |" ] }, { "cell_type": "markdown", - "id": "3af33f03", + "id": "61d1567a", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Create a table with duplicates for the example" + "Merge tab1 and tab2 on the lkey and rkey columns. The value columns have the default suffixes, \_x and \_y, appended." ] }, { "cell_type": "code", "execution_count": null, - "id": "af182307", - "metadata": {}, + "id": "8a9acd51", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "tab2 = kx.q('([] 100?`AAPL`GOOG`MSFT; 100?3)')\n", - "tab2" + "tab1 = kx.Table(data={'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]})\n", + "tab2 = kx.Table(data={'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]})\n", + "tab1.merge(tab2, left_on='lkey', right_on='rkey')" ] }, { "cell_type": "markdown", - "id": "48143d51", + "id": "e004bf64", "metadata": {}, "source": [ - "Drop all duplicate rows from the table." + "Merge tab1 and tab2 on the lkey and rkey columns using a native q inner join. The value columns have the default suffixes, \_x and \_y, appended."
] }, { "cell_type": "code", "execution_count": null, - "id": "eeff16e7", + "id": "07df7437", "metadata": {}, "outputs": [], "source": [ - "tab2.drop_duplicates()" - ] - }, - { - "cell_type": "markdown", - "id": "6d71c8c0", - "metadata": {}, - "source": [ - "### Table.rename()\n", - "\n", - "```\n", - "Table.rename(columns)\n", - "```\n", - "\n", - "Rename columns in a table and return the resulting Table object.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :-------------: | :------------------------------------------------------------------ | :--------: |\n", - "| item | dict[str, str] | A dictonary of column name to new column name to use when renaming. | _required_ |\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table with the given column(s) renamed. |" + "tab1.merge(tab2, left_on='lkey', right_on='rkey', q_join=True)" ] }, { "cell_type": "markdown", - "id": "73260da1", + "id": "7350d9db", "metadata": {}, "source": [ - "**Examples:**\n", - "\n", - "The inital table we will be renaming columns on." + "Merge tab1 and tab2 with specified left and right suffixes appended to any overlapping columns." ] }, { "cell_type": "code", "execution_count": null, - "id": "3cc68fa6", + "id": "23685dcb", "metadata": {}, "outputs": [], "source": [ - "tab.head()" + "tab1.merge(tab2, left_on='lkey', right_on='rkey', suffixes=('_left', '_right'))" ] }, { "cell_type": "markdown", - "id": "eef94948", + "id": "3b2c65d4", "metadata": {}, "source": [ - "Rename column `y` to `symbol` and `z` to `price`." + "Merge tab1 and tab2 but raise an exception if the Tables have any overlapping columns." 
] }, { "cell_type": "code", "execution_count": null, - "id": "d5e76248", - "metadata": {}, + "id": "b5d16312", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "tab.rename(columns={'y': 'symbol', 'z': 'price'}).head()" + "try:\n", + " tab1.merge(tab2, left_on='lkey', right_on='rkey', suffixes=(False, False))\n", + "except BaseException as e:\n", + " print(f'Caught Error: {e}')" ] }, { - "cell_type": "markdown", - "id": "05124590", + "cell_type": "code", + "execution_count": null, + "id": "793df3f3", "metadata": {}, + "outputs": [], "source": [ - "### Table.sample()\n", - "\n", - "```\n", - "Table.sample(n, frac, replace, weights, random_state, axis, ignore_index)\n", - "```\n", - "\n", - "Sample random data from the table.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :---: | :----------------------------------------------------------------- | :-----: |\n", - "| n | int | Number of rows to return. Cannot be used with `frac`. Default is 1 if `frac` is None. | None |\n", - "| frac | float | Fraction of the rows to return. Cannot be used with `n`. | None |\n", - "| replace | bool | Whether or not it should be possible to sample the same row twice. | False |\n", - "| weights | None | Not yet implemented. | None |\n", - "| random_state | None | Not yet implemented. | None |\n", - "| axis | None | Not yet implemented. | None |\n", - "| ignore_index | bool | Not yet implemented. | False |\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table with the given column(s) renamed. |" + "tab1 = kx.Table(data={'a': ['foo', 'bar'], 'b': [1, 2]})\n", + "tab2 = kx.Table(data={'a': ['foo', 'baz'], 'c': [3, 4]})" ] }, { "cell_type": "markdown", - "id": "e8f78917", + "id": "d58a52a3", "metadata": {}, "source": [ - "**Examples:**\n", - "\n", - "Sample 10 Rows." 
+ "Merge tab1 and tab2 on the `a` column using an inner join." ] }, { "cell_type": "code", "execution_count": null, - "id": "d88ab348", - "metadata": {}, + "id": "1180e6f4", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "tab.sample(n=10)" + "tab1.merge(tab2, how='inner', on='a')" ] }, { "cell_type": "markdown", - "id": "78e03554", + "id": "b14e36da", "metadata": {}, "source": [ - "Sample 10% of the rows." + "Merge tab1 and tab2 on the `a` column using a left join." ] }, { "cell_type": "code", "execution_count": null, - "id": "8585d62e", + "id": "4b0098da", "metadata": {}, "outputs": [], "source": [ - "tab.sample(frac=0.1)" + "tab1.merge(tab2, how='left', on='a')" ] }, { "cell_type": "markdown", - "id": "c77712d3", + "id": "00d0ad6a", "metadata": {}, "source": [ - "Sample 10% of the rows and allow the same row to be sampled twice." + "Merge tab1 and tab2 using a cross join." ] }, { "cell_type": "code", "execution_count": null, - "id": "b138f770", - "metadata": {}, + "id": "b55be868", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "tab.sample(frac=0.1, replace=True)" + "tab1 = kx.Table(data={'left': ['foo', 'bar']})\n", + "tab2 = kx.Table(data={'right': [7, 8]})\n", + "tab1.merge(tab2, how='cross')" ] }, { "cell_type": "markdown", - "id": "6f6f5672", + "id": "7583c015", "metadata": {}, "source": [ - "### Table.select_dtypes()\n", + "### Table.merge_asof()\n", "\n", "```\n", - "Table.select_dtypes(include=None, exclude=None)\n", + "Table.merge_asof(\n", + " right,\n", + " on=None,\n", + " left_on=None,\n", + " right_on=None,\n", + " left_index=False,\n", + " right_index=False,\n", + " by=None,\n", + " left_by=None,\n", + " right_by=None,\n", + " suffixes=('_x', '_y'),\n", + " tolerance=None,\n", + " allow_exact_matches=True,\n", + " direction='backward'\n", + "\n", + ")\n", "```\n", "\n", - "Return a subset of the DataFrame’s columns based on the column dtypes.\n", - " \n", + "Merge Table or KeyedTable objects with a 
database-style join.\n", + "\n", + "The join is done on columns or keys. If joining columns on columns, the Table key will be ignored. Otherwise if joining keys on keys or keys on a column or columns, the index will be passed on. When performing a cross merge, no column specifications to merge on are allowed.\n", + "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :--------------: | :----------------------------------------------: | :-----: |\n", - "| include | Union[List, str] | A selection of dtypes or strings to be included. | None |\n", - "| exclude | Union[List, str] | A selection of dtypes or strings to be excluded. | None |\n", - " \n", - "At least one of these parameters must be supplied.\n", - " \n", + "| Name | Type | Description | Default |\n", + "| :---------: | :--: | :-------------------------------------------------------------------------------- | :-----: |\n", + "| right | Union[Table/KeyedTable] | The object to merge with. | _required_ |\n", + "| how | str | The type of join to be used. One of {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}. | ‘inner’ |\n", + "| on | str | The column name to join on. | None |\n", + "| left_on | str | The column name in the left table to join on. | None |\n", + "| right_on | str | The column name in the right table to join on. | None |\n", + "| left_index | bool | Use the index of the left Table. | False |\n", + "| right_index | bool | Use the index of the right Table. | False |\n", + "| by | str | Not yet implemented. | None |\n", + "| left_by | str | Field names to match on in the left table. | None |\n", + "| right_by | str | Field names to match on in the right table. | None |\n", + "| suffixes | Tuple(str, str) | The number of rows to return. | ('\\_x', '\\_y') |\n", + "| tolerance | Any | Not yet implemented. | None |\n", + "| allow_exact_matches | bool | Not yet implemented. | True |\n", + "| direction | str | Not yet implemented. 
| 'backward' |\n", + "\n", + "\n", + "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :-------: | :----------------------------------------------------------------------------------------------: |\n", - "| Dataframe | The subset of the frame including the dtypes in `include` and excluding the dtypes in `exclude`. |" + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------ |\n", + "| Table / KeyedTable | The resulting table-like object after the join has been performed. |" ] }, { "cell_type": "markdown", - "id": "6a703c57", + "id": "908499df", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "The examples in the section will use the example table." + "Perform a simple asof join on two tables." ] }, { "cell_type": "code", "execution_count": null, - "id": "5e9734f7", + "id": "e660e496", "metadata": {}, "outputs": [], "source": [ - "df = kx.q('([] c1:`a`b`c; c2:1 2 3h; c3:1 2 3j; c4:1 2 3i)')" + "left = kx.Table(data={\"a\": [1, 5, 10], \"left_val\": [\"a\", \"b\", \"c\"]})\n", + "right = kx.Table(data={\"a\": [1, 2, 3, 6, 7], \"right_val\": [1, 2, 3, 6, 7]})\n", + "left" ] }, { - "cell_type": "markdown", - "id": "42d9ffa6", + "cell_type": "code", + "execution_count": null, + "id": "e456e4ad", "metadata": {}, + "outputs": [], "source": [ - "Exclude columns contatining symbols" + "right" ] }, { "cell_type": "code", "execution_count": null, - "id": "3d934cf0", + "id": "d4616f6d", "metadata": {}, "outputs": [], "source": [ - "df.select_dtypes(exclude = kx.SymbolVector)" + "left.merge_asof(right)" ] }, { "cell_type": "markdown", - "id": "e4302f7d", + "id": "496d5a72", "metadata": {}, "source": [ - "Include a list of column types" + "Perform an asof join on two tables but first merge them on the by column."
] }, { "cell_type": "code", "execution_count": null, - "id": "f698f5f0", + "id": "3f0fcc13", "metadata": {}, "outputs": [], "source": [ - "df.select_dtypes(include = [kx.ShortVector, kx.LongVector])" - ] - }, - { - "cell_type": "markdown", - "id": "5590d1ca", - "metadata": {}, - "source": [ - "### Table.astype()\n", - "\n", - "```\n", - "Table.astype(dtype, copy=True, errors='raise')\n", - "```\n", - "\n", - "Cast a column/columns of the Dataframes object to a specified `dtype`.\n", - "\n", - "**Parameters:**\n", - "\n", - "| Name | Type | Description | Default |\n", - "| :-: | :-: | :-: | :-: |\n", - "| dtype | data type, or dict of column name -> data type | Use a PyKx wrapper data type or Python type to cast all columns to the same type. Alternatively, use {col: dtype, …}, where col is a column label and dtype is PyKx wrapper data type to cast one or more of the DataFrame’s columns to column-specific types. | |\n", - "| copy | Boolean | Default of True, False not implemented | True |\n", - "| errors | {‘raise’, ‘ignore’} | If passed anything other than 'raise', it will return the dataframe | 'raise' |\n", - "\n", - "\n", - "**Returns:**\n", - "\n", - "| Type | Description |\n", - "| :-: | :-: |\n", - "| Dataframe | The dataframe with columns casted according to passed dtypes |" + "trades = kx.Table(data={\n", + " \"time\": [\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.030\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.041\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.049\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.072\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.075\")\n", + " ],\n", + " \"ticker\": [\n", + " \"GOOG\",\n", + " \"MSFT\",\n", + " \"MSFT\",\n", + " \"MSFT\",\n", + " \"GOOG\",\n", + " \"AAPL\",\n", + " \"GOOG\",\n", + " \"MSFT\"\n", + " ],\n", + " \"bid\": [720.50, 51.95, 51.97, 51.99, 720.50, 
97.99, 720.50, 52.01],\n", + " \"ask\": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]\n", + "})\n", + "quotes = kx.Table(data={\n", + " \"time\": [\n", + " pd.Timestamp(\"2016-05-25 13:30:00.023\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.038\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\"),\n", + " pd.Timestamp(\"2016-05-25 13:30:00.048\")\n", + " ],\n", + " \"ticker\": [\"MSFT\", \"MSFT\", \"GOOG\", \"GOOG\", \"AAPL\"],\n", + " \"price\": [51.95, 51.95, 720.77, 720.92, 98.0],\n", + " \"quantity\": [75, 155, 100, 100, 100]\n", + "})\n", + "trades" ] }, { - "cell_type": "markdown", - "id": "f9ca98d2", + "cell_type": "code", + "execution_count": null, + "id": "b7259913", "metadata": {}, + "outputs": [], "source": [ - "**Examples:**\n", - "\n", - "The examples in the section will use the example table." + "quotes" ] }, { "cell_type": "code", "execution_count": null, - "id": "831836c8", + "id": "32e41b85", "metadata": {}, "outputs": [], "source": [ - "df = kx.q('([] c1:1 2 3i; c2:1 2 3j; c3:1 2 3h; c4:1 2 3i)')" + "trades.merge_asof(quotes, on=\"time\")" ] }, { "cell_type": "markdown", - "id": "0bf0d78f", + "id": "04e022a9", "metadata": {}, "source": [ - "Cast all columns to dtype LongVector" + "## Analytic functionality" ] }, { "cell_type": "code", "execution_count": null, - "id": "6833400a", + "id": "c167fdc9", "metadata": {}, "outputs": [], "source": [ - "df.astype(kx.LongVector)" + "# All the examples in this section will use this example table.\n", + "kx.q('N: 100')\n", + "tab = kx.q('([] sym: N?`AAPL`GOOG`MSFT; price: 250f - N?500f; traded: 100 - N?200; hold: N?0b)')\n", + "tab" ] }, { "cell_type": "markdown", - "id": "7a2bfcd3", + "id": "be074715", "metadata": {}, "source": [ - "Casting as specified in the dcitionary supplied with given dtype per column" + "### Table.abs()\n", + "\n", + "```\n", + "Table.abs(numeric_only=False)\n", + "```\n", + "\n", + "Take the absolute value of each 
element in the table. This will raise an error if there are columns that contain data that have no absolute value.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| numeric_only | bool | Only use columns of the table that can be converted to an absolute value. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table / KeyedTable | The resulting table like object with only positive numerical values. |" ] }, { "cell_type": "code", "execution_count": null, - "id": "872db9aa", - "metadata": {}, + "id": "52f27400", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "df.astype({'c1':kx.LongVector, 'c2':'kx.ShortVector'})" + "tab.abs(numeric_only=True)" ] }, { "cell_type": "markdown", - "id": "ef3b4225", + "id": "85d42035", "metadata": {}, "source": [ - "The next example will use this table" + "### Table.count()\n", + "\n", + "```\n", + "Table.count(axis=0, numeric_only=False)\n", + "```\n", + "\n", + "Returns the count of non null values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to count elements across 1 is columns, 0 is rows. | 0 |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. 
| False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `count` on that column / row. |" ] }, { "cell_type": "code", "execution_count": null, - "id": "6a20abdd", + "id": "a53125cb", "metadata": {}, "outputs": [], "source": [ - "df = kx.q('([] c1:3#.z.p; c2:`abc`def`ghi; c3:1 2 3j; c4:(\"abc\";\"def\";\"ghi\");c5:\"abc\";c6:(1 2 3;4 5 6;7 8 9))')" + "tab.count()" ] }, { "cell_type": "markdown", - "id": "908fa4ea", + "id": "77a5a83f", "metadata": {}, "source": [ - "Casting char and string columns to symbol columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ea7fe9e", + "### Table.max()\n", + "\n", + "```\n", + "Table.max(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Returns the maximum value across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the maximum across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `max` on that column / row. 
|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aea50f5", "metadata": {}, "outputs": [], "source": [ - "df.astype({'c4':kx.SymbolVector, 'c5':kx.SymbolVector})" + "tab.max()" ] }, { "cell_type": "markdown", - "id": "0f8813a0", + "id": "71dab7ac", "metadata": {}, "source": [ - "### Table.add_prefix()\n", + "### Table.min()\n", "\n", "```\n", - "Table.add_prefix(columns)\n", + "Table.min(axis=0, skipna=True, numeric_only=False)\n", "```\n", "\n", - "Rename columns adding a prefix in a table and return the resulting Table object.\n", + "Returns the minimum value across the given axis.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :-------------: | :------------------------------------------------------------------ | :--------: |\n", - "| prefix | str | The string that will be concatenated with the name of the columns | _required_ |\n", - "| axis | int | Axis to add prefix on. | 0 |\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the minimum across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table with the given column(s) renamed adding a prefix. 
|" - ] - }, - { - "cell_type": "markdown", - "id": "9186ed86", - "metadata": {}, - "source": [ - "**Examples:**\n", - "\n", - "The initial table to which a prefix will be added to its columns" + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `min` on that column / row. |" ] }, { "cell_type": "code", "execution_count": null, - "id": "5f20131b", + "id": "9f13e8a7", "metadata": {}, "outputs": [], "source": [ - "tab.head()" + "tab.min()" ] }, { "cell_type": "markdown", - "id": "73c2b08f", + "id": "1bf3da2a", "metadata": {}, "source": [ - "Add \"col_\" to table columns:" + "### Table.sum()\n", + "\n", + "```\n", + "Table.sum(axis=0, skipna=True, numeric_only=False, min_count=0)\n", + "```\n", + "\n", + "Returns the sum of all values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the sum across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| min_count | int | If not set to 0 if there are less then `min_count` values across the axis a null value will be returned | 0 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `sum` on that column / row. 
|" ] }, { "cell_type": "code", "execution_count": null, - "id": "926c8295", + "id": "09975a7a", "metadata": {}, "outputs": [], "source": [ - "tab.add_prefix(prefix=\"col_\").head()" + "tab.sum()" ] }, { "cell_type": "markdown", - "id": "0a4abc8c", + "id": "97920009", "metadata": {}, "source": [ - "### Table.add_suffix()\n", + "### Table.mean()\n", "\n", "```\n", - "Table.add_suffix(columns)\n", + "Table.mean(axis=0, numeric_only=False)\n", "```\n", "\n", - "Rename columns adding a suffix in a table and return the resulting Table object.\n", + "Get the mean of values across the requested axis.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :-----: | :-------------: | :------------------------------------------------------------------ | :--------: |\n", - "| suffix | str | The string that will be concatenated with the name of the columns | _required_ |\n", - "| axis | int | Axis to add suffix on. | 0 |\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :--------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate mean across 0 is columns, 1 is rows. | 0 |\n", + "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :---: | :----------------------------------------------------------------- |\n", - "| Table | A table with the given column(s) renamed adding a suffix. |" + "| Type | Description |\n", + "| :--------: | :--------------------------------------------------------------------------------------------- |\n", + "| Dictionary | The mean across each row / column with the key corresponding to the row number or column name. 
|" ] }, { "cell_type": "markdown", - "id": "c22262b8", + "id": "dee2e8cc", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "The initial table to which a suffix will be added to its columns" + "Calculate the mean across the columns of a table" ] }, { "cell_type": "code", "execution_count": null, - "id": "55c1f504", + "id": "9d4c8a22", "metadata": {}, "outputs": [], "source": [ - "tab.head()" + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" ] }, { - "cell_type": "markdown", - "id": "b4687851", + "cell_type": "code", + "execution_count": null, + "id": "d02c4cfd", "metadata": {}, + "outputs": [], "source": [ - "Add \"_col\" to table columns:" + "tab.mean()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e00d0f5c", + "cell_type": "markdown", + "id": "c6feb4ea", "metadata": {}, - "outputs": [], "source": [ - "tab.add_suffix(suffix=\"_col\").head()" + "Calculate the mean across the rows of a table" ] }, { - "cell_type": "markdown", - "id": "718584f8", + "cell_type": "code", + "execution_count": null, + "id": "506a6867", "metadata": {}, + "outputs": [], "source": [ - "## Merging" + "tab.mean(axis=1)" ] }, { "cell_type": "markdown", - "id": "ef401426", + "id": "cd714c1b", "metadata": {}, "source": [ - "### Table.merge()\n", + "### Table.median()\n", "\n", "```\n", - "Table.merge(\n", - " right,\n", - " how='inner',\n", - " on=None,\n", - " left_on=None,\n", - " right_on=None,\n", - " left_index=False,\n", - " right_index=False,\n", - " sort=False,\n", - " suffixes=('_x', '_y'),\n", - " copy=True,\n", - " validate=None,\n", - " q_join=False\n", - ")\n", + "Table.median(axis=0, numeric_only=False)\n", "```\n", "\n", - "Merge Table or KeyedTable objects with a database-style join.\n", - "\n", - "The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes will be ignored. 
Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. When performing a cross merge, no column specifications to merge on are allowed.\n", + "Get the median of values across the requested axis.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :---------: | :--: | :-------------------------------------------------------------------------------- | :-----: |\n", - "| right | Union[Table/KeyedTable] | The object to merge with. | _required_ |\n", - "| how | str | The type of join to be used. One of {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}. | ‘inner’ |\n", - "| on | str | The column name to join on. | None |\n", - "| left_on | str | The column name in the left table to join on. | None |\n", - "| right_on | str | The column name in the right table to join on. | None |\n", - "| left_index | bool | Use the index of the left Table. | False |\n", - "| right_index | bool | Use the index of the right Table. | False |\n", - "| sort | bool | Sort the join keys of the resulting table. | False |\n", - "| suffixes | Tuple(str, str) | The number of rows to return. | ('\\_x', '\\_y') |\n", - "| copy | bool | If False avoid copies and modify the input table. | None |\n", - "| validate | str | If specified checks if merge matches specified type.
- “one_to_one” or “1:1”: check if merge keys are unique in both left and right datasets.
- “one_to_many” or “1:m”: check if merge keys are unique in left dataset.
- “many_to_one” or “m:1”: check if merge keys are unique in right dataset.
- “many_to_many” or “m:m”: allowed, but does not result in checks.
| None |\n", - "| q_join | bool | If True perform native q joins instead of the pandas SQL like joins. More documentation around these joins can be found [here.](https://code.kx.com/q/basics/joins/) | False |\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :----------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate median across 0 is columns, 1 is rows. | 0 |\n", + "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", "\n", "**Returns:**\n", "\n", - "| Type | Description |\n", - "| :----------------: | :------------------------------------------------------------------ |\n", - "| Table / KeyedTable | The resulting table like object after the join has been preformed. |" + "| Type | Description |\n", + "| :--------: | :----------------------------------------------------------------------------------------------- |\n", + "| Dictionary | The median across each row / column with the key corresponding to the row number or column name. |" ] }, { "cell_type": "markdown", - "id": "9e613e3c", + "id": "00d44518", "metadata": {}, "source": [ "**Examples:**\n", "\n", - "Merge tab1 and tab2 on the lkey and rkey columns. The value columns have the default suffixes, \\_x and \\_y, appended." + "Calculate the median across the columns of a table" ] }, { "cell_type": "code", "execution_count": null, - "id": "a3b0ec9f", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "tab1 = kx.Table(data={'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]})\n", - "tab2 = kx.Table(data={'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]})\n", - "tab1.merge(tab2, left_on='lkey', right_on='rkey')" - ] - }, - { - "cell_type": "markdown", - "id": "6e32596c", + "id": "df20ecfc", "metadata": {}, + "outputs": [], "source": [ - "Merge tab1 and tab2 on the lkey and rkey columns using a native q inner join. 
The value columns have the default suffixes, \\_x and \\_y, appended." + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ea253c9", + "id": "6e9dc5be", "metadata": {}, "outputs": [], "source": [ - "tab1.merge(tab2, left_on='lkey', right_on='rkey', q_join=True)" + "tab.median()" ] }, { "cell_type": "markdown", - "id": "2d9240b3", + "id": "585d9d01", "metadata": {}, "source": [ - "Merge tab1 and tab2 with specified left and right suffixes appended to any overlapping columns." + "Calculate the median across the rows of a table" ] }, { "cell_type": "code", "execution_count": null, - "id": "64425a1d", + "id": "6ccf50df", "metadata": {}, "outputs": [], "source": [ - "tab1.merge(tab2, left_on='lkey', right_on='rkey', suffixes=('_left', '_right'))" + "tab.median(axis=1)" ] }, { "cell_type": "markdown", - "id": "e749c7e0", + "id": "aeec2045", "metadata": {}, "source": [ - "Merge tab1 and tab2 but raise an exception if the Tables have any overlapping columns." + "### Table.mode()\n", + "\n", + "```\n", + "Table.mode(axis=0, numeric_only=False, dropna=True)\n", + "```\n", + "\n", + "Get the mode of values across the requested axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------ | :-----: |\n", + "| axis | int | The axis to calculate mode across 0 is columns, 1 is rows. | 0 |\n", + "| numeric_only | bool | Include only columns / rows with numeric data. | False |\n", + "| dropna | bool | Remove null values from the data before calculating the mode. 
| True |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :--------: | :------------------------------------------------------------------------------------------------ |\n", + "| Table | The mode across each row / column with the column corresponding to the row number or column name. |" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a938230d", - "metadata": { - "scrolled": true - }, - "outputs": [], + "cell_type": "markdown", + "id": "c52ffed8", + "metadata": {}, "source": [ - "try:\n", - " tab1.merge(tab2, left_on='lkey', right_on='rkey', suffixes=(False, False))\n", - "except BaseException as e:\n", - " print(f'Caught Error: {e}')" + "**Examples:**\n", + "\n", + "Calculate the mode across the columns of a table" ] }, { "cell_type": "code", "execution_count": null, - "id": "b1d99a31", + "id": "786fe3b6", "metadata": {}, "outputs": [], "source": [ - "tab1 = kx.Table(data={'a': ['foo', 'bar'], 'b': [1, 2]})\n", - "tab2 = kx.Table(data={'a': ['foo', 'baz'], 'c': [3, 4]})" - ] - }, - { - "cell_type": "markdown", - "id": "385c0465", - "metadata": {}, - "source": [ - "Merge tab1 and tab2 on the `a` column using an inner join." 
+ "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" ] }, { "cell_type": "code", "execution_count": null, - "id": "7431a148", + "id": "58909ffa", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "tab1.merge(tab2, how='inner', on='a')" + "tab.mode()" + ] + }, + { + "cell_type": "markdown", + "id": "7d437b70", + "metadata": {}, + "source": [ + "Calculate the median across the rows of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfa17533", + "metadata": {}, + "outputs": [], + "source": [ + "tab.mode(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "4c270df3", + "metadata": {}, + "source": [ + "Calculate the mode across columns and keep null values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80afc141", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tab = kx.Table(data=\n", + " {\n", + " 'x': [0, 1, 2, 3, 4, 5, 6, 7, np.NaN, np.NaN],\n", + " 'y': [10, 11, 12, 13, 14, 15, 16, 17, 18, np.NaN],\n", + " 'z': ['a', 'b', 'c', 'd', 'd', 'e', 'e', 'f', 'g', 'h']\n", + " }\n", + ")\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3300f5", + "metadata": {}, + "outputs": [], + "source": [ + "tab.mode(dropna=False)" + ] + }, + { + "cell_type": "markdown", + "id": "4117c73f", + "metadata": {}, + "source": [ + "### Table.prod()\n", + "\n", + "```\n", + "Table.prod(axis=0, skipna=True, numeric_only=False, min_count=0)\n", + "```\n", + "\n", + "Returns the product of all values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the product across 0 is columns, 1 is rows. 
| 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| min_count | int | If not set to 0 if there are less then `min_count` values across the axis a null value will be returned | 0 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represents the column name / row number and the values are the result of calling `prd` on that column / row. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6c64b75", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# This example will use a smaller version of the above table\n", + "# as the result of calculating the product quickly goes over the integer limits.\n", + "kx.q('N: 10')\n", + "tab = kx.q('([] sym: N?`AAPL`GOOG`MSFT; price: 2.5f - N?5f; traded: 10 - N?20; hold: N?0b)')\n", + "tab[tab['traded'] == 0, 'traded'] = 1\n", + "tab[tab['price'] == 0, 'price'] = 1.0\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "540297e2", + "metadata": {}, + "outputs": [], + "source": [ + "tab.prod(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "id": "c777923e", + "metadata": {}, + "source": [ + "### Table.skew()\n", + "\n", + "```\n", + "Table.skew(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Returns the skewness of all values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the skewness across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. 
| True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the key represent the column name / row number and the values are the result of calling `skew` on that column / row. |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc109f0f", + "metadata": {}, + "outputs": [], + "source": [ + "tab.skew(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "id": "22940e03", + "metadata": {}, + "source": [ + "### Table.std()\n", + "\n", + "```\n", + "Table.std(axis=0, skipna=True, numeric_only=False, ddof=0)\n", + "```\n", + "\n", + "Return sample standard deviation over requested axis. Normalized by N-1 by default. This can be changed using the ddof argument.\n", + "\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the sum across 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | not yet implemented | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | The std across each row / column with the key corresponding to the row number or column name. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "292f9c39", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the std across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2df159e", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63d45751", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std()" + ] + }, + { + "cell_type": "markdown", + "id": "2e9705de", + "metadata": {}, + "source": [ + "Calculate the std across the rows of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8edf71a4", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "1ef61cd5", + "metadata": {}, + "source": [ + "Calculate std accross columns with ddof=0:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f66fe87", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std(ddof=0)" + ] + }, + { + "cell_type": "markdown", + "id": "c80d90ae", + "metadata": {}, + "source": [ + "## Group By" + ] + }, + { + "cell_type": "markdown", + "id": "2e1d05d5", + "metadata": {}, + "source": [ + "### Table.groupby()\n", + "\n", + "```\n", + "Table.groupby(\n", + " by=None,\n", + " axis=0,\n", + " level=None,\n", + " as_index=True,\n", + " sort=True,\n", + " group_keys=True,\n", + " observed=False,\n", + " dropna=True\n", + ")\n", + "```\n", + "\n", + "Group data based on like values within columns to easily apply operations on groups.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :--------------: | :--: | :-------------------------------------------------------------------------- | :------: |\n", + "| by | 
Union[Symbol/SymbolVector/int/list] | The column name(s) or column index(es) to group the data on. | None |\n", + "| axis | int | Not Yet Implemented. | 0 |\n", + "| level | Union[Symbol/SymbolVector/int/list] | The column name(s) or column index(es) to group the data on. | None | \n", + "| as_index | bool | Return the table with groups as the key column. | True |\n", + "| sort | bool | Sort the resulting table based off the key. | True |\n", + "| group_keys | bool | Not Yet Implemented. | True | \n", + "| observed | bool | Not Yet Implemented. | False |\n", + "| dropna | bool | Drop groups where the group is null. | True | \n", + "\n", + "Either `by` or `level` can be used to specify the columns to group on, using both will raise an error.\n", + "\n", + "Using an integer or list of integers is only possible when calling `groupby` on a `KeyedTable` object.\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------: | :---------------------------------------------- |\n", + "| GroupbyTable | The resulting table after the grouping is done. |\n", + "\n", + "**Examples:**\n", + "\n", + "Example Table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0454f7d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tab = kx.Table(data={\n", + " 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],\n", + " 'Max Speed': [380., 370., 24., 26.],\n", + " 'Max Altitude': [570., 555., 275., 300.]\n", + "})\n", + "\n", + "tab" ] }, { "cell_type": "markdown", "id": "55b6b4e0", "metadata": {}, "source": [ "Group on the `Animal` column and calculate the mean of the resulting `Max Speed` and `Max Altitude` columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30c55810", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tab.groupby(kx.SymbolVector(['Animal'])).mean()" + ] + }, + { + "cell_type": "markdown", + "id": "0e62a99f", + "metadata": {}, + "source": [ + "Example table with multiple columns to group on." ] }, { "cell_type": "code", "execution_count": null, - "id": "04b96b08", + "id": "0ceddbbf", "metadata": {}, "outputs": [], "source": [ - "tab1.merge(tab2, how='left', on='a')" + "tab = kx.q('2!', kx.Table(\n", + " data={\n", + " 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'],\n", + " 'Type': ['Captive', 'Wild', 'Captive', 'Wild', 'Wild'],\n", + " 'Max Speed': [390., 350., 30., 20., 25.]\n", + " }\n", + "))\n", + "tab" ] }, { "cell_type": "markdown", - "id": "d991656c", + "id": "7e43e1bc", "metadata": {}, "source": [ - "Merge tab1 and tab2 using a cross join." + "Group on multiple columns using thier indexes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c281e305", + "metadata": {}, + "outputs": [], + "source": [ + "tab.groupby(level=[0, 1]).mean()" + ] + }, + { + "cell_type": "markdown", + "id": "e5d04220", + "metadata": {}, + "source": [ + "Example table with Nulls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae67684c", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(\n", + " [\n", + " [\"a\", 12, 12],\n", + " [kx.q('`'), 12.3, 33.],\n", + " [\"b\", 12.3, 123],\n", + " [\"a\", 1, 1]\n", + " ],\n", + " columns=[\"a\", \"b\", \"c\"]\n", + ")\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "512021d7", + "metadata": {}, + "source": [ + "Group on column `a` and keep null groups." 
] }, { "cell_type": "code", "execution_count": null, - "id": "09886503", + "id": "a09a6d3a", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "tab1 = kx.Table(data={'left': ['foo', 'bar']})\n", - "tab2 = kx.Table(data={'right': [7, 8]})\n", - "tab1.merge(tab2, how='cross')" + "tab.groupby('a', dropna=False).sum()" ] }, { "cell_type": "markdown", - "id": "b2f4aff1", + "id": "4ca2006b", "metadata": {}, "source": [ - "### Table.merge_asof()\n", + "Group on column `a` keeping null groups and not using the groups as an index column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caa2576e", + "metadata": {}, + "outputs": [], + "source": [ + "tab.groupby('a', dropna=False, as_index=False).sum()" + ] + }, + { + "cell_type": "markdown", + "id": "660b3c92", + "metadata": {}, + "source": [ + "## Apply\n", + "\n", + "### Table.apply()\n", "\n", "```\n", - "Table.merge_asof(\n", - " right,\n", - " on=None,\n", - " left_on=None,\n", - " right_on=None,\n", - " left_index=False,\n", - " right_index=False,\n", - " by=None,\n", - " left_by=None,\n", - " right_by=None,\n", - " suffixes=('_x', '_y'),\n", - " tolerance=None,\n", - " allow_exact_matches=True,\n", - " direction='backward'\n", + "Table.apply(\n", + " func,\n", + " *args,\n", + " axis=0,\n", + " raw=None,\n", + " result_type=None,\n", + " **kwargs\n", + ")\n", + "```\n", + "\n", + "Apply a function along an axis of the DataFrame.\n", + "\n", + "Objects passed to a function are passed as kx list objects.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n", + "| func | function | Function to apply to each column or row. | |\n", + "| `*args` | any | Positional arguments to pass to `func` in addition to the kx list. 
| |\n", + "| axis | int | The axis along which the function is applied, `0` applies function to each column, `1` applied function to each row. | 0 | \n", + "| raw | bool | Not yet implemented. | None |\n", + "| result_type | str | Not yet implemented. | None |\n", + "| `**kwargs` | dict | Additional keyword arguments to pass as keywords to `func`, this argument is not implemented in the case `func` is a kx callable function. | None | \n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :-----------------------: | :---------------------------------------------- |\n", + "| List, Dictionary or Table | Result of applying `func` along the giveen axis of the `kx.Table`. |\n", + "\n", + "**Examples:**\n", + "\n", + "Example Table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d746cddb", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table([[4, 9]] * 3, columns=['A', 'B'])\n", + "\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "54c09d0c", + "metadata": {}, + "source": [ + "Apply square root on each item within a column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8bbcac7", + "metadata": {}, + "outputs": [], + "source": [ + "tab.apply(kx.q.sqrt)" + ] + }, + { + "cell_type": "markdown", + "id": "09a61483", + "metadata": {}, + "source": [ + "Apply a reducing function sum on either axis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84b92b9b", + "metadata": {}, + "outputs": [], + "source": [ + "tab.apply(kx.q.sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "169d8ed3", + "metadata": {}, + "outputs": [], + "source": [ + "tab.apply(lambda x: sum(x), axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "ed4d720c", + "metadata": {}, + "source": [ + "## Aggregate\n", + "\n", + "### Table.agg()\n", "\n", + "```\n", + "Table.agg(\n", + " func,\n", + " axis=0,\n", + " *args,\n", + " **kwargs\n", ")\n", "```\n", 
"\n", - "Merge Table or KeyedTable objects with a database-style join.\n", + "Aggregate data using one or more operations over a specified axis\n", "\n", - "The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. When performing a cross merge, no column specifications to merge on are allowed.\n", + "Objects passed to a function are passed as kx vector/list objects.\n", "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :---------: | :--: | :-------------------------------------------------------------------------------- | :-----: |\n", - "| right | Union[Table/KeyedTable] | The object to merge with. | _required_ |\n", - "| how | str | The type of join to be used. One of {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}. | ‘inner’ |\n", - "| on | str | The column name to join on. | None |\n", - "| left_on | str | The column name in the left table to join on. | None |\n", - "| right_on | str | The column name in the right table to join on. | None |\n", - "| left_index | bool | Use the index of the left Table. | False |\n", - "| right_index | bool | Use the index of the right Table. | False |\n", - "| by | str | Not yet implemented. | None |\n", - "| left_by | str | Field names to match on in the left table. | None |\n", - "| right_by | str | Field names to match on in the right table. | None |\n", - "| suffixes | Tuple(str, str) | The number of rows to return. | ('\\_x', '\\_y') |\n", - "| tolerance | Any | Not yet implemented. | None |\n", - "| allow_exact_matches | bool | Not yet implemented. | True |\n", - "| direction | str | Not yet implemented. 
| 'backward' |\n", + "| Name | Type | Description | Default |\n", + "| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n", + "| func | function, str, list or dict | Function to use for aggregating the data. If a function this must either work when passed a `Table` or when passed to `Table.apply`

Accepted combinations are: