diff --git a/docs/catalogs/advanced.rst b/docs/catalogs/advanced.rst
new file mode 100644
index 00000000..e4249418
--- /dev/null
+++ b/docs/catalogs/advanced.rst
@@ -0,0 +1,37 @@
+Advanced Usage
+===============================================================================
+
+We aim to support ingestion of many kinds of catalog data. Here, we discuss
+some ways you can tune the import pipeline for different kinds of data.
+
+.. tip::
+    Reach out!
+
+    If you have some *very* interesting data that isn't well-supported by this
+    pipeline, we want to hear about it! :doc:`/guide/contact`
+
+
+``add_hipscat_index``
+-------------------------------------------------------------------------------
+
+TODO
+
+``use_schema_file``
+-------------------------------------------------------------------------------
+
+TODO
+
+``debug_stats_only``
+-------------------------------------------------------------------------------
+
+TODO
+
+``epoch``
+-------------------------------------------------------------------------------
+
+TODO
+
+``catalog_type``
+-------------------------------------------------------------------------------
+
+TODO
diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst
new file mode 100644
index 00000000..c2dcac62
--- /dev/null
+++ b/docs/catalogs/arguments.rst
@@ -0,0 +1,115 @@
+Catalog Import Arguments
+===============================================================================
+
+This page discusses a few topics around setting up a catalog pipeline.
+
+For a full list of the available arguments, see the API documentation for
+:py:class:`hipscat_import.catalog.arguments.ImportArguments`
+
+Reading input files
+-------------------------------------------------------------------------------
+
+Catalog import reads through a list of files and converts them into a hipscatted catalog.
+
+Which files?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are a few ways to specify the files to read:
+
+* ``input_path`` + ``input_format``:
+  will search for files ending with the format string in the indicated directory.
+* ``input_file_list``:
+  a list of fully-specified paths you want to read.
+
+  * This strategy can be useful to first run the import on a single input
+    file and validate the input, then run again on the full input set, or
+    to debug a single input file with odd behavior.
+  * If you have a mix of files in your target directory, you can use a glob
+    statement like the following to gather input files:
+
+.. code-block:: python
+
+    in_file_paths = glob.glob("/data/object_and_source/object**.csv")
+    in_file_paths.sort()
+
+How to read them?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Specify an instance of ``InputReader`` for the ``file_reader`` parameter.
+
+See the API documentation for
+:py:class:`hipscat_import.catalog.file_readers.InputReader`
+
+We use the ``InputReader`` class to read files in chunks and pass the chunks
+along to the map/reduce stages. We've provided reference implementations for
+reading CSV, FITS, and Parquet input files, but you can subclass the reader
+type to suit whatever input files you've got.
+
+.. code-block:: python
+
+    class StarrReader(InputReader):
+        """Class for fictional Starr file format."""
+
+        def __init__(self, chunksize=500_000, **kwargs):
+            self.chunksize = chunksize
+            self.kwargs = kwargs
+
+        def read(self, input_file):
+            starr_file = starr_io.read_table(input_file, **self.kwargs)
+            for smaller_table in starr_file.to_batches(max_chunksize=self.chunksize):
+                smaller_table = filter_nonsense(smaller_table)
+                yield smaller_table.to_pandas()
+
+        def provenance_info(self) -> dict:
+            provenance_info = {
+                "input_reader_type": "StarrReader",
+                "chunksize": self.chunksize,
+            }
+            return provenance_info
+
+    ...
+
+    args = ImportArguments(
+        ...
+        ## Locates files like "/directory/to/files/**starr"
+        input_path="/directory/to/files/",
+        input_format="starr",
+        ## NB - you need the parens here!
+        file_reader=StarrReader(),
+    )
+
+Which fields?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Specify the ``ra_column`` and ``dec_column`` for the dataset.
+
+There are two fields that we require in order to make a valid hipscatted
+catalog: the right ascension and declination. At this time, this is the only
+supported system for celestial coordinates.
+
+
+Healpix order and thresholds
+-------------------------------------------------------------------------------
+
+This section covers the ``pixel_threshold``, ``highest_healpix_order``, and
+``constant_healpix_order`` arguments.
+
+When creating a new catalog through the hipscat-import process, we try to
+create partitions with approximately the same number of rows per partition.
+This isn't perfect, because the sky is uneven, but we still try to create
+smaller-area pixels in more dense areas, and larger-area pixels in less dense
+areas.
+
+We use the ``pixel_threshold`` argument and will split a partition into
+smaller healpix pixels until the number of rows is smaller than ``pixel_threshold``.
+We will only split by healpix pixels up to the ``highest_healpix_order``. If we
+would need to split further than the ``highest_healpix_order`` allows, we'll throw
+an error at the "Binning" stage, and you should adjust your parameters.
+
+For more discussion of the ``pixel_threshold`` argument and a strategy for setting
+this parameter, see the notebook :doc:`/notebooks/estimate_pixel_threshold`
+
+Alternatively, you can use the ``constant_healpix_order`` argument. This will
+**ignore** both the ``pixel_threshold`` and ``highest_healpix_order`` arguments,
+and the catalog will be partitioned by healpix pixels at the
+``constant_healpix_order``. This can be useful for very sparse datasets.
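+
+To make the two partitioning strategies concrete, here is a minimal sketch of
+how these arguments might be combined. The catalog names, paths, and the
+specific order and threshold values below are placeholders for illustration,
+not recommendations for your data.
+
+.. code-block:: python
+
+    from hipscat_import.catalog.arguments import ImportArguments
+
+    ## Threshold-based partitioning: keep splitting a partition into smaller
+    ## healpix pixels (up to order 7) until it holds fewer than 1_000_000 rows.
+    args = ImportArguments(
+        output_catalog_name="my_catalog",
+        input_path="/path/to/input/",
+        input_format="csv",
+        ra_column="ra",
+        dec_column="dec",
+        highest_healpix_order=7,
+        pixel_threshold=1_000_000,
+        output_path="/path/to/catalogs/",
+    )
+
+    ## Constant-order partitioning: every partition is a healpix pixel at
+    ## order 4, regardless of how many rows fall in it.
+    sparse_args = ImportArguments(
+        output_catalog_name="my_sparse_catalog",
+        input_path="/path/to/sparse/input/",
+        input_format="csv",
+        ra_column="ra",
+        dec_column="dec",
+        constant_healpix_order=4,
+        output_path="/path/to/catalogs/",
+    )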
\ No newline at end of file
diff --git a/docs/catalogs/debug.rst b/docs/catalogs/debug.rst
new file mode 100644
index 00000000..594bb934
--- /dev/null
+++ b/docs/catalogs/debug.rst
@@ -0,0 +1,16 @@
+Debugging Tips
+========================================================================================
+
+.. tip::
+    If you're struggling with your dataset after looking over these tips, reach out!
+ + :doc:`/guide/contact` + +Reduce step +------------------------------------------------------------------------------- + +Errors like: + + ``` + Exception: "ArrowNotImplementedError('Unsupported cast from string to null using function cast_null')" + ``` \ No newline at end of file diff --git a/docs/catalogs/overview.rst b/docs/catalogs/overview.rst deleted file mode 100644 index b7073268..00000000 --- a/docs/catalogs/overview.rst +++ /dev/null @@ -1,13 +0,0 @@ -Overview -=============================================================================== - -The LINCC Frameworks team has built the import tool hoping to handle catalogs -in various formats. We've learned some lessons in importing public data sets, -and provide steps to import those catalogs in case they help anyone else. - -Catalogs -------------------------------------------------------------------------------- - -* :doc:`allwise` -* :doc:`neowise` -* :doc:`tic` diff --git a/docs/catalogs/allwise.rst b/docs/catalogs/public/allwise.rst similarity index 71% rename from docs/catalogs/allwise.rst rename to docs/catalogs/public/allwise.rst index 0a229ca9..ea67ec83 100644 --- a/docs/catalogs/allwise.rst +++ b/docs/catalogs/public/allwise.rst @@ -18,7 +18,9 @@ Challenges with this data set - The numeric fields may be null, which is not directly supported by the ``int64`` type in pandas, so we must use the nullable ``Int64`` type. -You can download the :download:`allwise_types` CSV file we used. +You can download the :download:`allwise_types` CSV file we used, +and the associated schema file :download:`allwise_schema` +with column-level parquet metadata. Example import ------------------------------------------------------------------------------- @@ -27,16 +29,16 @@ Example import import pandas as pd - import hipscat_import.run_import as runner - from hipscat_import.arguments import ImportArguments - from hipscat_import.file_readers import CsvReader + import hipscat_import.pipeline as runner + from hipscat_import.catalog.arguments import ImportArguments + from hipscat_import.catalog.file_readers import CsvReader # Load the column names and types from a side file. type_frame = pd.read_csv("allwise_types.csv") type_map = dict(zip(type_frame["name"], type_frame["type"])) args = ImportArguments( - catalog_name="allwise", + output_catalog_name="allwise", input_path="/path/to/allwise/", input_format="csv.bz2", file_reader=CsvReader( @@ -45,10 +47,13 @@ Example import column_names=type_frame["name"].values.tolist(), type_map=type_map, chunksize=250_000, - ).read, - ra_column="RA", - dec_column="DEC", - id_column="SOURCE_ID", + ), + use_schema_file="allwise_schema.parquet", + ra_column="ra", + dec_column="dec", + id_column="source_id", + pixel_threshold=1_000_000, + highest_healpix_order=7, output_path="/path/to/catalogs/", ) - runner.run(args) + runner.pipeline(args) diff --git a/docs/catalogs/public/index.rst b/docs/catalogs/public/index.rst new file mode 100644 index 00000000..69f9ad13 --- /dev/null +++ b/docs/catalogs/public/index.rst @@ -0,0 +1,28 @@ +Public Catalogs +=============================================================================== + +The LINCC Frameworks team has built the import tool hoping to handle catalogs +in various formats. We've learned some lessons in importing public data sets, +and provide steps to import those catalogs in case these hints help anyone else. + +.. note:: + These are datasets that our team has data rights to. 
We make no guarantees + about the data rights of others, the quality of the datasets, or their + availability. + + Further, please respect the publication policy associated with the datasets. + +.. toctree:: + :maxdepth: 1 + + allwise + neowise + tic + zubercal + +.. tip:: + Want to see more? + + Have you used this tool with a dataset, and you want to help others with + the idiosyncrasies? Is there a commonly used public dataset that you'd like + some tips for importing? :doc:`/guide/contact` \ No newline at end of file diff --git a/docs/catalogs/neowise.rst b/docs/catalogs/public/neowise.rst similarity index 88% rename from docs/catalogs/neowise.rst rename to docs/catalogs/public/neowise.rst index ea61c9b5..1a29ed69 100644 --- a/docs/catalogs/neowise.rst +++ b/docs/catalogs/public/neowise.rst @@ -27,16 +27,16 @@ Example import import pandas as pd - import hipscat_import.run_import as runner - from hipscat_import.arguments import ImportArguments - from hipscat_import.file_readers import CsvReader + import hipscat_import.pipeline as runner + from hipscat_import.catalog.arguments import ImportArguments + from hipscat_import.catalog.file_readers import CsvReader # Load the column names and types from a side file. type_frame = pd.read_csv("neowise_types.csv") type_map = dict(zip(type_frame["name"], type_frame["type"])) args = ImportArguments( - catalog_name="neowise_1", + output_catalog_name="neowise_1", input_path="/path/to/neowiser_year8/", input_format="csv.bz2", file_reader=CsvReader( diff --git a/docs/catalogs/tic.rst b/docs/catalogs/public/tic.rst similarity index 88% rename from docs/catalogs/tic.rst rename to docs/catalogs/public/tic.rst index e044b1ee..f1fa6091 100644 --- a/docs/catalogs/tic.rst +++ b/docs/catalogs/public/tic.rst @@ -27,15 +27,15 @@ Example import import pandas as pd - import hipscat_import.run_import as runner - from hipscat_import.arguments import ImportArguments - from hipscat_import.file_readers import CsvReader + import hipscat_import.pipeline as runner + from hipscat_import.catalog.arguments import ImportArguments + from hipscat_import.catalog.file_readers import CsvReader type_frame = pd.read_csv("tic_types.csv") type_map = dict(zip(type_frame["name"], type_frame["type"])) args = ImportArguments( - catalog_name="tic_1", + output_catalog_name="tic_1", input_path="/path/to/tic/", input_format="csv.gz", file_reader=CsvReader( diff --git a/docs/catalogs/public/zubercal.rst b/docs/catalogs/public/zubercal.rst new file mode 100644 index 00000000..f4982cef --- /dev/null +++ b/docs/catalogs/public/zubercal.rst @@ -0,0 +1,85 @@ +Zubercal +=============================================================================== + +Getting the data +------------------------------------------------------------------------------- + +See docs at CalTech. + +http://atua.caltech.edu/ZTF/Fields/ReadMe.txt + + +Challenges with this data set +------------------------------------------------------------------------------- + +- The ``__index_level_0__`` pandas index should be ignored when reading. + + - it is identical to the ``objectid`` column, and is just bloat + + - it is non-unique, and that makes it tricky when splitting the file up + +- The files are written out by band, and the band is included in the file + name (but not as a field in the resulting data product). this uses a + regular expression to parse out the band and insert it as a column in + the dataframe. +- the parquet files are all a fine size for input files, so we don't mess + with another chunk size. 
+- there are over 500 thousand data files (TODO - how we handle this=]) + +.. code-block:: python + + import hipscat_import.pipeline as runner + from hipscat_import.catalog.arguments import ImportArguments + from hipscat_import.catalog.file_readers import ParquetReader + import pyarrow.parquet as pq + import pyarrow as pa + import re + import glob + + + class ZubercalParquetReader(ParquetReader): + def read(self, input_file): + """Reader for the specifics of zubercal parquet files.""" + columns = [ + "mjd", + "mag", + "objdec", + "objra", + "magerr", + "objectid", + "info", + "flag", + "rcidin", + "fieldid", + ] + + ## Parse the band from the file name, and hold onto it for later. + match = re.match(r".*ztf_[\d]+_[\d]+_([gir]).parquet", str(input_file)) + band = match.group(1) + + parquet_file = pq.read_table(input_file, columns=columns, **self.kwargs) + for smaller_table in parquet_file.to_batches(max_chunksize=self.chunksize): + frame = pa.Table.from_batches([smaller_table]).to_pandas() + frame["band"] = band + yield frame + + + files = glob.glob("/path/to/downloads/**/**.parquet") + files.sort() + + args = ImportArguments( + output_catalog_name="zubercal", + input_file_list=files, + ## NB - you need the parens here! + file_reader=ZubercalParquetReader(), + input_format="parquet", + catalog_type="source", + ra_column="objra", + dec_column="objdec", + id_column="objectid", + highest_healpix_order=10, + pixel_threshold=20_000_000, + output_path="/path/to/catalogs/", + ) + + runner.pipeline(args) diff --git a/docs/guide/resume.rst b/docs/catalogs/resume.rst similarity index 100% rename from docs/guide/resume.rst rename to docs/catalogs/resume.rst diff --git a/docs/guide/association.rst b/docs/guide/association.rst new file mode 100644 index 00000000..57537648 --- /dev/null +++ b/docs/guide/association.rst @@ -0,0 +1,9 @@ +Association Table +=============================================================================== + +See the API documentation for :py:class:`hipscat_import.association.arguments.AssociationArguments` + +.. note:: + + TODO - write the rest + diff --git a/docs/guide/contact.rst b/docs/guide/contact.rst new file mode 100644 index 00000000..3d076031 --- /dev/null +++ b/docs/guide/contact.rst @@ -0,0 +1,11 @@ +Contact us +=============================================================================== + +We at LINCC Frameworks pride ourselves on being a friendly bunch! + +If you're encountering issues, have some gnarly dataset, have ideas for +making our products better, or pretty much anything else, reach out! + +* Open an issue in our github repo for hipscat-import + * https://github.com/astronomy-commons/hipscat-import/issues/new +* If you're on LSSTC slack, so are we! #lincc-dataformats \ No newline at end of file diff --git a/docs/guide/contributing.rst b/docs/guide/contributing.rst index 9faf8a9b..a2fd75ea 100644 --- a/docs/guide/contributing.rst +++ b/docs/guide/contributing.rst @@ -13,6 +13,11 @@ a description. You can reach the team with bug reports, feature requests, and general inquiries by creating a new GitHub issue. +.. tip:: + Want to help? + + Do you want to help out, but you're not sure how? :doc:`/guide/contact` + Create a branch ------------------------------------------------------------------------------- @@ -32,9 +37,22 @@ Most folks use conda for virtual environments. You may want to as well. $ pip install -e . .. 
tip:: - Installing dev dependencies on Mac + Installing on Mac + + ``healpy`` is a very necessary dependency for hipscat libraries at this time, but + native prebuilt binaries for healpy on Apple Silicon Macs + `do not yet exist `_, + so it's recommended to install via conda before proceeding to hipscat-import. - (Make sure to include the single quotes) + .. code-block:: bash + + $ conda config --add channels conda-forge + $ conda install healpy + $ git clone https://github.com/astronomy-commons/hipscat-import + $ cd hipscat-import + $ pip install -e . + + When installing dev dependencies, make sure to include the single quotes. .. code-block:: bash diff --git a/docs/guide/index_table.rst b/docs/guide/index_table.rst new file mode 100644 index 00000000..a5f6d1bb --- /dev/null +++ b/docs/guide/index_table.rst @@ -0,0 +1,8 @@ +Index Table +=============================================================================== + +See the API documentation for :py:class:`hipscat_import.index.arguments.IndexArguments` + +.. note:: + + TODO - write the rest \ No newline at end of file diff --git a/docs/guide/margin_cache.rst b/docs/guide/margin_cache.rst new file mode 100644 index 00000000..1c97dcb6 --- /dev/null +++ b/docs/guide/margin_cache.rst @@ -0,0 +1,9 @@ +Margin Cache +=============================================================================== + +See the API documentation for +:py:class:`hipscat_import.margin_cache.margin_cache_arguments.MarginCacheArguments` + +.. note:: + + TODO - write the rest \ No newline at end of file diff --git a/docs/guide/overview.rst b/docs/guide/overview.rst deleted file mode 100644 index 14fd472f..00000000 --- a/docs/guide/overview.rst +++ /dev/null @@ -1,14 +0,0 @@ -Getting Started -=============================================================================== - -Installation -------------------------------------------------------------------------------- - -.. code-block:: bash - - pip install hipscat-import - -Other Topics -------------------------------------------------------------------------------- - -* :doc:`resume` diff --git a/docs/index.rst b/docs/index.rst index f56c5dae..9cdfb2cc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,26 +3,76 @@ HiPSCat Import Utility for ingesting large survey data into HiPSCat structure. +Installation +------------------------------------------------------------------------------- + +.. code-block:: bash + + pip install hipscat-import + +.. tip:: + Installing on Mac + + ``healpy`` is a very necessary dependency for hipscat libraries at this time, but + native prebuilt binaries for healpy on Apple Silicon Macs + `do not yet exist `_, + so it's recommended to install via conda before proceeding to hipscat-import. + + .. code-block:: bash + + $ conda config --append channels conda-forge + $ conda install healpy + +Setting up a pipeline +------------------------------------------------------------------------------- + +For each type of dataset the hipscat-import tool can generate, there is an argument +container class that you will need to instantiate and populate with relevant arguments. + +See dataset-specific notes on arguments: + +* :doc:`catalogs/arguments` (most common) +* :doc:`guide/margin_cache` +* :doc:`guide/association` +* :doc:`guide/index_table` + +Once you have created your arguments object, you pass it into the pipeline control, +and then wait: + +.. code-block:: python + + import hipscat_import.pipeline as runner + + args = ... + runner.pipeline(args) + + .. 
toctree:: - :maxdepth: 1 - :caption: Importing Catalogs + :hidden: + :maxdepth: 2 + :caption: Catalogs - guide/overview - guide/resume - Notebooks + catalogs/arguments + catalogs/resume + catalogs/debug + catalogs/advanced + catalogs/public/index .. toctree:: + :hidden: :maxdepth: 1 - :caption: Example Catalogs + :caption: Other Datasets - catalogs/overview - catalogs/allwise - catalogs/neowise - catalogs/tic + guide/margin_cache + guide/association + guide/index_table + Notebooks .. toctree:: + :hidden: :maxdepth: 1 :caption: Developers guide/contributing API Reference + guide/contact diff --git a/docs/notebooks.rst b/docs/notebooks.rst index 7f7e544d..5e218f09 100644 --- a/docs/notebooks.rst +++ b/docs/notebooks.rst @@ -3,4 +3,4 @@ Notebooks .. toctree:: - Introducing Jupyter Notebooks + Estimate Pixel Threshold diff --git a/docs/notebooks/estimate_pixel_threshold.ipynb b/docs/notebooks/estimate_pixel_threshold.ipynb new file mode 100755 index 00000000..078b039b --- /dev/null +++ b/docs/notebooks/estimate_pixel_threshold.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "98af180d", + "metadata": {}, + "source": [ + "# Estimate pixel threshold\n", + "\n", + "For best performance, we try to keep catalog parquet files between 200-800MB in size.\n", + "\n", + "**Background**\n", + "\n", + "When creating a new catalog through the hipscat-import process, we try to create partitions with approximately the same number of rows per partition. This isn't perfect, because the sky is uneven, but we still try to create smaller-area pixels in more dense areas, and larger-area pixels in less dense areas. We use the argument `pixel_threshold` and will split a partition into smaller healpix pixels until the number of rows is smaller than `pixel_threshold`.\n", + "\n", + "We do this to increase parallelization of reads and downstream analysis: if the files are around the same size, and operations on each partition take around the same amount of time, we're not as likely to be waiting on a single process to complete for the whole pipeline to complete.\n", + "\n", + "In addition, a single catalog file should not exceed a couple GB - we're going to need to read the whole thing into memory, so it needs to fit!\n", + "\n", + "**Objective**\n", + "\n", + "In this notebook, we'll go over *one* strategy for estimating the `pixel_threshold` argument you can use when importing a new catalog into hipscat format.\n", + "\n", + "This is not guaranteed to give you optimal results, but it could give you some hints toward *better* results." + ] + }, + { + "cell_type": "markdown", + "id": "eb86458c", + "metadata": {}, + "source": [ + "## Create a sample parquet file\n", + "\n", + "The first step is to read in your survey data in its original form, and convert a sample into parquet. This has a few benefits:\n", + "- parquet uses compression in various ways, and by creating the sample, we can get a sense of both the overall and field-level compression with real dat\n", + "- using the importer `FileReader` interface now sets you up for more success when you get around to importing!\n", + "\n", + "If your data is already in parquet format, just change the `sample_parquet_file` path to an existing file, and don't run the second cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dd94480", + "metadata": {}, + "outputs": [], + "source": [ + "### Change this path!!!\n", + "sample_parquet_file=\"../../tests/hipscat_import/data/sample.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6a53db0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from hipscat_import.catalog.file_readers import CsvReader\n", + "\n", + "### Change this path!!!\n", + "input_file=\"../../tests/hipscat_import/data/small_sky/catalog.csv\"\n", + "\n", + "file_reader = CsvReader(\n", + " chunksize=5_000\n", + " )\n", + "\n", + "next(file_reader.read(input_file)).to_parquet(sample_parquet_file)" + ] + }, + { + "cell_type": "markdown", + "id": "124eb444", + "metadata": {}, + "source": [ + "## Inspect parquet file and metadata\n", + "\n", + "Now that we have parsed our survey data into parquet, we can check what the data will look like when it's imported into hipscat format.\n", + "\n", + "If you're just here to get a naive estimate for your pixel threshold, we'll do that first, then take a look at some other parquet characteristics later for the curious." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9f0e279", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pyarrow.parquet as pq\n", + "\n", + "sample_file_size = os.path.getsize(sample_parquet_file)\n", + "parquet_file = pq.ParquetFile(sample_parquet_file)\n", + "num_rows = parquet_file.metadata.num_rows\n", + "\n", + "## 100MB\n", + "ideal_file_small = 100 *1024*1024\n", + "## 800MB\n", + "ideal_file_large = 800 *1024*1024\n", + "\n", + "threshold_small = ideal_file_small/sample_file_size*num_rows\n", + "threshold_large = ideal_file_large/sample_file_size*num_rows\n", + "\n", + "print(f\"threshold between {int(threshold_small):_} and {int(threshold_large):_}\")" + ] + }, + { + "cell_type": "markdown", + "id": "23971c38", + "metadata": {}, + "source": [ + "## Want to see more?\n", + "\n", + "I'm so glad you're still here! I have more to show you!\n", + "\n", + "The first step below shows us the file-level metadata, as stored by parquet. The number of columns here SHOULD match your expectations on the number of columns in your survey data.\n", + "\n", + "The `serialized_size` value is just the size of the total metadata, not the size of the file. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc402acf", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow.parquet as pq\n", + "\n", + "parquet_file = pq.ParquetFile(sample_parquet_file)\n", + "print(parquet_file.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "7835b6d9", + "metadata": {}, + "source": [ + "The next step is to look at the column-level metadata. You can check that the on-disk type of each column matches your expectation of the data. Note that for some integer types, the on-disk type may be a smaller int than originally set (e.g. `bitWidth=8` or `16`). This is part of parquet's multi-part compression strategy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ff8fb4d", + "metadata": {}, + "outputs": [], + "source": [ + "print(parquet_file.schema)" + ] + }, + { + "cell_type": "markdown", + "id": "5c2b593b", + "metadata": {}, + "source": [ + "Parquet will also perform some column-level compression, so not all columns with the same type will take up the same space on disk.\n", + "\n", + "Below, we inspect the row and column group metadata to show the compressed size of the fields on disk. The last column, `percent`, show the percent of total size taken up by the column.\n", + "\n", + "You *can* use this to inform which columns you keep when importing a catalog into hipscat format. e.g. if some columns are less useful for your science, and take up a lot of space, maybe leave them out!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcf152f2", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "num_cols = parquet_file.metadata.num_columns\n", + "num_row_groups = parquet_file.metadata.num_row_groups\n", + "sizes = np.zeros(num_cols)\n", + "\n", + "for rg in range(num_row_groups):\n", + " for col in range (num_cols):\n", + " sizes[col] += parquet_file.metadata.row_group(rg).column(col).total_compressed_size\n", + "\n", + "## This is just an attempt at pretty formatting\n", + "percents = [f\"{s/sizes.sum()*100:.1f}\" for s in sizes]\n", + "pd.DataFrame({\"name\":parquet_file.schema.names, \"size\":sizes.astype(int), \"percent\": percents}).sort_values(\"size\", ascending=False)" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebooks/intro_notebook.ipynb b/docs/notebooks/intro_notebook.ipynb deleted file mode 100644 index 71086d5e..00000000 --- a/docs/notebooks/intro_notebook.ipynb +++ /dev/null @@ -1,111 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "accepting-editor", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "# Introducing Jupyter Notebooks\n", - "\n", - "_(The example used here is JamesALeedham's notebook: [intro.ipynb](https://github.com/JamesALeedham/Sphinx-Autosummary-Recursion/blob/master/docs/notebooks/intro.ipynb))_\n", - "\n", - "First, set up the environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "actual-thirty", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib\n", - "import matplotlib.pyplot as pl\n", - "import numpy as np\n", - "\n", - "try:\n", - " from IPython import get_ipython\n", - " get_ipython().run_line_magic('matplotlib', 'inline')\n", - "except AttributeError:\n", - " print('Magic function can only be used in IPython environment')\n", - " matplotlib.use('Agg')\n", - "\n", - "pl.rcParams[\"figure.figsize\"] = [15, 8]" - ] - }, - { - "cell_type": "markdown", - "id": "coral-upper", - "metadata": { - "cell_marker": "\"\"\"", - "lines_to_next_cell": 1 - }, - "source": [ - "Then, define a function that creates a pretty graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "funded-protection", - "metadata": { - "lines_to_next_cell": 1 - }, - "outputs": [], - "source": [ - "def SineAndCosineWaves():\n", - " # Get a large number of X values for a nice smooth curve. 
Using Pi as np.sin requires radians...\n", - " x = np.linspace(0, 2 * np.pi, 180)\n", - " # Convert radians to degrees to make for a meaningful X axis (1 radian = 57.29* degrees)\n", - " xdeg = 57.29577951308232 * np.array(x)\n", - " # Calculate the sine of each value of X\n", - " y = np.sin(x)\n", - " # Calculate the cosine of each value of X\n", - " z = np.cos(x)\n", - " # Plot the sine wave in blue, using degrees rather than radians on the X axis\n", - " pl.plot(xdeg, y, color=\"blue\", label=\"Sine wave\")\n", - " # Plot the cos wave in green, using degrees rather than radians on the X axis\n", - " pl.plot(xdeg, z, color=\"green\", label=\"Cosine wave\")\n", - " pl.xlabel(\"Degrees\")\n", - " # More sensible X axis values\n", - " pl.xticks(np.arange(0, 361, 45))\n", - " pl.legend()\n", - " pl.show()" - ] - }, - { - "cell_type": "markdown", - "id": "thorough-cutting", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "Finally, call that function to display the graph:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "imported-uruguay", - "metadata": {}, - "outputs": [], - "source": [ - "SineAndCosineWaves()" - ] - } - ], - "metadata": { - "jupytext": { - "cell_markers": "\"\"\"" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/static/allwise_schema.parquet b/docs/static/allwise_schema.parquet new file mode 100644 index 00000000..d09274f1 Binary files /dev/null and b/docs/static/allwise_schema.parquet differ diff --git a/src/hipscat_import/catalog/arguments.py b/src/hipscat_import/catalog/arguments.py index 6d83c9f8..59004bd0 100644 --- a/src/hipscat_import/catalog/arguments.py +++ b/src/hipscat_import/catalog/arguments.py @@ -60,11 +60,11 @@ class ImportArguments(RuntimeArguments): pixels that don't meed the threshold""" pixel_threshold: int = 1_000_000 """maximum number of rows for a single resulting pixel. - we may combine hierarchically until we near the `pixel_threshold`""" + we may combine hierarchically until we near the ``pixel_threshold``""" mapping_healpix_order: int = -1 """healpix order to use when mapping. will be - `highest_healpix_order` unless a positive value is provided for - `constant_healpix_order`""" + ``highest_healpix_order`` unless a positive value is provided for + ``constant_healpix_order``""" debug_stats_only: bool = False """do not perform a map reduce and don't create a new catalog. generate the partition info""" @@ -178,8 +178,8 @@ def additional_runtime_provenance_info(self) -> dict: def check_healpix_order_range( order, field_name, lower_bound=0, upper_bound=hipscat_id.HIPSCAT_ID_HEALPIX_ORDER ): - """Helper method to heck if the `order` is within the range determined by the - `lower_bound` and `upper_bound`, inclusive. + """Helper method to check if the ``order`` is within the range determined by the + ``lower_bound`` and ``upper_bound``, inclusive. Args: order (int): healpix order to check