From 60466acc63f5af886af8bfc0da311558d6f00673 Mon Sep 17 00:00:00 2001 From: Matteo Bachetti Date: Fri, 18 Oct 2024 22:48:56 +0200 Subject: [PATCH 1/2] Add tutorial to deal with large data --- .../Dealing with large data files.ipynb | 567 ++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 Performance/Dealing with large data files.ipynb diff --git a/Performance/Dealing with large data files.ipynb b/Performance/Dealing with large data files.ipynb new file mode 100644 index 0000000..623b8a9 --- /dev/null +++ b/Performance/Dealing with large data files.ipynb @@ -0,0 +1,567 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "24df35e1-e5c9-40a8-89ab-73c782e070b3", + "metadata": {}, + "source": [ + "In this tutorial, we approach the case of a very large event file, larger than the memory of our computer. Will we be able to analyze it?" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8957430d-d905-40a2-87cc-dcbdbaec91d7", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%load_ext memory_profiler\n", + "import psutil\n", + "import os\n", + "import numpy as np\n", + "import gc\n", + "\n", + "from stingray import EventList, AveragedPowerspectrum" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "73c7f674-c04e-44c9-bf02-54713675352b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current memory use (29237): 91.16 MB\n" + ] + } + ], + "source": [ + "pid = os.getpid()\n", + "python_process = psutil.Process(pid)\n", + "memory_use = python_process.memory_info()[0]/2.**20\n", + "print(f\"Current memory use ({pid}): {memory_use:.2f} MB\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "ac2a89f2-c520-4c32-9f28-9da5a0029b46", + "metadata": {}, + "source": [ + "## Data preparation\n", + "\n", + "Now we simulate and load a full dataset. Let's simulate a large observation, about 2GB" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ed97efbf-1310-4651-bf46-6830d92fb6f8", + "metadata": {}, + "outputs": [], + "source": [ + "fname = \"events_large.evt\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c2221d2a-a3fa-4119-860e-e090affc17b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/meo/devel/StingraySoftware/hendrics/hendrics/io.py:38: UserWarning: Warning! NetCDF is not available. Using pickle format.\n", + " warnings.warn(msg)\n", + "/Users/meo/devel/StingraySoftware/hendrics/hendrics/fold.py:38: UserWarning: PINT is not installed. Some pulsar functionality will not be available\n", + " warnings.warn(\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!HENfake -c 20000 --tstart 0 --tstop 10000 --mjdref 56000 -o events_large.evt" + ] + }, + { + "cell_type": "markdown", + "id": "b7d9d777-09ad-4a7c-a479-a50548bedc90", + "metadata": {}, + "source": [ + "## Naive procedure: create light curve, then calculate PDS\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c0b076b3-33f2-4ff2-86e1-f33c796d4e17", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 5645.70 MiB, increment: 5347.78 MiB\n" + ] + } + ], + "source": [ + "%memit events = EventList.read(fname, fmt=\"ogip\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "0c7520ca-69f2-4445-83df-4a888c10082e", + "metadata": {}, + "source": [ + "Loading the observation into memory takes about 5 GB. 
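That is roughly what we expect for ~2×10^8 events (20000 counts/s × 10000 s): each 8-byte float64 column of that length (time, plus auxiliary quantities such as PI and energy) takes ~1.6 GB, and temporary copies made during reading account for the rest. This is a back-of-the-envelope estimate, not an exact accounting.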
Now, we want a power spectrum extending to very high frequencies. Let us do it the traditional way: first create a light curve, then analyze it with `AveragedPowerspectrum`." + ] + },
+ { + "cell_type": "code", + "execution_count": 7, + "id": "40c1a77b-2ac5-4bb9-8f8c-1204eecbd5fb", + "metadata": {}, + "outputs": [], + "source": [ + "fine_sample_time = 0.00001\n", + "segment_size = 128\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 8, + "id": "5dde9f16-9fd6-4618-9ef4-8f5013e01f1a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 15315.70 MiB, increment: 9670.00 MiB\n" + ] + } + ], + "source": [ + "%memit lc = events.to_lc(dt=fine_sample_time)" + ] + },
+ { + "cell_type": "markdown", + "id": "42b15eee-88b1-43bb-bdef-4ca7b7946887", + "metadata": {}, + "source": [ + "This very finely sampled light curve takes a _lot_ of memory: 10000 s sampled at 10 $\\mu$s gives ~10^9 floating-point values, or 8 GB, for the time array, and as much again for the counts array. Here the measured increment is slightly smaller because the operating system is swapping part of the data out! Some of the swapped data will come back into main memory when we calculate the power spectrum." + ] + },
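+ { + "cell_type": "markdown", + "id": "8a4c1f2e-7b3d-4e9a-9c0f-5d6e7f8a9b0c", + "metadata": {}, + "source": [ + "Let's verify the arithmetic explicitly (this cell is illustrative only; the 8 bytes assume float64 values):" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "b2d3e4f5-1a2b-4c3d-8e9f-0a1b2c3d4e5f", + "metadata": {}, + "outputs": [], + "source": [ + "# 10000 s sampled at 10 microseconds -> one billion bins\n", + "n_bins = int(10000 / fine_sample_time)\n", + "# ~7.45 GiB each for the time and counts arrays\n", + "print(f\"{n_bins * 8 / 2**30:.2f} GiB per float64 array\")" + ] + },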
+ { + "cell_type": "code", + "execution_count": 9, + "id": "b009941b-a1a3-455b-9b64-4c5021affdd7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "78it [00:28, 2.69it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 14324.80 MiB, increment: 2063.22 MiB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "array([1.02539377e-04, 1.03036920e-04, 9.54479649e-05, ...,\n", + "       1.10340774e-04, 1.07141777e-04, 9.10072280e-05])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%memit ps = AveragedPowerspectrum.from_lightcurve(lc, segment_size=segment_size)\n", + "ps.power" + ] + },
+ { + "cell_type": "code", + "execution_count": 10, + "id": "f19ebe91-bdad-41c6-a9c6-ca1bc2cc02e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del events, lc, ps.power, ps\n", + "gc.collect()" + ] + },
+ { + "cell_type": "code", + "execution_count": 11, + "id": "920771d2-0a75-41b4-8533-b9c7f572bdca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current memory use (29237): 1876.75 MB\n" + ] + } + ], + "source": [ + "memory_use = python_process.memory_info()[0]/2.**20\n", + "print(f\"Current memory use ({pid}): {memory_use:.2f} MB\")" + ] + },
+ { + "cell_type": "markdown", + "id": "7c81b95f-3bec-4046-a3ba-af944cb3f34e", + "metadata": {}, + "source": [ + "So, to measure the maximum memory usage of the full procedure, we can profile the three steps done so far in a single run:" + ] + },
+ { + "cell_type": "code", + "execution_count": 12, + "id": "8b7d8223-5f2a-435c-9510-dc4c4d13da09", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "78it [00:28, 2.70it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 16715.56 MiB, increment: 14837.95 MiB\n" + ] + }, + { + "data": { + "text/plain": [ + "array([1.02539377e-04, 1.03036920e-04, 9.54479649e-05, ...,\n", + "       1.10340774e-04, 1.07141777e-04, 9.10072280e-05])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def legacy_pds_procedure(fname, sample_time, segment_size):\n", + "    events = EventList.read(fname, fmt=\"ogip\")\n", + "    lc = events.to_lc(dt=sample_time)\n", + "    return AveragedPowerspectrum.from_lightcurve(lc, segment_size=segment_size)\n", + "\n", + "\n", + "%memit ps = legacy_pds_procedure(fname, fine_sample_time, segment_size)\n", + "ps.power" + ] + },
+ { + "cell_type": "markdown", + "id": "7838fc58-1231-4bcf-9257-05cb5293cc2c", + "metadata": {}, + "source": [ + "Let's clean up the memory a little bit." + ] + },
+ { + "cell_type": "code", + "execution_count": 13, + "id": "1db9bc50-c624-46e5-ab43-44cf40f387cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current memory use (29237): 1876.75 MB\n" + ] + } + ], + "source": [ + "del ps.power, ps\n", + "gc.collect()\n", + "memory_use = python_process.memory_info()[0]/2.**20\n", + "print(f\"Current memory use ({pid}): {memory_use:.2f} MB\")\n" + ] + },
+ { + "cell_type": "markdown", + "id": "6d127938-71a9-4815-813f-e1cc5ba97503", + "metadata": {}, + "source": [ + "## Slightly better: PDS from events\n", + "What if we compute the power spectrum directly from the events, without binning the full light curve first? In this case, the binning happens only segment by segment, and each binned segment can be freed right after use." + ] + },
+ { + "cell_type": "code", + "execution_count": 14, + "id": "ef2ebf7f-7526-4ade-b543-7ddb1669e410", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "78it [00:28, 2.71it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 7543.30 MiB, increment: 5701.02 MiB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "array([1.02539377e-04, 1.03036920e-04, 9.54479649e-05, ...,\n", + "       1.10349785e-04, 1.07137039e-04, 9.09968704e-05])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def pds_from_events(fname, sample_time, segment_size):\n", + "    events = EventList.read(fname, fmt=\"ogip\")\n", + "    return AveragedPowerspectrum.from_events(events, dt=sample_time, segment_size=segment_size)\n", + "\n", + "%memit ps = pds_from_events(fname, fine_sample_time, segment_size)\n", + "ps.power" + ] + },
+ { + "cell_type": "markdown", + "id": "ce8f04da-9c8e-46d3-ba77-a04d8f60bb86", + "metadata": {}, + "source": [ + "Much better! The memory increment is now dominated by the loading of the events themselves, at just about 5.7 GB. Let's clean up the memory a little bit again." + ] + },
+ { + "cell_type": "code", + "execution_count": 15, + "id": "f2d79666-b03c-4a11-bd22-bfb5f691e218", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current memory use (29237): 2245.31 MB\n" + ] + } + ], + "source": [ + "del ps.power, ps\n", + "gc.collect()\n", + "memory_use = python_process.memory_info()[0]/2.**20\n", + "print(f\"Current memory use ({pid}): {memory_use:.2f} MB\")\n" + ] + },
+ { + "cell_type": "markdown", + "id": "1eeef29a-56f3-4b41-9691-d276369c99ee", + "metadata": {}, + "source": [ + "## Let's be \"lazy\": lazy loading with FITSTimeseriesReader\n", + "\n", + "Now, let's try not even pre-loading the events. What will happen?\n", + "First of all, we use the new class `FITSTimeseriesReader` to lazy-load the data, meaning that the data remain in the FITS file until we try to access them. This occupies very little memory." + ] + },
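+ { + "cell_type": "markdown", + "id": "c7e8f9a0-1b2c-4d3e-8f4a-5b6c7d8e9f0a", + "metadata": {}, + "source": [ + "To get a feeling for what \"lazy\" means here: conceptually, this is similar to `astropy`'s memory-mapped FITS access, where the file contents stay on disk until you slice them. A rough, optional illustration (assuming the standard `EVENTS` extension name):" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "d8f9a0b1-2c3d-4e5f-9a6b-7c8d9e0f1a2b", + "metadata": {}, + "outputs": [], + "source": [ + "from astropy.io import fits\n", + "\n", + "# memmap=True (astropy's default) maps the file instead of reading it whole,\n", + "# so only the rows we actually slice are pulled into memory\n", + "with fits.open(fname, memmap=True) as hdul:\n", + "    print(hdul[\"EVENTS\"].data[\"TIME\"][:10])" + ] + },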
+ { + "cell_type": "code", + "execution_count": 16, + "id": "74ef9faf-ac33-4590-8f1e-46c11a0882d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 2245.34 MiB, increment: 0.00 MiB\n" + ] + } + ], + "source": [ + "from stingray.io import FITSTimeseriesReader\n", + "%memit fitsreader = FITSTimeseriesReader(fname, data_kind=\"times\")\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 17, + "id": "0e7440ca-3c8d-4e6c-a2fd-cc486156bc11", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 2245.36 MiB, increment: 0.00 MiB\n" + ] + } + ], + "source": [ + "from stingray.gti import time_intervals_from_gtis\n", + "start, stop = time_intervals_from_gtis(fitsreader.gti, segment_size)\n", + "%memit interval_times = np.array(list(zip(start, stop)))\n" + ] + },
+ { + "cell_type": "markdown", + "id": "89acff6e-1fe9-4705-8891-13ec6cd47fc6", + "metadata": {}, + "source": [ + "Let's create an iterable that uses the `FITSTimeseriesReader` to feed `AveragedPowerspectrum` a pre-binned light curve for each segment. Events are read in chunks from the FITS file and streamed as light curve segments on the fly." + ] + },
+ { + "cell_type": "code", + "execution_count": 18, + "id": "92518923-fbc7-429a-b1d8-dc81591fc965", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "78it [00:32, 2.43it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peak memory: 4531.69 MiB, increment: 2286.30 MiB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "array([1.02539377e-04, 1.03036920e-04, 9.54479649e-05, 1.13488540e-04,\n", + "       1.10641888e-04, 1.11452625e-04, 1.15657206e-04, 1.04863608e-04,\n", + "       9.25844488e-05, 9.50754514e-05], dtype=float64)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stingray.utils import histogram\n", + "\n", + "\n", + "def fits_times_iterable(fname, segment_size, sample_time):\n", + "    \"\"\"Create light curve iterables to be analyzed by AveragedPowerspectrum.from_lc_iterable.\"\"\"\n", + "    fitsreader = FITSTimeseriesReader(fname, data_kind=\"times\")\n", + "    start, stop = time_intervals_from_gtis(fitsreader.gti, segment_size)\n", + "    intvs = [[s, e] for s, e in zip(start, stop)]\n", + "    times = fitsreader.filter_at_time_intervals(intvs, check_gtis=True)\n", + "    for ts, (s, e) in zip(times, intvs):\n", + "        # bin the photon arrival times of this segment into a counts array\n", + "        lc = histogram(ts, bins=np.rint((e - s)/sample_time).astype(int), range=[s, e])\n", + "        yield lc\n", + "\n", + "\n", + "%memit ps_it = AveragedPowerspectrum.from_lc_iterable(fits_times_iterable(fname, segment_size, fine_sample_time), segment_size=segment_size, dt=fine_sample_time)\n", + "ps_it.power[:10]" + ] + },
+ { + "cell_type": "markdown", + "id": "2da603a0-6e82-4076-9e33-576145024735", + "metadata": {}, + "source": [ + "Hurray! We managed to keep the memory increment to ~2 GB, comparable to the cost of the `AveragedPowerspectrum` calculation alone!" + ] + },
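+ { + "cell_type": "markdown", + "id": "e9a0b1c2-3d4e-4f5a-8b6c-7d8e9f0a1b2c", + "metadata": {}, + "source": [ + "One last memory-friendly step: the averaged power spectrum itself still holds `segment_size / (2 * dt)` ≈ 6.4 million frequency bins. If we only need it for plotting or fitting, logarithmic rebinning shrinks it considerably (the rebin factor below is just an example):" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "f0b1c2d3-4e5f-4a6b-9c7d-8e9f0a1b2c3d", + "metadata": {}, + "outputs": [], + "source": [ + "# each frequency bin is ~1% wider than the previous one\n", + "ps_reb = ps_it.rebin_log(f=0.01)\n", + "print(f\"{ps_it.freq.size} bins before, {ps_reb.freq.size} after rebinning\")" + ] + },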
+ { + "cell_type": "code", + "execution_count": null, + "id": "3265e2f0-264d-48d8-8187-fb44b10d4f72", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
From a7f295fd7844eda09ecffa93463146e847bfe835 Mon Sep 17 00:00:00 2001 From: Matteo Bachetti Date: Fri, 18 Oct 2024 22:53:42 +0200 Subject: [PATCH 2/2] Update Dealing with large data files.ipynb --- Performance/Dealing with large data files.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Performance/Dealing with large data files.ipynb b/Performance/Dealing with large data files.ipynb index 623b8a9..1010d93 100644 --- a/Performance/Dealing with large data files.ipynb +++ b/Performance/Dealing with large data files.ipynb @@ -54,7 +54,7 @@ "source": [ "## Data preparation\n", "\n", - "Now we simulate and load a full dataset. Let's simulate a large observation, about 2GB" + "Now we simulate and load a full dataset. Let's simulate a large observation, about 2 GB. We use HENDRICS; you can install it with `pip install hendrics`." ] }, {