Feature/clean up optimisation #73

Merged
43 commits merged on Nov 23, 2023

Commits
8ea588e
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Jul 19, 2023
7fd7682
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Jul 27, 2023
0064b2e
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Aug 30, 2023
8259033
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Sep 7, 2023
ed8347f
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Oct 9, 2023
4231257
Merge branch 'develop' of github.com:ecmwf/polytope into develop
mathleur Oct 10, 2023
7090e06
make first fdb extraction work
mathleur Oct 12, 2023
f4b61f6
fix unmerging of date and time
mathleur Oct 12, 2023
c6385ab
add regular grid mapping for fdb and test, which does not work yet
mathleur Oct 13, 2023
a6f938c
fix regular grid test
mathleur Oct 13, 2023
17776a8
small fixes and first performance test
mathleur Oct 13, 2023
bb76ea4
make octahedral grid mapping faster and first fdb performance
mathleur Oct 18, 2023
6f2dbd8
create dictionary of latitude lines in octahedral mapping
mathleur Oct 18, 2023
7b18dce
add some sort of caching for longitude values in unmap
mathleur Oct 19, 2023
a898711
optimise second idx search
mathleur Oct 19, 2023
9faf711
optimise small bits
mathleur Oct 19, 2023
fb07954
calculate leaves' ancestors while finding leaves optimisation
mathleur Oct 20, 2023
03f5b17
small optimisation
mathleur Oct 20, 2023
5175919
store slice_axis_idx in hullslicer
mathleur Oct 20, 2023
fb2b110
try to make faster
mathleur Oct 24, 2023
722d0f0
add a null transformation and see effect on performance
mathleur Oct 24, 2023
a225c41
small optimisation
mathleur Oct 25, 2023
dd5c249
small optimisations
mathleur Oct 26, 2023
b32ff76
make new recursive get
mathleur Oct 27, 2023
0deab2e
time recursive get function
mathleur Oct 30, 2023
9a3f3bc
add scalability_plot
mathleur Oct 30, 2023
f2aa886
retrieve ranges of longitudes from FDB
mathleur Nov 2, 2023
245d0b4
make latlon requests to the fdb in a single path
mathleur Nov 7, 2023
15de427
request only 1 request to pyfdb
mathleur Nov 8, 2023
56d978a
clean up
mathleur Nov 10, 2023
7a7a2ac
clean up
mathleur Nov 10, 2023
2090029
black
mathleur Nov 10, 2023
f8922df
remove data
mathleur Nov 10, 2023
5842a1b
remove unnecessary code
mathleur Nov 10, 2023
1065c72
remove unnecessary code
mathleur Nov 14, 2023
6fffbff
renaming and small fixes
mathleur Nov 15, 2023
eef415d
renaming
mathleur Nov 16, 2023
0dfabef
fix regular grid problem with too many points found
mathleur Nov 20, 2023
458531d
make healpix grid work with cyclic axes
mathleur Nov 20, 2023
ec07170
black
mathleur Nov 20, 2023
d4697b8
fdb axes indices are not always in sorted order
mathleur Nov 21, 2023
4f72f1f
small fixes
mathleur Nov 22, 2023
3312772
clean up branch
mathleur Nov 23, 2023
Files changed
6 changes: 5 additions & 1 deletion .gitignore
@@ -5,4 +5,8 @@
polytope.egg-info
.pytest_cache
*.prof
*.idx
*.idx
*.grib
*.xml
site
.coverage
47 changes: 47 additions & 0 deletions performance/fdb_performance.py
@@ -0,0 +1,47 @@
import time

import pandas as pd

from polytope.datacube.backends.fdb import FDBDatacube
from polytope.engine.hullslicer import HullSlicer
from polytope.polytope import Polytope, Request
from polytope.shapes import Box, Select


class TestSlicingFDBDatacube:
def setup_method(self, method):
# Set up an FDB datacube with octahedral mapper, date/time merge and step type-change axis options
self.options = {
"values": {
"transformation": {
"mapper": {"type": "octahedral", "resolution": 1280, "axes": ["latitude", "longitude"]}
}
},
"date": {"transformation": {"merge": {"with": "time", "linkers": [" ", "00"]}}},
"step": {"transformation": {"type_change": "int"}},
}
self.config = {"class": "od", "expver": "0001", "levtype": "sfc", "step": 0}
self.fdbdatacube = FDBDatacube(self.config, axis_options=self.options)
self.slicer = HullSlicer()
self.API = Polytope(datacube=self.fdbdatacube, engine=self.slicer, axis_options=self.options)

# Testing different shapes
# @pytest.mark.skip(reason="can't install fdb branch on CI")
def test_fdb_datacube(self):
request = Request(
Select("step", [0]),
Select("levtype", ["sfc"]),
Select("date", [pd.Timestamp("20230625T120000")]),
Select("domain", ["g"]),
Select("expver", ["0001"]),
Select("param", ["167"]),
Select("class", ["od"]),
Select("stream", ["oper"]),
Select("type", ["an"]),
Box(["latitude", "longitude"], [0, 0], [10, 10]),
)
time1 = time.time()
result = self.API.retrieve(request)
print("ENTIRE TIME")
print(time.time() - time1)
print(len(result.leaves))
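The retrieved result is an index tree whose leaves carry the values that the FDB backend attaches during extraction (see give_fdb_val_to_node in the fdb.py diff below). A minimal sketch of collecting those values, assuming the request above succeeds; collect_leaf_values is a hypothetical helper, not part of this PR:

def collect_leaf_values(result):
    # Each leaf's .result attribute is set by the FDB backend during retrieval.
    return [leaf.result for leaf in result.leaves]

# e.g. values = collect_leaf_values(result); print(len(values))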
48 changes: 48 additions & 0 deletions performance/fdb_performance_3D.py
@@ -0,0 +1,48 @@
import time

import pandas as pd

from polytope.datacube.backends.fdb import FDBDatacube
from polytope.engine.hullslicer import HullSlicer
from polytope.polytope import Polytope, Request
from polytope.shapes import Box, Select, Span


class TestSlicingFDBDatacube:
def setup_method(self, method):
# Set up an FDB datacube with octahedral mapper, date/time merge and integer type-change axis options
self.options = {
"values": {
"transformation": {
"mapper": {"type": "octahedral", "resolution": 1280, "axes": ["latitude", "longitude"]}
}
},
"date": {"transformation": {"merge": {"with": "time", "linkers": [" ", "00"]}}},
"step": {"transformation": {"type_change": "int"}},
"levelist": {"transformation": {"type_change": "int"}},
}
self.config = {"class": "od", "expver": "0001", "levtype": "sfc"}
self.fdbdatacube = FDBDatacube(self.config, axis_options=self.options)
self.slicer = HullSlicer()
self.API = Polytope(datacube=self.fdbdatacube, engine=self.slicer, axis_options=self.options)

# Testing different shapes
# @pytest.mark.skip(reason="can't install fdb branch on CI")
def test_fdb_datacube(self):
request = Request(
Span("step", 1, 15),
Select("levtype", ["sfc"]),
Select("date", [pd.Timestamp("20231102T000000")]),
Select("domain", ["g"]),
Select("expver", ["0001"]),
Select("param", ["167"]),
Select("class", ["od"]),
Select("stream", ["oper"]),
Select("type", ["fc"]),
Box(["latitude", "longitude"], [0, 0], [3, 5]),
)
time1 = time.time()
result = self.API.retrieve(request)
print("ENTIRE TIME")
print(time.time() - time1)
print(len(result.leaves))
16 changes: 16 additions & 0 deletions performance/fdb_scalability_plot.py
@@ -0,0 +1,16 @@
import matplotlib.pyplot as plt

fdb_time = [
7.6377081871032715 - 7.558288812637329,
73.57192325592041 - 72.99611115455627,
733.2706120014191 - 727.7059993743896,
4808.3157522678375 - 4770.814565420151,
]
num_extracted_points = [1986, 19226, 191543, 1267134]

# for the 1.3M-point run, 100 latitudes were also used, which may explain why the scaling is not perfectly linear

plt.plot(num_extracted_points, fdb_time, marker="o")
plt.xlabel("Number of extracted points")
plt.ylabel("Polytope extraction time (in s)")
plt.show()
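Each entry of fdb_time above is the difference between an end and a start timestamp, i.e. the elapsed extraction time for one run. A minimal sketch, using only the two lists already defined, that makes the linearity check explicit by printing the per-point cost:

for n, t in zip(num_extracted_points, fdb_time):
    # roughly constant cost per point would indicate linear scaling
    print(f"{n} points: {t / n * 1e6:.1f} microseconds per point")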
81 changes: 0 additions & 81 deletions polytope/datacube/backends/FDB_datacube.py

This file was deleted.

1 change: 1 addition & 0 deletions polytope/datacube/backends/__init__.py
@@ -0,0 +1 @@
from ..backends.datacube import *
3 changes: 3 additions & 0 deletions polytope/datacube/backends/datacube.py
@@ -37,8 +37,11 @@ def _create_axes(self, name, values, transformation_type_key, transformation_opt
)
for blocked_axis in transformation.blocked_axes():
self.blocked_axes.append(blocked_axis)
for unwanted_axis in transformation.unwanted_axes():
self.unwanted_axes.append(unwanted_axis)
for axis_name in final_axis_names:
self.complete_axes.append(axis_name)
self.fake_axes.append(axis_name)
# if axis does not yet exist, create it

# first need to change the values so that we have right type
167 changes: 167 additions & 0 deletions polytope/datacube/backends/fdb.py
@@ -0,0 +1,167 @@
from copy import deepcopy

import pyfdb

from .datacube import Datacube, IndexTree


class FDBDatacube(Datacube):
def __init__(self, config={}, axis_options={}):
self.axis_options = axis_options
self.axis_counter = 0
self._axes = None
treated_axes = []
self.non_complete_axes = []
self.complete_axes = []
self.blocked_axes = []
self.unwanted_axes = []
self.fake_axes = []
self.unwanted_path = {}

partial_request = config
# Find the values available in the level 3 FDB datacube
# pyfdb returns them as a dictionary {axis_name: values_available, ...}
self.fdb = pyfdb.FDB()
self.fdb_coordinates = self.fdb.axes(partial_request).as_dict()
self.fdb_coordinates["values"] = []
for name, values in self.fdb_coordinates.items():
values.sort()
options = axis_options.get(name, {})
self._check_and_add_axes(options, name, values)
treated_axes.append(name)
self.complete_axes.append(name)

# add options to the axes that were just created above (e.g. "lat" for the mapper transformations)
for name in self._axes:
if name not in treated_axes:
options = axis_options.get(name, {})
val = self._axes[name].type
self._check_and_add_axes(options, name, val)

def remove_unwanted_axes(self, leaf_path):
for axis in self.unwanted_axes:
leaf_path.pop(axis)
return leaf_path

def get(self, requests: IndexTree, leaf_path={}):
# First when request node is root, go to its children
if requests.axis.name == "root":
for c in requests.children:
self.get(c)
# If the node's grandchildren are leaves, assign fdb values over the last two layers of the tree
else:
key_value_path = {requests.axis.name: requests.value}
ax = requests.axis
(key_value_path, leaf_path, self.unwanted_path) = ax.unmap_path_key(
key_value_path, leaf_path, self.unwanted_path
)
leaf_path |= key_value_path
if len(requests.children[0].children[0].children) == 0:
# remap this last key
self.get_2nd_last_values(requests, leaf_path)

# Otherwise remap the path for this key and iterate again over children
else:
for c in requests.children:
self.get(c, leaf_path)

def get_2nd_last_values(self, requests, leaf_path={}):
# In this function, we loop over the last two layers of the tree and store the start indices
# and lengths of the contiguous request ranges found in those layers
lat_length = len(requests.children)
range_lengths = [False] * lat_length
current_start_idxs = [False] * lat_length
fdb_node_ranges = [False] * lat_length
for i in range(len(requests.children)):
lat_child = requests.children[i]
lon_length = len(lat_child.children)
range_lengths[i] = [1] * lon_length
current_start_idxs[i] = [None] * lon_length
fdb_node_ranges[i] = [[IndexTree.root] * lon_length] * lon_length
range_length = deepcopy(range_lengths[i])
current_start_idx = deepcopy(current_start_idxs[i])
fdb_range_nodes = deepcopy(fdb_node_ranges[i])
key_value_path = {lat_child.axis.name: lat_child.value}
ax = lat_child.axis
(key_value_path, leaf_path, self.unwanted_path) = ax.unmap_path_key(
key_value_path, leaf_path, self.unwanted_path
)
leaf_path |= key_value_path
(range_lengths[i], current_start_idxs[i], fdb_node_ranges[i]) = self.get_last_layer_before_leaf(
lat_child, leaf_path, range_length, current_start_idx, fdb_range_nodes
)
self.give_fdb_val_to_node(leaf_path, range_lengths, current_start_idxs, fdb_node_ranges, lat_length)

def get_last_layer_before_leaf(self, requests, leaf_path, range_l, current_idx, fdb_range_n):
i = 0
for c in requests.children:
# now c are the leaves of the initial tree
key_value_path = {c.axis.name: c.value}
ax = c.axis
(key_value_path, leaf_path, self.unwanted_path) = ax.unmap_path_key(
key_value_path, leaf_path, self.unwanted_path
)
leaf_path |= key_value_path
last_idx = key_value_path["values"]
if current_idx[i] is None:
current_idx[i] = last_idx
fdb_range_n[i][range_l[i] - 1] = c
else:
if last_idx == current_idx[i] + range_l[i]:
range_l[i] += 1
fdb_range_n[i][range_l[i] - 1] = c
else:
key_value_path = {c.axis.name: c.value}
ax = c.axis
(key_value_path, leaf_path, self.unwanted_path) = ax.unmap_path_key(
key_value_path, leaf_path, self.unwanted_path
)
leaf_path |= key_value_path
i += 1
current_start_idx = key_value_path["values"]
current_idx[i] = current_start_idx
return (range_l, current_idx, fdb_range_n)

def give_fdb_val_to_node(self, leaf_path, range_lengths, current_start_idx, fdb_range_nodes, lat_length):
(output_values, original_indices) = self.find_fdb_values(
leaf_path, range_lengths, current_start_idx, lat_length
)
new_fdb_range_nodes = []
new_range_lengths = []
for j in range(lat_length):
for i in range(len(range_lengths[j])):
if current_start_idx[j][i] is not None:
new_fdb_range_nodes.append(fdb_range_nodes[j][i])
new_range_lengths.append(range_lengths[j][i])
sorted_fdb_range_nodes = [new_fdb_range_nodes[i] for i in original_indices]
sorted_range_lengths = [new_range_lengths[i] for i in original_indices]
for i in range(len(sorted_fdb_range_nodes)):
for k in range(sorted_range_lengths[i]):
n = sorted_fdb_range_nodes[i][k]
n.result = output_values[0][0][0][i][k]

def find_fdb_values(self, path, range_lengths, current_start_idx, lat_length):
path.pop("values")
fdb_requests = []
interm_request_ranges = []
for i in range(lat_length):
for j in range(len(range_lengths[i])):
if current_start_idx[i][j] is not None:
current_request_ranges = (current_start_idx[i][j], current_start_idx[i][j] + range_lengths[i][j])
interm_request_ranges.append(current_request_ranges)
request_ranges_with_idx = list(enumerate(interm_request_ranges))
sorted_list = sorted(request_ranges_with_idx, key=lambda x: x[1][0])
original_indices, sorted_request_ranges = zip(*sorted_list)
fdb_requests.append(tuple((path, sorted_request_ranges)))
output_values = self.fdb.extract(fdb_requests)
return (output_values, original_indices)

def datacube_natural_indexes(self, axis, subarray):
indexes = subarray[axis.name]
return indexes

def select(self, path, unmapped_path):
return self.fdb_coordinates

def ax_vals(self, name):
return self.fdb_coordinates.get(name, None)
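The core of the single-request optimisation is visible in get_last_layer_before_leaf and find_fdb_values: consecutive "values" indices along each latitude line are coalesced into contiguous ranges, the ranges are sorted by start index, and one fdb.extract call is issued for all of them. A self-contained sketch of the coalescing step, under the assumption that the indices for one latitude line arrive in increasing order (coalesce is a hypothetical helper, not part of the PR):

def coalesce(indices):
    # Group increasing grid indices into (start, length) runs so that each run
    # can be requested from the FDB as one contiguous value range.
    runs = []
    for idx in indices:
        if runs and idx == runs[-1][0] + runs[-1][1]:
            start, length = runs[-1]
            runs[-1] = (start, length + 1)
        else:
            runs.append((idx, 1))
    return runs

# e.g. coalesce([4, 5, 6, 10, 11]) == [(4, 3), (10, 2)]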