added one-dimensional ordering problem

thomasWeise · Nov 3, 2023 · 79c7bf0 · 79c7bf0
1 parent 1ba6a88
commit 79c7bf0
Show file tree

Hide file tree

Showing 9 changed files with 706 additions and 3 deletions.
diff --git a/examples/order1_from_dat.py b/examples/order1_from_dat.py
@@ -0,0 +1,174 @@
+"""
+Find a reasonable one-dimensional order for permutations.
+
+This program reads `dat` files in the following format:
+```
+EVALS    GENOTYPE    FITNESS
+1    [22, 7, 6, 26, 27, 19, 3, 1, ... 5, 21, 8, 17, 2, 16, 9, 23]    87018
+13    [20, 7, 6, 26, 18, 19, 9, 1, ... 25, 13, 23, 16, 15, 24]    85456
+20    [20, 7, 18, 26, 6, 16, 9, 1,  ...  21, 13, 12, 19, 15, 17]    84152
+29    [20, 11, 14, 25, 5, 16, 15, 1,  ...  21, 13, 12, 9, 19, 17]    83180
+32    [20, 10, 14, 25, 5, 12, 15, 1,  ... 17, 13, 16, 9, 19, 21]    82846
+34    [20, 15, 14, 25, 5, 12, 10, 1,  ...  6, 17, 13, 16, 9, 19, 21]    78204
+```
+"""
+
+import argparse
+from os import listdir
+from os.path import basename, isdir, isfile, join
+from re import Pattern
+from re import compile as re_compile
+from typing import Any, Callable, Final
+
+import numpy as np
+from moptipy.algorithms.so.rls import RLS
+from moptipy.api.execution import Execution
+from moptipy.operators.permutations.op0_shuffle import Op0Shuffle
+from moptipy.operators.permutations.op1_swap2 import Op1Swap2
+from moptipy.utils.console import logger
+from moptipy.utils.help import argparser
+from moptipy.utils.path import Path
+from moptipy.utils.types import check_to_int_range
+
+from moptipyapps.order1d.distances import swap_distance
+from moptipyapps.order1d.instance import Instance
+from moptipyapps.order1d.objective import OneDimensionalDistribution
+from moptipyapps.order1d.space import OrderingSpace
+
+
+def parse_data(path: str, collector: Callable[[
+        tuple[str, int, int, np.ndarray]], Any],
+        fitness_limit: int, pattern: Pattern) -> None:
+    """
+    Parse a `dat` file or recursively parse a directory of such files.
+
+    If `path` is a directory, all of its entries are processed recursively.
+    If `path` is a regular file, it is only parsed if its base name matches
+    `pattern` (via `pattern.match`, i.e., anchored at the start of the name).
+    Each line that contains a bracketed permutation and whose fitness does
+    not exceed `fitness_limit` is handed to `collector` as a tuple
+    `(file name, evals, fitness, permutation)`. The permutation values are
+    shifted from the 1-based numbers in the file to 0-based numbers. Lines
+    without a `[...]` section (such as the header line) are skipped.
+
+    :param path: the path to a file or directory
+    :param collector: the collector function to invoke for each loaded row
+    :param fitness_limit: the maximum acceptable fitness, rows with a
+        larger fitness are skipped
+    :param pattern: the pattern that the file names must match
+    """
+    the_path: Final[Path] = Path.path(path)
+    if isdir(the_path):  # recursively parse directories
+        logger(f"recursing into directory '{the_path}'.")
+        for subpath in listdir(the_path):
+            parse_data(join(the_path, subpath), collector, fitness_limit,
+                       pattern)
+        return
+
+    if not isfile(the_path):
+        return  # if it is not a file, we quit
+    the_name: Final[str] = basename(the_path)
+    if not pattern.match(the_name):
+        return  # file does not match
+
+    # parse the file; the `with` block closes the stream when we are done
+    with the_path.open_for_read() as stream:
+        for oline in stream:
+            line = oline.strip()
+            if not line:
+                continue  # skip empty lines
+            bracket_open: int = line.find("[")
+            if bracket_open <= 0:  # no "[" or nothing in front of it
+                continue
+            bracket_close: int = line.find("]", bracket_open + 1)
+            if bracket_close <= bracket_open:  # no matching "]"
+                continue
+            # the fitness is the text after the closing bracket
+            f: int = check_to_int_range(line[bracket_close + 1:],
+                                        "fitness", 0, 1_000_000_000_000)
+            if f > fitness_limit:
+                continue  # too bad: ignore this row
+            # the evals are the text in front of the opening bracket
+            evals: int = check_to_int_range(
+                line[:bracket_open].strip(), "evals", 1,
+                1_000_000_000_000_000)
+            # convert the 1-based permutation in the file to 0-based
+            perm: list[int] = [
+                check_to_int_range(s, "perm", 1, 1_000_000_000) - 1
+                for s in line[bracket_open + 1:bracket_close].split(",")]
+            collector((the_name, evals, f, np.array(perm)))
+
+
+def get_tags(data: tuple[str, int, int, np.ndarray]) -> tuple[str, str, str]:
+    """
+    Extract the tags that should be stored together with a data element.
+
+    The tags are the first three components of the data tuple, where the
+    second and third component are converted to strings.
+
+    :param data: the data tuple
+    :return: the first component and the string representations of the
+        second and third component
+    """
+    name_tag = data[0]
+    evals_tag: str = str(data[1])
+    fitness_tag: str = str(data[2])
+    return name_tag, evals_tag, fitness_tag
+
+
+def get_distance(a: tuple[str, int, int, np.ndarray],
+                 b: tuple[str, int, int, np.ndarray]) -> int:
+    """
+    Compute the distance between two data elements.
+
+    The distance is the swap distance between the arrays stored as the
+    fourth component of each of the two tuples.
+
+    :param a: the first data element
+    :param b: the second data element
+    :return: the swap distance of the two arrays
+    """
+    first_perm = a[3]
+    second_perm = b[3]
+    return swap_distance(first_perm, second_perm)
+
+
+def run(source: str, dest: str, max_fes: int = 1_000_000,
+        fitness_limit: int = 1_000_000_000,
+        file_name_regex: str = ".*") -> None:
+    """
+    Load the data, build the ordering instance, and optimize it with RLS.
+
+    First, all rows are collected from `source` via `parse_data`. Then an
+    `Instance` is constructed from them using `get_tags` and `get_distance`.
+    Finally, an RLS with shuffle and swap-2 operators is applied for at most
+    `max_fes` FEs and its log is written to `dest`.
+
+    :param source: the source file or directory
+    :param dest: the destination (log) file
+    :param max_fes: the maximum number of FEs to perform
+    :param fitness_limit: the limit handed to `parse_data`, rows whose
+        fitness is larger than this value are not loaded
+    :param file_name_regex: the regular expression that file names must
+        match
+    """
+    logger(f"invoked program with source='{source}', dest='{dest}', "
+           f"max_fes={max_fes}, fitness_limit={fitness_limit}, and "
+           f"file_name_regex='{file_name_regex}'.")
+
+    # step 1: collect all the rows from the matching files
+    pattern: Final[Pattern] = re_compile(file_name_regex)
+    logger(f"now loading data from '{source}' matching to '{pattern}'.")
+    rows: list[tuple[str, int, int, np.ndarray]] = []
+    parse_data(source, rows.append, fitness_limit, pattern)
+
+    # step 2: turn the rows into a problem instance
+    logger(f"finished loading {len(rows)} rows of data, "
+           "now constructing distance rank matrix.")
+    instance: Final[Instance] = Instance.from_sequence_and_distance(
+        rows, get_tags, get_distance)
+    del rows  # the raw rows are not needed anymore
+
+    # step 3: configure and run the optimization
+    logger(f"finished constructing matrix with {len(instance)} rows, "
+           "now doing optimization for "
+           f"{max_fes} FEs and writing result to '{dest}'.")
+    space: Final[OrderingSpace] = OrderingSpace(instance)
+    objective: Final[OneDimensionalDistribution] = \
+        OneDimensionalDistribution(instance)
+    algorithm: Final[RLS] = RLS(Op0Shuffle(space), Op1Swap2())
+    execution: Final[Execution] = (
+        Execution().set_solution_space(space)
+        .set_objective(objective)
+        .set_algorithm(algorithm)
+        .set_max_fes(max_fes)
+        .set_log_improvements(True)
+        .set_log_file(dest))
+    with execution.execute():
+        pass  # nothing to do here, we only want the log file
+    logger("all done.")
+
+
+# Perform the optimization: parse the command line and invoke `run`.
+# All arguments are optional positional arguments with default values.
+if __name__ == "__main__":
+    parser: Final[argparse.ArgumentParser] = argparser(
+        __file__, "One-Dimensional Ordering of Permutations",
+        "Run the one-dimensional order of permutations experiment.")
+    parser.add_argument(
+        "source", help="the directory or file with the input data",
+        type=Path.path, nargs="?", default="./")
+    parser.add_argument(
+        "dest", help="the file to write the output to",
+        type=Path.path, nargs="?", default="./result.txt")
+    # rows with a fitness larger than this limit are skipped when loading,
+    # so the limit is an upper bound, not a minimum
+    parser.add_argument("fitnessLimit", help="the maximum acceptable fitness",
+                        type=int, nargs="?", default=1_000_000_000)
+    parser.add_argument("maxFEs", help="the maximum FEs to perform",
+                        type=int, nargs="?", default=1_000_000)
+    parser.add_argument(
+        "fileNameRegEx",
+        help="a regular expression that file names must match",
+        type=str, nargs="?", default=".*")
+    args: Final[argparse.Namespace] = parser.parse_args()
+    # note: `run` expects `max_fes` before `fitness_limit`, whereas the
+    # command line has `fitnessLimit` before `maxFEs`
+    run(args.source, args.dest, args.maxFEs, args.fitnessLimit,
+        args.fileNameRegEx)
diff --git a/moptipyapps/order1d/__init__.py b/moptipyapps/order1d/__init__.py
@@ -0,0 +1,28 @@
+"""
+A set of tools for ordering objects in 1 dimension.
+
+Let's assume that we have `n` objects and a distance metric that can compute
+the distance between two objects. We do not know and also do not care in
+how many dimensions the objects exist - we just have objects and a distance
+metric.
+
+Now we want to find a one-dimensional order of the objects that reflects their
+original distance-based topology. For each object `a`, we want that its
+closest neighbor in the order is also its actual closest neighbor according to
+the distance metric. Its second-closest neighbor should be the actual
+second-closest neighbor according to the distance metric. And so on.
+
+Since we only care about the object order and do not want to metrically map
+the distances to one dimension, we can represent the solution as permutation
+of natural numbers.
+
+Of course, in a one-dimensional order, each object has exactly two closest
+neighbors (the one on its left and the one on its right) unless it is situated
+either at the beginning or end of the order, in which case it has exactly one
+closest neighbor. Based on the actual distance metric, an object may have any
+number of closest neighbors, maybe only one, or maybe three equally-far away
+objects. So it is not clear whether a perfect mapping to the one-dimensional
+permutations even exists.
+
+But we can try to find one that comes as close as possible to the real deal.
+"""
diff --git a/moptipyapps/order1d/distances.py b/moptipyapps/order1d/distances.py
@@ -0,0 +1,65 @@
+"""Some examples for distance metrics."""
+
+from typing import Final
+
+import numba  # type: ignore
+import numpy as np
+from moptipy.utils.nputils import DEFAULT_BOOL
+
+
+@numba.njit(cache=True, inline="always", fastmath=True, boundscheck=False)
+def swap_distance(p1: np.ndarray, p2: np.ndarray) -> int:
+    """
+    Compute the swap distance between two permutations `p1` and `p2`.
+
+    The swap distance is the minimum number of swaps that are needed to
+    transform `p1` into `p2` or `p2` into `p1`. This function is symmetric.
+
+    The length of the permutations is an upper bound for the number of
+    required swaps. This follows from Selection Sort: Assume that we want to
+    transform `p1` into `p2` and walk through `p1` from its beginning to its
+    end. If the element at index `i` is already right (`p1[i] == p2[i]`),
+    nothing needs to be done. Otherwise, the right element must be located
+    at some index `j>i`, because all elements before index `i` have already
+    been fixed. We then swap `p1[i]` with `p1[j]`, after which
+    `p1[i] == p2[i]` holds, and move on to the next index. Once the end of
+    `p1` is reached, `p1[i] == p2[i]` holds for all `i`. Since at most one
+    swap was performed per index, there can never be more swaps than the
+    arrays are long.
+
+    The implementation counts the cycles of the mapping
+    `p2[np.argsort(p1)]` and returns the length minus the number of cycles.
+
+    :param p1: the first permutation
+    :param p2: the second permutation
+    :return: the swap distance, always between `0` and `len(p1)`
+
+    >>> swap_distance(np.array([0, 1, 2, 3]), np.array([3, 1, 2, 0]))
+    1
+    >>> swap_distance(np.array([0, 1, 2]), np.array([0, 1, 2]))
+    0
+    >>> swap_distance(np.array([1, 0, 2]), np.array([0, 1, 2]))
+    1
+    >>> swap_distance(np.array([0, 1, 2]), np.array([1, 0, 2]))
+    1
+    >>> swap_distance(np.array([0, 1, 2]), np.array([2, 0, 1]))
+    2
+    >>> swap_distance(np.array([2, 0, 1]), np.array([0, 1, 2]))
+    2
+    >>> swap_distance(np.arange(10), np.array([4, 8, 1, 5, 9, 3, 6, 0, 7, 2]))
+    7
+    >>> swap_distance(np.array([4, 8, 1, 5, 9, 3, 6, 0, 7, 2]), np.arange(10))
+    7
+    """
+    length: Final[int] = len(p1)
+    # assuming `p1` is a permutation of `0..length-1`: `mapping[v]` is the
+    # value that `p2` holds at the index where `p1` holds `v`
+    mapping: np.ndarray = p2[np.argsort(p1)]
+    pending: np.ndarray = np.ones(length, DEFAULT_BOOL)
+    cycles: int = 0
+
+    for start in range(length):
+        if not pending[start]:
+            continue  # this index belongs to a cycle we already visited
+        # we found a new cycle: count it and mark all of its members
+        cycles += 1
+        pending[start] = False
+        current = mapping[start]
+        while current != start:
+            pending[current] = False
+            current = mapping[current]
+
+    return length - cycles