From 1836c53d5f0f17f112431acf9b6120d366316329 Mon Sep 17 00:00:00 2001
From: Rosie Wood <rwood@turing.ac.uk>
Date: Wed, 11 Sep 2024 12:47:09 +0100
Subject: [PATCH 1/5] create base class for recognition runners (RecRunner) and
 add search method for searching predictions

---
 mapreader/spot_text/deepsolo_runner.py    | 133 +--------
 mapreader/spot_text/dptext_detr_runner.py |   2 +-
 mapreader/spot_text/maptext_runner.py     | 134 +--------
 mapreader/spot_text/rec_runner_base.py    | 322 ++++++++++++++++++++++
 mapreader/spot_text/runner_base.py        |   8 +-
 5 files changed, 336 insertions(+), 263 deletions(-)
 create mode 100644 mapreader/spot_text/rec_runner_base.py

diff --git a/mapreader/spot_text/deepsolo_runner.py b/mapreader/spot_text/deepsolo_runner.py
index fefb1edd..298d11ea 100644
--- a/mapreader/spot_text/deepsolo_runner.py
+++ b/mapreader/spot_text/deepsolo_runner.py
@@ -11,7 +11,6 @@
     )
 
 import geopandas as gpd
-import numpy as np
 import pandas as pd
 import torch
 from adet.config import get_cfg
@@ -21,18 +20,16 @@
 except ImportError:
     raise ImportError("[ERROR] Please install Detectron2")
 
-from shapely import LineString, MultiPolygon, Polygon
-
 # first assert we are using the deep solo version of adet
 if adet.__version__ != "0.2.0-deepsolo":
     raise ImportError(
         "[ERROR] Please install DeepSolo from the following link: https://github.com/rwood-97/DeepSolo"
     )
 
-from .runner_base import Runner
+from .rec_runner_base import RecRunner
 
 
-class DeepSoloRunner(Runner):
+class DeepSoloRunner(RecRunner):
     def __init__(
         self,
         patch_df: pd.DataFrame | gpd.GeoDataFrame | str | pathlib.Path,
@@ -68,6 +65,8 @@ def __init__(
         self.patch_predictions = {}
         self.parent_predictions = {}
         self.geo_predictions = {}
+        self.search_results = {}
+        self.geo_search_results = {}
 
         # setup the config
         cfg = get_cfg()  # get a fresh new config
@@ -231,50 +230,6 @@ def __init__(
         # setup the predictor
         self.predictor = DefaultPredictor(cfg)
 
-    def get_patch_predictions(
-        self,
-        outputs: dict,
-        return_dataframe: bool = False,
-        min_ioa: float = 0.7,
-    ) -> dict | pd.DataFrame:
-        """Post process the model outputs to get patch predictions.
-
-        Parameters
-        ----------
-        outputs : dict
-            The outputs from the model.
-        return_dataframe : bool, optional
-            Whether to return the predictions as a pandas DataFrame, by default False
-        min_ioa : float, optional
-            The minimum intersection over area to consider two polygons the same, by default 0.7
-
-        Returns
-        -------
-        dict or pd.DataFrame
-            A dictionary containing the patch predictions or a DataFrame if `as_dataframe` is True.
-        """
-        # key for predictions
-        image_id = outputs["image_id"]
-        self.patch_predictions[image_id] = []
-
-        # get instances
-        instances = outputs["instances"].to("cpu")
-        ctrl_pnts = instances.ctrl_points.numpy()
-        scores = instances.scores.tolist()
-        recs = instances.recs
-        bd_pts = np.asarray(instances.bd)
-
-        self._post_process(image_id, ctrl_pnts, scores, recs, bd_pts)
-        self._deduplicate(image_id, min_ioa=min_ioa)
-
-        if return_dataframe:
-            return self._dict_to_dataframe(self.patch_predictions, geo=False)
-        return self.patch_predictions
-
-    def _process_ctrl_pnt(self, pnt):
-        points = pnt.reshape(-1, 2)
-        return points
-
     def _ctc_decode_recognition(self, rec):
         last_char = "###"
         s = ""
@@ -291,83 +246,3 @@ def _ctc_decode_recognition(self, rec):
             else:
                 last_char = "###"
         return s
-
-    def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts, alpha=0.4):
-        for ctrl_pnt, score, rec, bd in zip(ctrl_pnts, scores, recs, bd_pnts):
-            # draw polygons
-            if bd is not None:
-                bd = np.hsplit(bd, 2)
-                bd = np.vstack([bd[0], bd[1][::-1]])
-                polygon = Polygon(bd).buffer(0)
-
-                if isinstance(polygon, MultiPolygon):
-                    polygon = polygon.convex_hull
-
-            # draw center lines
-            line = self._process_ctrl_pnt(ctrl_pnt)
-            line = LineString(line)
-
-            # draw text
-            text = self._ctc_decode_recognition(rec)
-            if self.voc_size == 37:
-                text = text.upper()
-            # text = "{:.2f}: {}".format(score, text)
-            text = f"{text}"
-            score = f"{score:.2f}"
-
-            self.patch_predictions[image_id].append([polygon, text, score])
-
-    @staticmethod
-    def _dict_to_dataframe(
-        preds: dict,
-        geo: bool = False,
-        parent: bool = False,
-    ) -> pd.DataFrame:
-        """Convert the predictions dictionary to a pandas DataFrame.
-
-        Parameters
-        ----------
-        preds : dict
-            A dictionary of predictions.
-        geo : bool, optional
-            Whether the dictionary is georeferenced coords (or pixel bounds), by default True
-        parent : bool, optional
-            Whether the dictionary is at parent level, by default False
-
-        Returns
-        -------
-        pd.DataFrame
-            A pandas DataFrame containing the predictions.
-        """
-        if geo:
-            columns = ["geometry", "crs", "text", "score"]
-        else:
-            columns = ["geometry", "text", "score"]
-
-        if parent:
-            columns.append("patch_id")
-
-        preds_df = pd.concat(
-            pd.DataFrame(
-                preds[k],
-                index=np.full(len(preds[k]), k),
-                columns=columns,
-            )
-            for k in preds.keys()
-        )
-
-        if geo:
-            # get the crs (should be the same for all)
-            if not preds_df["crs"].nunique() == 1:
-                raise ValueError("[ERROR] Multiple crs found in the predictions.")
-            crs = preds_df["crs"].unique()[0]
-
-            preds_df = gpd.GeoDataFrame(
-                preds_df,
-                geometry="geometry",
-                crs=crs,
-            )
-
-        preds_df.index.name = "image_id"
-        preds_df.reset_index(inplace=True)  # reset index to get image_id as a column
-        return preds_df
diff --git a/mapreader/spot_text/dptext_detr_runner.py b/mapreader/spot_text/dptext_detr_runner.py
index e9c38113..c07873b2 100644
--- a/mapreader/spot_text/dptext_detr_runner.py
+++ b/mapreader/spot_text/dptext_detr_runner.py
@@ -98,7 +98,7 @@ def get_patch_predictions(
         Returns
         -------
         dict or pd.DataFrame
-            A dictionary containing the patch predictions or a DataFrame if `as_dataframe` is True.
+            A dictionary containing the patch predictions or a DataFrame if `return_dataframe` is True.
         """
         # key for predictions
         image_id = outputs["image_id"]
diff --git a/mapreader/spot_text/maptext_runner.py b/mapreader/spot_text/maptext_runner.py
index d876f48c..a2bcad79 100644
--- a/mapreader/spot_text/maptext_runner.py
+++ b/mapreader/spot_text/maptext_runner.py
@@ -11,7 +11,6 @@
     )
 
 import geopandas as gpd
-import numpy as np
 import pandas as pd
 import torch
 from adet.config import get_cfg
@@ -22,7 +21,6 @@
     raise ImportError("[ERROR] Please install Detectron2")
 
 from adet.utils.vitae_predictor import ViTAEPredictor
-from shapely import LineString, MultiPolygon, Polygon
 
 # first assert we are using the deep solo version of adet
 if adet.__version__ != "0.2.0-maptextpipeline":
@@ -30,10 +28,10 @@
         "[ERROR] Please install MapTextPipeline from the following link: https://github.com/rwood-97/MapTextPipeline"
     )
 
-from .runner_base import Runner
+from .rec_runner_base import RecRunner
 
 
-class MapTextRunner(Runner):
+class MapTextRunner(RecRunner):
     def __init__(
         self,
         patch_df: pd.DataFrame | gpd.GeoDataFrame | str | pathlib.Path,
@@ -44,7 +42,7 @@ def __init__(
         device: str = "default",
         delimiter: str = ",",
     ) -> None:
-        """_summary_
+        """Initialise the MapTextRunner.
 
         Parameters
         ----------
@@ -68,6 +66,8 @@ def __init__(
         self.patch_predictions = {}
         self.parent_predictions = {}
         self.geo_predictions = {}
+        self.search_results = {}
+        self.geo_search_results = {}
 
         # setup the config
         cfg = get_cfg()  # get a fresh new config
@@ -379,50 +379,6 @@ def __init__(
             self.predictor = ViTAEPredictor(cfg)
         self.predictor = DefaultPredictor(cfg)
 
-    def get_patch_predictions(
-        self,
-        outputs: dict,
-        return_dataframe: bool = False,
-        min_ioa: float = 0.7,
-    ) -> dict | pd.DataFrame:
-        """Post process the model outputs to get patch predictions.
-
-        Parameters
-        ----------
-        outputs : dict
-            The outputs from the model.
-        return_dataframe : bool, optional
-            Whether to return the predictions as a pandas DataFrame, by default False
-        min_ioa : float, optional
-            The minimum intersection over area to consider two polygons the same, by default 0.7
-
-        Returns
-        -------
-        dict or pd.DataFrame
-            A dictionary containing the patch predictions or a DataFrame if `as_dataframe` is True.
-        """
-        # key for predictions
-        image_id = outputs["image_id"]
-        self.patch_predictions[image_id] = []
-
-        # get instances
-        instances = outputs["instances"].to("cpu")
-        ctrl_pnts = instances.ctrl_points.numpy()
-        scores = instances.scores.tolist()
-        recs = instances.recs
-        bd_pts = np.asarray(instances.bd)
-
-        self._post_process(image_id, ctrl_pnts, scores, recs, bd_pts)
-        self._deduplicate(image_id, min_ioa=min_ioa)
-
-        if return_dataframe:
-            return self._dict_to_dataframe(self.patch_predictions, geo=False)
-        return self.patch_predictions
-
-    def _process_ctrl_pnt(self, pnt):
-        points = pnt.reshape(-1, 2)
-        return points
-
     def _ctc_decode_recognition(self, rec):
         last_char = "###"
         s = ""
@@ -443,83 +399,3 @@ def _ctc_decode_recognition(self, rec):
             else:
                 last_char = "###"
         return s
-
-    def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts, alpha=0.4):
-        for ctrl_pnt, score, rec, bd in zip(ctrl_pnts, scores, recs, bd_pnts):
-            # draw polygons
-            if bd is not None:
-                bd = np.hsplit(bd, 2)
-                bd = np.vstack([bd[0], bd[1][::-1]])
-                polygon = Polygon(bd).buffer(0)
-
-                if isinstance(polygon, MultiPolygon):
-                    polygon = polygon.convex_hull
-
-            # draw center lines
-            line = self._process_ctrl_pnt(ctrl_pnt)
-            line = LineString(line)
-
-            # draw text
-            text = self._ctc_decode_recognition(rec)
-            if self.voc_size == 37:
-                text = text.upper()
-            # text = "{:.2f}: {}".format(score, text)
-            text = f"{text}"
-            score = f"{score:.2f}"
-
-            self.patch_predictions[image_id].append([polygon, text, score])
-
-    @staticmethod
-    def _dict_to_dataframe(
-        preds: dict,
-        geo: bool = False,
-        parent: bool = False,
-    ) -> pd.DataFrame:
-        """Convert the predictions dictionary to a pandas DataFrame.
-
-        Parameters
-        ----------
-        preds : dict
-            A dictionary of predictions.
-        geo : bool, optional
-            Whether the dictionary is georeferenced coords (or pixel bounds), by default True
-        parent : bool, optional
-            Whether the dictionary is at parent level, by default False
-
-        Returns
-        -------
-        pd.DataFrame
-            A pandas DataFrame containing the predictions.
-        """
-        if geo:
-            columns = ["geometry", "crs", "text", "score"]
-        else:
-            columns = ["geometry", "text", "score"]
-
-        if parent:
-            columns.append("patch_id")
-
-        preds_df = pd.concat(
-            pd.DataFrame(
-                preds[k],
-                index=np.full(len(preds[k]), k),
-                columns=columns,
-            )
-            for k in preds.keys()
-        )
-
-        if geo:
-            # get the crs (should be the same for all)
-            if not preds_df["crs"].nunique() == 1:
-                raise ValueError("[ERROR] Multiple crs found in the predictions.")
-            crs = preds_df["crs"].unique()[0]
-
-            preds_df = gpd.GeoDataFrame(
-                preds_df,
-                geometry="geometry",
-                crs=crs,
-            )
-
-        preds_df.index.name = "image_id"
-        preds_df.reset_index(inplace=True)  # reset index to get image_id as a column
-        return preds_df
diff --git a/mapreader/spot_text/rec_runner_base.py b/mapreader/spot_text/rec_runner_base.py
new file mode 100644
index 00000000..c392d7ef
--- /dev/null
+++ b/mapreader/spot_text/rec_runner_base.py
@@ -0,0 +1,322 @@
+from __future__ import annotations
+
+import re
+
+import geopandas as gpd
+import matplotlib.patches as patches
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from PIL import Image
+from shapely import LineString, MultiPolygon, Polygon
+
+from .runner_base import Runner
+
+
+class RecRunner(Runner):
+    def get_patch_predictions(
+        self,
+        outputs: dict,
+        return_dataframe: bool = False,
+        min_ioa: float = 0.7,
+    ) -> dict | pd.DataFrame:
+        """Post process the model outputs to get patch predictions.
+
+        Parameters
+        ----------
+        outputs : dict
+            The outputs from the model.
+        return_dataframe : bool, optional
+            Whether to return the predictions as a pandas DataFrame, by default False
+        min_ioa : float, optional
+            The minimum intersection over area to consider two polygons the same, by default 0.7
+
+        Returns
+        -------
+        dict or pd.DataFrame
+            A dictionary containing the patch predictions or a DataFrame if `return_dataframe` is True.
+        """
+        # key for predictions
+        image_id = outputs["image_id"]
+        self.patch_predictions[image_id] = []
+
+        # get instances
+        instances = outputs["instances"].to("cpu")
+        ctrl_pnts = instances.ctrl_points.numpy()
+        scores = instances.scores.tolist()
+        recs = instances.recs
+        bd_pts = np.asarray(instances.bd)
+
+        self._post_process(image_id, ctrl_pnts, scores, recs, bd_pts)
+        self._deduplicate(image_id, min_ioa=min_ioa)
+
+        if return_dataframe:
+            return self._dict_to_dataframe(self.patch_predictions, geo=False)
+        return self.patch_predictions
+
+    def _process_ctrl_pnt(self, pnt):
+        points = pnt.reshape(-1, 2)
+        return points
+
+    def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts, alpha=0.4):
+        for ctrl_pnt, score, rec, bd in zip(ctrl_pnts, scores, recs, bd_pnts):
+            # draw polygons
+            if bd is not None:
+                bd = np.hsplit(bd, 2)
+                bd = np.vstack([bd[0], bd[1][::-1]])
+                polygon = Polygon(bd).buffer(0)
+
+                if isinstance(polygon, MultiPolygon):
+                    polygon = polygon.convex_hull
+
+            # draw center lines
+            line = self._process_ctrl_pnt(ctrl_pnt)
+            line = LineString(line)
+
+            # draw text
+            text = self._ctc_decode_recognition(rec)
+            if self.voc_size == 37:
+                text = text.upper()
+            # text = "{:.2f}: {}".format(score, text)
+            text = f"{text}"
+            score = f"{score:.2f}"
+
+            self.patch_predictions[image_id].append([polygon, text, score])
+
+    @staticmethod
+    def _dict_to_dataframe(
+        preds: dict,
+        geo: bool = False,
+        parent: bool = False,
+    ) -> pd.DataFrame:
+        """Convert the predictions dictionary to a pandas DataFrame.
+
+        Parameters
+        ----------
+        preds : dict
+            A dictionary of predictions.
+        geo : bool, optional
+            Whether the dictionary is georeferenced coords (or pixel bounds), by default False
+        parent : bool, optional
+            Whether the dictionary is at parent level, by default False
+
+        Returns
+        -------
+        pd.DataFrame
+            A pandas DataFrame containing the predictions.
+        """
+        if geo:
+            columns = ["geometry", "crs", "text", "score"]
+        else:
+            columns = ["geometry", "text", "score"]
+
+        if parent:
+            columns.append("patch_id")
+
+        if len(preds.keys()):
+            preds_df = pd.concat(
+                pd.DataFrame(
+                    preds[k],
+                    index=np.full(len(preds[k]), k),
+                    columns=columns,
+                )
+                for k in preds.keys()
+            )
+        else:
+            preds_df = pd.DataFrame(columns=columns)  # empty dataframe
+
+        if geo:
+            # get the crs (should be the same for all)
+            if not preds_df["crs"].nunique() == 1:
+                raise ValueError("[ERROR] Multiple crs found in the predictions.")
+            crs = preds_df["crs"].unique()[0]
+
+            preds_df = gpd.GeoDataFrame(
+                preds_df,
+                geometry="geometry",
+                crs=crs,
+            )
+
+        preds_df.index.name = "image_id"
+        preds_df.reset_index(inplace=True)  # reset index to get image_id as a column
+        return preds_df
+
+    def search_preds(
+        self, search_text: str, ignore_case: bool = True, return_dataframe: bool = False
+    ) -> dict | pd.DataFrame:
+        """Search the predictions for specific text. Accepts regex.
+
+        Parameters
+        ----------
+        search_text : str
+            The text to search for. Can be a regex pattern.
+        ignore_case : bool, optional
+            Whether to ignore case when searching, by default True.
+        return_dataframe : bool, optional
+            Whether to return the results as a pandas DataFrame, by default False.
+
+        Returns
+        -------
+        dict | pd.DataFrame
+            A dictionary containing the search results or a DataFrame if `return_dataframe` is True.
+
+        Raises
+        ------
+        ValueError
+            If no parent predictions are found.
+        """
+        # reset the search results
+        self.search_results = {}
+
+        # whether to ignore case
+        kwargs = {"flags": re.IGNORECASE} if ignore_case else {}
+
+        if self.parent_predictions == {}:
+            raise ValueError(
+                "[ERROR] No parent predictions found. You may need to run `convert_to_parent_pixel_bounds()`."
+            )
+
+        for image_id, preds in self.parent_predictions.items():
+            for instance in preds:
+                # ["geometry", "text", "score"]
+                if re.search(search_text, instance[1], **kwargs):
+                    if image_id in self.search_results:
+                        self.search_results[image_id].append(instance)
+                    else:
+                        self.search_results[image_id] = [instance]
+
+        if return_dataframe:
+            return self._dict_to_dataframe(self.search_results, parent=True)
+        return self.search_results
+
+    def show_search_results(
+        self,
+        parent_id: str,
+        figsize: tuple | None = (10, 10),
+        border_color: str | None = "r",
+        text_color: str | None = "b",
+        image_width_resolution: int | None = None,
+        return_fig: bool = False,
+    ) -> None:
+        """Show the search results on an image.
+
+        Parameters
+        ----------
+        parent_id : str
+            The image ID to show the predictions on (must be parent level).
+        figsize : tuple | None, optional
+            The size of the figure, by default (10, 10)
+        border_color : str | None, optional
+            The color of the border of the polygons, by default "r"
+        text_color : str | None, optional
+            The color of the text, by default "b"
+        image_width_resolution : int | None, optional
+            The maximum resolution of the image width, by default None
+        return_fig : bool, optional
+            Whether to return the figure, by default False
+
+        Returns
+        -------
+        fig
+            The matplotlib figure if `return_fig` is True.
+
+        Raises
+        ------
+        ValueError
+            If the image ID is not found in the parent predictions.
+        """
+        if parent_id in self.parent_predictions.keys():
+            image_path = self.parent_df.loc[parent_id, "image_path"]
+        else:
+            raise ValueError(f"[ERROR] {parent_id} not found in parent predictions.")
+
+        img = Image.open(image_path)
+
+        # if image_width_resolution is specified, resize the image
+        if image_width_resolution:
+            new_width = int(image_width_resolution)
+            rescale_factor = new_width / img.width
+            new_height = int(img.height * rescale_factor)
+            img = img.resize((new_width, new_height), Image.LANCZOS)
+
+        fig = plt.figure(figsize=figsize)
+        ax = plt.gca()
+
+        # check if grayscale
+        if len(img.getbands()) == 1:
+            ax.imshow(img, cmap="gray", vmin=0, vmax=255, zorder=1)
+        else:
+            ax.imshow(img, zorder=1)
+
+        preds = self.search_results
+
+        for instance in preds[parent_id]:
+            polygon = np.array(instance[0].exterior.coords.xy)
+            center = instance[0].centroid.coords.xy
+            patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none")
+            ax.add_patch(patch)
+            ax.text(
+                center[0][0], center[1][0], instance[1], fontsize=8, color=text_color
+            )
+
+        plt.axis("off")
+        plt.title(parent_id)
+
+        if return_fig:
+            return fig
+
+    def convert_search_results_to_coords(
+        self, return_dataframe: bool = False
+    ) -> dict | gpd.GeoDataFrame:
+        """Convert the search results to georeferenced search results by converting the pixel bounds to coordinates.
+
+        Parameters
+        ----------
+        return_dataframe : bool, optional
+            Whether to return the results as a geopandas GeoDataFrame, by default False
+
+        Returns
+        -------
+        dict | gpd.GeoDataFrame
+            A dictionary of search results for each parent image or a DataFrame if `return_dataframe` is True.
+
+        Raises
+        ------
+        ValueError
+            If no search results are found.
+        """
+        if self.search_results == {}:
+            raise ValueError("[ERROR] No results to convert!")
+
+        # reset the geo search results
+        self.geo_search_results = {}
+
+        for parent_id, prediction in self.search_results.items():
+            if parent_id not in self.geo_search_results.keys():
+                self.geo_search_results[parent_id] = []
+
+                for instance in prediction:
+                    polygon = instance[0]
+
+                    xx, yy = (np.array(i) for i in polygon.exterior.xy)
+                    xx = (
+                        xx * self.parent_df.loc[parent_id, "dlon"]
+                        + self.parent_df.loc[parent_id, "coordinates"][0]
+                    )
+                    yy = (
+                        self.parent_df.loc[parent_id, "coordinates"][3]
+                        - yy * self.parent_df.loc[parent_id, "dlat"]
+                    )
+
+                    crs = self.parent_df.loc[parent_id, "crs"]
+
+                    parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0)
+                    self.geo_search_results[parent_id].append(
+                        [parent_polygon_geo, crs, *instance[1:]]
+                    )
+
+        if return_dataframe:
+            return self._dict_to_dataframe(
+                self.geo_search_results, geo=True, parent=True
+            )
+        return self.geo_search_results
diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py
index 9942a153..3ea13dc9 100644
--- a/mapreader/spot_text/runner_base.py
+++ b/mapreader/spot_text/runner_base.py
@@ -100,7 +100,7 @@ def run_all(
         Returns
         -------
         dict or pd.DataFrame or gpd.GeoDataFrame
-            A dictionary of predictions for each patch image or a DataFrame if `as_dataframe` is True.
+            A dictionary of predictions for each patch image or a DataFrame if `return_dataframe` is True.
         """
         img_paths = self.patch_df["image_path"].to_list()
 
@@ -129,7 +129,7 @@ def run_on_images(
         Returns
         -------
         dict or pd.DataFrame
-            A dictionary of predictions for each image or a DataFrame if `as_dataframe` is True.
+            A dictionary of predictions for each image or a DataFrame if `return_dataframe` is True.
         """
 
         if isinstance(img_paths, (str, pathlib.Path)):
@@ -243,7 +243,7 @@ def convert_to_parent_pixel_bounds(
         Returns
         -------
         dict or pd.DataFrame
-            A dictionary of predictions for each parent image or a DataFrame if `as_dataframe` is True.
+            A dictionary of predictions for each parent image or a DataFrame if `return_dataframe` is True.
         """
 
         for image_id, prediction in self.patch_predictions.items():
@@ -350,7 +350,7 @@ def convert_to_coords(
         Returns
         -------
         dict or gpd.GeoDataFrame
-            A dictionary of predictions for each parent image or a DataFrame if `as_dataframe` is True.
+            A dictionary of predictions for each parent image or a DataFrame if `return_dataframe` is True.
         """
         if self.parent_predictions == {}:
             print("[INFO] Converting patch pixel bounds to parent pixel bounds.")

From 0ab1f9e152a3caa99f414c9fb3e33109f56ef85e Mon Sep 17 00:00:00 2001
From: Rosie Wood <rwood@turing.ac.uk>
Date: Thu, 12 Sep 2024 14:21:43 +0100
Subject: [PATCH 2/5] add save to geojson

---
 mapreader/spot_text/deepsolo_runner.py |  1 -
 mapreader/spot_text/maptext_runner.py  |  1 -
 mapreader/spot_text/rec_runner_base.py | 35 +++++++++++---------------
 3 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/mapreader/spot_text/deepsolo_runner.py b/mapreader/spot_text/deepsolo_runner.py
index 298d11ea..4b7d0832 100644
--- a/mapreader/spot_text/deepsolo_runner.py
+++ b/mapreader/spot_text/deepsolo_runner.py
@@ -66,7 +66,6 @@ def __init__(
         self.parent_predictions = {}
         self.geo_predictions = {}
         self.search_results = {}
-        self.geo_search_results = {}
 
         # setup the config
         cfg = get_cfg()  # get a fresh new config
diff --git a/mapreader/spot_text/maptext_runner.py b/mapreader/spot_text/maptext_runner.py
index a2bcad79..c56568e6 100644
--- a/mapreader/spot_text/maptext_runner.py
+++ b/mapreader/spot_text/maptext_runner.py
@@ -67,7 +67,6 @@ def __init__(
         self.parent_predictions = {}
         self.geo_predictions = {}
         self.search_results = {}
-        self.geo_search_results = {}
 
         # setup the config
         cfg = get_cfg()  # get a fresh new config
diff --git a/mapreader/spot_text/rec_runner_base.py b/mapreader/spot_text/rec_runner_base.py
index c392d7ef..3e838001 100644
--- a/mapreader/spot_text/rec_runner_base.py
+++ b/mapreader/spot_text/rec_runner_base.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import pathlib
 import re
 
 import geopandas as gpd
@@ -265,20 +266,16 @@ def show_search_results(
         if return_fig:
             return fig
 
-    def convert_search_results_to_coords(
-        self, return_dataframe: bool = False
-    ) -> dict | gpd.GeoDataFrame:
-        """Convert the search results to georeferenced search results by converting the pixel bounds to coordinates.
+    def save_search_results_to_geojson(
+        self,
+        save_path: str | pathlib.Path,
+    ) -> None:
+        """Convert the search results to georeferenced search results and save them to a GeoJSON file.
 
         Parameters
         ----------
-        return_dataframe : bool, optional
-            Whether to return the results as a geopandas GeoDataFrame, by default False
-
-        Returns
-        -------
-        dict | gpd.GeoDataFrame
-            A dictionary of search results for each parent image or a DataFrame if `return_dataframe` is True.
+        save_path : str | pathlib.Path
+            The path to save the GeoJSON file.
 
         Raises
         ------
@@ -288,12 +285,11 @@ def convert_search_results_to_coords(
         if self.search_results == {}:
             raise ValueError("[ERROR] No results to convert!")
 
-        # reset the geo search results
-        self.geo_search_results = {}
+        geo_search_results = {}
 
         for parent_id, prediction in self.search_results.items():
-            if parent_id not in self.geo_search_results.keys():
-                self.geo_search_results[parent_id] = []
+            if parent_id not in geo_search_results.keys():
+                geo_search_results[parent_id] = []
 
                 for instance in prediction:
                     polygon = instance[0]
@@ -311,12 +307,9 @@ def convert_search_results_to_coords(
                     crs = self.parent_df.loc[parent_id, "crs"]
 
                     parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0)
-                    self.geo_search_results[parent_id].append(
+                    geo_search_results[parent_id].append(
                         [parent_polygon_geo, crs, *instance[1:]]
                     )
 
-        if return_dataframe:
-            return self._dict_to_dataframe(
-                self.geo_search_results, geo=True, parent=True
-            )
-        return self.geo_search_results
+        geo_df = self._dict_to_dataframe(geo_search_results, geo=True, parent=True)
+        geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio")

From 31f73d75bf3101e5cdb6abbc269981ffbad6e47b Mon Sep 17 00:00:00 2001
From: Rosie Wood <rwood@turing.ac.uk>
Date: Thu, 12 Sep 2024 15:44:56 +0100
Subject: [PATCH 3/5] add tests for search preds

---
 mapreader/spot_text/rec_runner_base.py     |  2 +-
 test_text_spotting/test_deepsolo_runner.py | 44 ++++++++++++++++++++++
 test_text_spotting/test_maptext_runner.py  | 44 ++++++++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/mapreader/spot_text/rec_runner_base.py b/mapreader/spot_text/rec_runner_base.py
index 3e838001..77c31386 100644
--- a/mapreader/spot_text/rec_runner_base.py
+++ b/mapreader/spot_text/rec_runner_base.py
@@ -283,7 +283,7 @@ def save_search_results_to_geojson(
             If no search results are found.
         """
         if self.search_results == {}:
-            raise ValueError("[ERROR] No results to convert!")
+            raise ValueError("[ERROR] No results to save!")
 
         geo_search_results = {}
 
diff --git a/test_text_spotting/test_deepsolo_runner.py b/test_text_spotting/test_deepsolo_runner.py
index c62a79f6..df9533d9 100644
--- a/test_text_spotting/test_deepsolo_runner.py
+++ b/test_text_spotting/test_deepsolo_runner.py
@@ -226,3 +226,47 @@ def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response):
     assert set(gdf.columns) == set(
         ["image_id", "patch_id", "geometry", "crs", "text", "score"]
     )
+
+
+def test_deepsolo_search_preds(runner_run_all, mock_response):
+    runner = runner_run_all
+    _ = runner.convert_to_parent_pixel_bounds()
+    out = runner.search_preds("map", ignore_case=True)
+    assert isinstance(out, dict)
+    assert "mapreader_text.png" in out.keys()
+    # test dataframe
+    out = runner.search_preds("map", ignore_case=True, return_dataframe=True)
+    assert isinstance(out, pd.DataFrame)
+    assert set(out.columns) == set(
+        ["image_id", "patch_id", "geometry", "text", "score"]
+    )
+    assert "mapreader_text.png" in out["image_id"].values
+    out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True)
+    assert len(out) == 0
+
+
+def test_deepsolo_search_preds_errors(runner_run_all, mock_response):
+    runner = runner_run_all
+    with pytest.raises(ValueError, match="No parent predictions found"):
+        runner.search_preds("maps", ignore_case=True)
+
+
+def test_deepsolo_save_search_results(runner_run_all, tmp_path, mock_response):
+    runner = runner_run_all
+    _ = runner.convert_to_parent_pixel_bounds()
+    out = runner.search_preds("map", ignore_case=True)
+    assert isinstance(out, dict)
+    runner.save_search_results_to_geojson(f"{tmp_path}/search_results.geojson")
+    assert os.path.exists(f"{tmp_path}/search_results.geojson")
+    gdf = gpd.read_file(f"{tmp_path}/search_results.geojson")
+    assert isinstance(gdf, gpd.GeoDataFrame)
+    assert set(gdf.columns) == set(
+        ["image_id", "patch_id", "geometry", "crs", "text", "score"]
+    )
+    assert "mapreader_text.png" in gdf["image_id"].values
+
+
+def test_deepsolo_save_search_results_errors(runner_run_all, tmp_path, mock_response):
+    runner = runner_run_all
+    with pytest.raises(ValueError, match="No results to save"):
+        runner.save_search_results_to_geojson(f"{tmp_path}/test.geojson")
diff --git a/test_text_spotting/test_maptext_runner.py b/test_text_spotting/test_maptext_runner.py
index 9690bcc9..d2214f85 100644
--- a/test_text_spotting/test_maptext_runner.py
+++ b/test_text_spotting/test_maptext_runner.py
@@ -226,3 +226,47 @@ def test_maptext_save_to_geojson(runner_run_all, tmp_path, mock_response):
     assert set(gdf.columns) == set(
         ["image_id", "patch_id", "geometry", "crs", "text", "score"]
     )
+
+
+def test_maptext_search_preds(runner_run_all, mock_response):
+    runner = runner_run_all
+    _ = runner.convert_to_parent_pixel_bounds()
+    out = runner.search_preds("map", ignore_case=True)
+    assert isinstance(out, dict)
+    assert "mapreader_text.png" in out.keys()
+    # test dataframe
+    out = runner.search_preds("map", ignore_case=True, return_dataframe=True)
+    assert isinstance(out, pd.DataFrame)
+    assert set(out.columns) == set(
+        ["image_id", "patch_id", "geometry", "text", "score"]
+    )
+    assert "mapreader_text.png" in out["image_id"].values
+    out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True)
+    assert len(out) == 0
+
+
+def test_maptext_search_preds_errors(runner_run_all, mock_response):
+    runner = runner_run_all
+    with pytest.raises(ValueError, match="No parent predictions found"):
+        runner.search_preds("maps", ignore_case=True)
+
+
+def test_maptext_save_search_results(runner_run_all, tmp_path, mock_response):
+    runner = runner_run_all
+    _ = runner.convert_to_parent_pixel_bounds()
+    out = runner.search_preds("map", ignore_case=True)
+    assert isinstance(out, dict)
+    runner.save_search_results_to_geojson(f"{tmp_path}/search_results.geojson")
+    assert os.path.exists(f"{tmp_path}/search_results.geojson")
+    gdf = gpd.read_file(f"{tmp_path}/search_results.geojson")
+    assert isinstance(gdf, gpd.GeoDataFrame)
+    assert set(gdf.columns) == set(
+        ["image_id", "patch_id", "geometry", "crs", "text", "score"]
+    )
+    assert "mapreader_text.png" in gdf["image_id"].values
+
+
+def test_maptext_save_search_results_errors(runner_run_all, tmp_path, mock_response):
+    runner = runner_run_all
+    with pytest.raises(ValueError, match="No results to save"):
+        runner.save_search_results_to_geojson(f"{tmp_path}/test.geojson")

From 29621984d4ecd90d19ebbca34532c3b50854bfc0 Mon Sep 17 00:00:00 2001
From: Rosie Wood <rwood@turing.ac.uk>
Date: Thu, 12 Sep 2024 16:01:23 +0100
Subject: [PATCH 4/5] add docs for searching text

---
 .../step-by-step-guide/6-spot-text.rst        | 83 +++++++++++++++++--
 1 file changed, 75 insertions(+), 8 deletions(-)

diff --git a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst
index d01a3046..47777f71 100644
--- a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst
+++ b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst
@@ -112,7 +112,7 @@ e.g. for the ``DPTextDETRRunner``, if you choose the "ArT/R_50_poly.yaml", you s
 
 e.g. for the ``DeepSoloRunner``, if you choose the "R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", you should download the "ic15_res50_finetune_synth-tt-mlt-13-15-textocr.pth" model weights file from the DeepSolo repo.
 
-e.g. for the ``MapTextPipeline``, if you choose the "ViTAEv2_S/rumsey/final_rumsey.yaml", you should download the "rumsey-finetune.pth" model weights file from the MapTextPipeline repo.
+e.g. for the ``MapTextRunner``, if you choose the "ViTAEv2_S/rumsey/final_rumsey.yaml", you should download the "rumsey-finetune.pth" model weights file from the MapTextPipeline repo.
 
 .. note:: We recommend using the "ViTAEv2_S/rumsey/final_rumsey.yaml" configuration and "rumsey-finetune.pth" weights from the ``MapTextPipeline``. But you should choose based on your own use case.
 
@@ -120,7 +120,7 @@ For the DPTextDETRRunner, use:
 
 .. code-block:: python
 
-    from map_reader import DPTextDETRRunner
+    from mapreader import DPTextDETRRunner
 
     #EXAMPLE
     my_runner = DPTextDETR(
@@ -146,7 +146,7 @@ For the DeepSoloRunner, use:
 
 .. code-block:: python
 
-    from map_reader import DeepSoloRunner
+    from mapreader import DeepSoloRunner
 
     #EXAMPLE
     my_runner = DeepSoloRunner(
@@ -158,14 +158,14 @@ For the DeepSoloRunner, use:
 
 or, you can load your patch/parent dataframes from CSV/GeoJSON files as shown for the DPTextRunner (above).
 
-For the MapTextPipeline, use:
+For the MapTextRunner, use:
 
 .. code-block:: python
 
-    from map_reader import MapTextPipeline
+    from mapreader import MapTextRunner
 
     #EXAMPLE
-    my_runner = MapTextPipeline(
+    my_runner = MapTextRunner(
         patch_df,
         parent_df,
         cfg_file = "MapTextPipeline/configs/ViTAEv2_S/rumsey/final_rumsey.yaml",
@@ -182,7 +182,7 @@ You can explicitly set this using the ``device`` argument:
 .. code-block:: python
 
     #EXAMPLE
-    my_runner = MapTextPipeline(
+    my_runner = MapTextRunner(
         "./patch_df.csv",
         "./parent_df.csv",
         cfg_file = "MapTextPipeline/configs/ViTAEv2_S/rumsey/final_rumsey.yaml",
@@ -322,10 +322,77 @@ If you maps are georeferenced in your ``parent_df``, you can also convert the pi
 
     geo_preds_df = my_runner.convert_to_coords(return_dataframe=True)
 
-Again, you can save these to a csv file as above, or, you can save them to a geojson file for loading into GIS software:
+Again, you can save these to a CSV file (as shown above), or you can save them to a GeoJSON file for loading into GIS software:
 
 .. code-block:: python
 
     my_runner.save_to_geojson("text_preds.geojson")
 
 This will save the predictions to a geojson file, with each text prediction as a separate feature.
+
+Search predictions
+------------------
+
+If you are using the DeepSoloRunner or the MapTextRunner, you will have recognized text outputs.
+You can search these predictions using the ``search_preds`` method:
+
+.. code-block:: python
+
+    search_results = my_runner.search_preds("search term")
+
+e.g. To find all predictions containing the word "church", ignoring case:
+
+.. code-block:: python
+
+    # EXAMPLE
+    search_results = my_runner.search_preds("church")
+
+By default, this will return a dictionary containing the search results.
+If you'd like to return a dataframe instead, use the ``return_dataframe`` argument:
+
+.. code-block:: python
+
+    # EXAMPLE
+    search_results_df = my_runner.search_preds("church", return_dataframe=True)
+
+You can also ignore the case of the search term by setting the ``ignore_case`` argument:
+
+.. code-block:: python
+
+    # EXAMPLE
+    search_results_df = my_runner.search_preds("church", return_dataframe=True, ignore_case=True)
+
+
+The search accepts regex patterns, so you can use these to run more complex searches.
+
+e.g. To search for all predictions containing the word "church" or "chapel", you could use the pattern "church|chapel":
+
+.. code-block:: python
+
+    # EXAMPLE
+    search_results_df = my_runner.search_preds("church|chapel", return_dataframe=True, ignore_case=True)
+
+Once you have your search results, you can view them on your map using the ``show_search_results`` method.
+
+.. code-block:: python
+
+    my_runner.show_search_results("map_74488689.png")
+
+This will show the map with the search results.
+
+As with the ``show`` method, you can use the ``border_color``, ``text_color`` and ``figsize`` arguments to customize the appearance of the image.
+
+Save search results
+~~~~~~~~~~~~~~~~~~~
+
+If your maps are georeferenced, you can also save your search results using the ``save_search_results_to_geojson`` method:
+
+.. code-block:: python
+
+    my_runner.save_search_results_to_geojson("search_results.geojson")
+
+This will save the search results to a geojson file, with each search result as a separate feature.
+
+These can then be loaded into GIS software for further analysis/exploration.
+
+If your maps are not georeferenced, you can save the search results to a CSV file using the pandas ``to_csv`` method (as shown above).

From 47680bdf0ddf32702b444a69119cf5b57e811691 Mon Sep 17 00:00:00 2001
From: Rosie Wood <rwood@turing.ac.uk>
Date: Thu, 12 Sep 2024 16:36:16 +0100
Subject: [PATCH 5/5] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c828444e..b185a2ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ _ADD NEW CHANGES HERE_
 - Loading of dataframes from GeoJSON files now supported in many file loading methods (e.g. `add_metadata`, `Annotator.__init__`, `AnnotationsLoader.load`, etc.) ([#495](https://github.com/maps-as-data/MapReader/pull/495))
 - `load_frames.py` added to `mapreader.utils`. This has functions for loading from various file formats (e.g. CSV, Excel, GeoJSON, etc.) and converting to GeoDataFrames ([#495](https://github.com/maps-as-data/MapReader/pull/495))
 - Added tests for text spotting code ([#500](https://github.com/maps-as-data/MapReader/pull/500))
+- Added `search_preds`, `show_search_results` and `save_search_results_to_geojson` methods to text spotting code ([#502](https://github.com/maps-as-data/MapReader/pull/502))
 
 ### Changed