- removed tqdm and prints from the repo because they break the docs (…

…consider removing the dependency) - minor fixes to text in notebooks
guybuk · Aug 20, 2024 · d1e8306 · d1e8306
1 parent 33b055e
commit d1e8306
Show file tree

Hide file tree

Showing 16 changed files with 137 additions and 123 deletions.
diff --git a/bridge/primitives/dataset/dataset.py b/bridge/primitives/dataset/dataset.py
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterable, Iterator, List, Sequence
 
 import pandas as pd
-from tqdm.contrib import tmap
 from typing_extensions import Self
 
 from bridge.primitives.dataset.sample_api import SampleAPI
@@ -85,7 +84,7 @@ def get(self, sample_id: Hashable) -> Sample:
     def transform_samples(
         self,
         transform: SampleTransform,
-        map_fn=tmap,
+        map_fn=map,
         cache_mechanisms: Dict[str, CacheMechanism] | None = None,
         display_engine: DisplayEngine | None = None,
     ) -> Self:
@@ -98,7 +97,7 @@ def transform_samples(
         elements = [element for sample in samples for e_list in sample.elements.values() for element in e_list]
         return Dataset.from_elements(elements, display_engine=display_engine)
 
-    def map_samples(self, function: Callable[[Sample], Any], map_fn=tmap):
+    def map_samples(self, function: Callable[[Sample], Any], map_fn=map):
         outputs = map_fn(function, self)
         if isinstance(outputs, GeneratorType):
             return list(outputs)

diff --git a/bridge/primitives/dataset/sample_api.py b/bridge/primitives/dataset/sample_api.py
@@ -3,7 +3,6 @@
 import abc
 from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterable, Iterator, Sequence
 
-from tqdm.contrib import tmap
 from typing_extensions import Self
 
 if TYPE_CHECKING:
@@ -27,14 +26,14 @@ def get(self, sample_id: Hashable) -> Sample:
     def transform_samples(
         self,
         transform: SampleTransform,
-        map_fn=tmap,
+        map_fn=map,
         cache_mechanisms: Dict[str, CacheMechanism] | None = None,
         display_engine: DisplayEngine | None = None,
     ) -> Self:
         pass
 
     @abc.abstractmethod
-    def map_samples(self, function: Callable[[Sample], Any], map_fn=tmap) -> Sequence[Sample]:
+    def map_samples(self, function: Callable[[Sample], Any], map_fn=map) -> Sequence[Sample]:
         pass
 
     @abc.abstractmethod

diff --git a/bridge/primitives/dataset/singular_dataset.py b/bridge/primitives/dataset/singular_dataset.py
@@ -3,7 +3,6 @@
 from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Sequence
 
 import pandas as pd
-from tqdm.contrib import tmap
 from typing_extensions import Self
 
 from bridge.primitives.dataset.dataset import Dataset
@@ -131,7 +130,7 @@ def sort_annotations(self, by: str, ascending: bool = True):
     def transform_samples(
         self,
         transform: SampleTransform,
-        map_fn=tmap,
+        map_fn=map,
         cache_mechanisms: Dict[str, CacheMechanism] | None = None,
         display_engine: DisplayEngine | None = None,
     ) -> Self:

diff --git a/bridge/providers/vision.py b/bridge/providers/vision.py
@@ -5,7 +5,6 @@
 from typing import TYPE_CHECKING, Dict
 
 import numpy as np
-from tqdm import tqdm
 
 from bridge.display.basic import SimplePrints
 from bridge.primitives.dataset import SingularDataset
@@ -97,7 +96,7 @@ def build_dataset(
         img_id_list = list(sorted(self._coco.imgs.keys()))
         images = []
         bboxes = []
-        for img_id in tqdm(img_id_list):
+        for img_id in img_id_list:
             coco_img = self._coco.loadImgs(img_id)[0]
             img_file = self._images_dir / coco_img["file_name"]
 

diff --git a/bridge/utils/download_and_extract_archive.py b/bridge/utils/download_and_extract_archive.py
@@ -19,12 +19,19 @@
 from tqdm import tqdm
 
 
-def _urlretrieve(url: str, filename: Union[str, pathlib.Path], chunk_size: int = 1024 * 32) -> None:
+def _urlretrieve(
+    url: str, filename: Union[str, pathlib.Path], chunk_size: int = 1024 * 32, progress_bar: bool = False
+) -> None:
     with urllib.request.urlopen(urllib.request.Request(url)) as response:
-        with open(filename, "wb") as fh, tqdm(total=response.length) as pbar:
-            while chunk := response.read(chunk_size):
-                fh.write(chunk)
-                pbar.update(len(chunk))
+        with open(filename, "wb") as fh:
+            if progress_bar:
+                with tqdm(total=response.length) as pbar:
+                    for chunk in iter(lambda: response.read(chunk_size), b""):
+                        fh.write(chunk)
+                        pbar.update(len(chunk))
+            else:
+                for chunk in iter(lambda: response.read(chunk_size), b""):
+                    fh.write(chunk)
 
 
 def calculate_md5(fpath: Union[str, pathlib.Path], chunk_size: int = 1024 * 1024) -> str:
@@ -219,10 +226,11 @@ def _extract_tar(
         for member in tar_members:
             member_path = os.path.join(to_path, member.name)
             if member_path not in existing_files:
-                print(f"Extracting {member.name} to {to_path}")
+                # print(f"Extracting {member.name} to {to_path}")
                 tar.extract(member, to_path)
             else:
-                print(f"{member.name} already exists in {to_path}, skipping.")
+                pass
+                # print(f"{member.name} already exists in {to_path}, skipping.")
 
 
 _ZIP_COMPRESSION_MAP: Dict[str, int] = {

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
@@ -1,8 +1,5 @@
-Getting started
+Getting Started
 ===============
-
-
-
 Installation
 ------------
 
@@ -27,14 +24,57 @@ You can install the latest version of Bridge's from PyPI. It comes in a few flav
     $ pip install bridge-ds[dev]
 
 
-Demos
------
-
-For high-level demos to show off Bridge's capabilities, consider
-browsing the following notebooks:
-
-#. :doc:`Quick and easy data exploration <notebooks/vision/fundamentals/coco_eda_demo>`
-#. :doc:`From sources, through augmentations, to Pytorch <notebooks/vision/processing_data/source2tensors_demo>`
+Key Concepts
+------------
 
-For a deeper understanding of Bridge, and to connect your custom datasets and data types,
- proceed to the :doc:`user_guide` section.
+In this section you will learn the basics of Bridge. Start by
+reading about the key concepts, and then proceed to
+the guides below.
+
+Element
+^^^^^^^
+
+An Element is the basic unit of data in a Dataset, from raw data
+objects such as images, text, audio, to various annotations such
+as class labels, bounding boxes, and segmentation maps. In
+essence,
+anything that constitutes a piece of information within the
+dataset can be an Element.
+
+Sample
+^^^^^^
+
+A Sample is a collection of Elements. It is our representation of
+a typical item-within-a-dataset.
+For example, an image with
+object detections constitutes a Sample, comprising a single image
+Element and multiple bounding box Elements.
+
+Dataset
+^^^^^^^
+
+A Dataset is a collection of Samples. It exposes the Table and
+Sample APIs.
+
+Table API
+^^^^^^^^^
+
+A general term for the set of functions and operators exposed by
+the
+Dataset which allows users to perform
+high-level operations with a user experience similar to Pandas -
+assign, query, sort, map, etc. In short, an API that
+allows users to treat any dataset as a DataFrame, where **every
+row is an element.**
+
+Sample API
+^^^^^^^^^^
+
+A general term for the set of functions and operators exposed by
+the Dataset which allows users to work on
+individual examples in the dataset in a meaningful manner.
+If the Table API is meant for high-level
+dataset management, then the Sample API is used for low-level
+operations
+like loading,
+caching, and transforming raw data (e.g. pixels, strings).
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -15,6 +15,7 @@ with a Pandas-like experience, and handle individual samples and raw data using
 class-based, easy-to-work-with interface.
 
 .. toctree::
+   :maxdepth: 2
 
    getting_started
    user_guide

diff --git a/docs/source/notebooks/vision/custom_data/display_engine.ipynb b/docs/source/notebooks/vision/custom_data/display_engine.ipynb
@@ -72,7 +72,7 @@
    "metadata": {},
    "source": [
     "## Class Structure\n",
-    "We can improve this viz by writing our own DisplayEngine. For starters, let's see which methods we need to implement:\n",
+    "We can improve this \"viz\" by writing our own DisplayEngine. For starters, let's see which methods we need to implement:\n",
     "\n",
     "```python\n",
     "class MyDisplayEngine(DisplayEngine):\n",
@@ -93,7 +93,7 @@
     "\n",
     "    def show_dataset(\n",
     "        self,\n",
-    "        dataset: D,\n",
+    "        dataset,\n",
     "        element_plot_kwargs: Dict[str, Any] | None = None,\n",
     "        sample_plot_kwargs: Dict[str, Any] | None = None,\n",
     "        dataset_plot_kwargs: Dict[str, Any] | None = None,\n",

diff --git a/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb b/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb
@@ -82,7 +82,7 @@
     "\n",
     "In this tutorial we will learn about the **LoadMechanism**, Bridge's way of loading raw data from different sources.\n",
     "\n",
-    "A quick reminder: to access the raw data within each element, we need to use the **SampleAPI**. The column `data` in the **TableAPI** usually (but not always) contains a reference to the data rather than the data itself:\n"
+    "A quick reminder: to access the raw data within each element, we need to use the **SampleAPI** with `sample.data / element.data`. The column `data` in the **TableAPI** usually (but not always) contains a reference to the data rather than the data itself:\n"
    ]
   },
   {
@@ -100,7 +100,7 @@
    "id": "8",
    "metadata": {},
    "source": [
-    "When we want to access data for a given element, we need to call the `element.data` property. In COCO, we have elements for _images_ and for _bboxes_. Because COCO is a **SingularDataset**, every sample has a special element, in this case the image, and annotation elements, in this case the bboxes."
+    "When we want to access data for a given element, we need to call the `element.data` property. In COCO, we have elements for _images_ and for _bboxes_. Because COCO is a **SingularDataset**, every sample has a special element, in this case the image, and we can access its data directly with `sample.data`."
    ]
   },
   {
@@ -122,7 +122,7 @@
    "id": "10",
    "metadata": {},
    "source": [
-    "Every element holds a **LoadMechanism**, an object responsible for loading data from different sources. In this case, for images, `element.data` will perform an HTTP request and load the image in the response. For bboxes, which already exist in-memory (i.e. we can see them directly in the `annotations` table), `element.data` will simply load the stored object.\n",
+    "Every element holds a **LoadMechanism**, an object responsible for loading data from different sources. In this case, for images, `element.data` will perform an HTTP request and load the image in the response. For bboxes, which already exist in-memory (note that we can see them directly in the `annotations` table), `element.data` will simply load the stored object.\n",
     "\n",
     "The **LoadMechanism** is defined by two variables:"
    ]
@@ -160,7 +160,7 @@
    "source": [
     "## In summary\n",
     "1. Bridge loads data lazily, only when `element.data` is called\n",
-    "2. The loading mechanism accepts **url_or_data** which defines where to load from (or whatto load), and **category** which defines _how_ to load it."
+    "2. The loading mechanism function accepts **url_or_data** which defines where to load from (or what to load), and **category** which defines _how_ to load it."
    ]
   }
  ],

diff --git a/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb b/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb
@@ -107,7 +107,7 @@
    "id": "8",
    "metadata": {},
    "source": [
-    "In the previous tutorials we've made a brief introduction into using the Sample and Table APIs. In this demo we'll perform a short step-by-step analysis on COCO, using different toolings available in BridgeDS."
+    "In this demo we'll perform a short step-by-step analysis on COCO, using different tools available in BridgeDS."
    ]
   },
   {
@@ -144,7 +144,7 @@
    "id": "12",
    "metadata": {},
    "source": [
-    "In the annotations table, class names are represented by numerical labels, which may impede readability during data analysis. To address this, we may choose to use a third-party file that maps these integer labels to their corresponding text labels."
+    "Observe the annotations table: the class names (within the BoundingBox objects in the `data` column) are represented by numerical labels, which may impede readability during data analysis. To address this, we may choose to use a third-party file that maps these integer labels to their corresponding text labels."
    ]
   },
   {
@@ -168,7 +168,7 @@
    "id": "14",
    "metadata": {},
    "source": [
-    "Like we've seen in the Table API tutorial, we can use `ds.assign_annotations` to replace our bounding box class labels with new ones:"
+    "We can use `ds.assign_annotations` to replace our bounding box class labels with new ones:"
    ]
   },
   {
@@ -223,7 +223,7 @@
     "\n",
     "```\n",
     "for sample in samples:\n",
-    "    for annotation in samples:\n",
+    "    for annotation in sample:\n",
     "        <do...>\n",
     "```\n",
     "\n",
@@ -346,9 +346,9 @@
    "id": "28",
    "metadata": {},
    "source": [
-    "To gain a deeper understanding of the image and the size of the dining table annotation in question, we introduce DisplayEngines, which you've seen briefly in the Sample API tutorial. These objects are injected into Datasets  Samples, and Elements, enabling us to manipulate the behavior of the `ds.show() / sample.show / element.show()` methods.\n",
+    "To gain a deeper understanding of the image and the size of the dining table annotation in question, we introduce DisplayEngines. These objects are injected into Datasets, Samples, and Elements, enabling us to manipulate the behavior of the `ds.show() / sample.show() / element.show()` methods.\n",
     "\n",
-    "By default, the **SimplePrints** engine is utilized. Let's switch to the **Holoviews** engine for enhanced visualization:"
+    "By default, the **SimplePrints** engine is used. Let's switch to the **Holoviews** engine for enhanced visualization:"
    ]
   },
   {

diff --git a/docs/source/notebooks/vision/fundamentals/sample_api.ipynb b/docs/source/notebooks/vision/fundamentals/sample_api.ipynb
@@ -104,9 +104,10 @@
     "In BridgeDS, we use two complementing approaches to view datasets. We call them the **Sample API** and the **Table API**. This tutorial is about the former.\n",
     "\n",
     "Sample API can be loosely described as:\n",
-    "> A dataset can be viewed as a collection of samples, where each sample is a pythonic object (called Sample) that contains a collection of elements.\n",
     "\n",
-    "In case any of the terms 'dataset', 'sample' or 'element' is foreign to you, we recommend you to first go back to the Key Concepts section of our User Guide page.\n",
+    "    A dataset can be viewed as a collection of samples, where each sample is a pythonic object (called Sample) that contains a collection of elements.\n",
+    "\n",
+    "In case any of the terms 'dataset', 'sample' or 'element' is foreign to you, we recommend that you first go back to the Key Concepts section.\n",
     "\n",
     "Let's demonstrate how to use the **Sample API**:"
    ]
@@ -140,7 +141,7 @@
    "metadata": {},
    "source": [
     "## Properties\n",
-    "The sample object is fairly minimal, exposing only its _id_, _elements_, and _display_engine_, with limited methods available. This design reflects its role as a container for elements rather than a data object itself.\n",
+    "The sample object is fairly minimal, exposing only its _id_, _elements_, and _display_engine_ properties, with limited methods available. This design reflects its role as a container for elements rather than a data object itself.\n",
     "\n",
     "Now, let’s shift our focus to the elements:\n",
     "\n",
@@ -167,7 +168,7 @@
    "id": "12",
    "metadata": {},
    "source": [
-    "We observe one image element and two bbox elements. It's common to have a single element representing raw data (the image) alongside multiple elements representing annotations. To accommodate this frequent use case, we implement COCO using a subclass of **Sample** called **SingularSample**. This subclass provides a more convenient API, where the main element is accessible via `sample.element`, and the remaining elements are organized under `sample.annotations`:"
+    "We observe one image element and two bbox elements. It's common to have samples composed of a single element representing raw data (the image) alongside multiple elements representing annotations. To accommodate this frequent use case, we implement COCO samples using a subclass of **Sample** called **SingularSample**. This subclass provides a more convenient API, where the main element is accessible via `sample.element`, and the remaining elements are organized under `sample.annotations`:"
    ]
   },
   {

diff --git a/docs/source/notebooks/vision/fundamentals/table_api.ipynb b/docs/source/notebooks/vision/fundamentals/table_api.ipynb
@@ -108,7 +108,8 @@
     "In BridgeDS, we use two complementing approaches to view datasets. We call them the **Sample API** and the **Table API**. This tutorial is about the latter.\n",
     "\n",
     "The Table API can be described as:\n",
-    "> A dataset can be viewed as a table where every row represents a single element. Elements have a unique element_id but share their sample_id with other elements from the same Sample. The element_id and sample_id columns serve as the table's multi-index."
+    "\n",
+    "    A dataset can be viewed as a table where every row represents a single element. Elements have a unique element_id but share their sample_id with other elements from the same Sample. The element_id and sample_id columns serve as the table's multi-index."
    ]
   },
   {
@@ -145,9 +146,10 @@
    "id": "12",
    "metadata": {},
    "source": [
-    "The Table API is designed to expose callables that accept Pandas DataFrames as arguments, due to their simple and familiar design. The following sections showcase methods that allow users to perform different actions on Datasets, and these methods accept tuples of DataFrames (samples, annotations).\n",
+    "## Methods\n",
+    "The Table API is designed to expose callables that accept Pandas DataFrames as arguments, due to their simple and familiar API. The following sections showcase methods that allow users to perform different actions on Datasets, and these methods accept tuples of DataFrames (samples, annotations).\n",
     "\n",
-    "## Filter\n",
+    "### Filter\n",
     "Using tables allows us to easily filter out images or bboxes using familiar Pandas syntax. Note that when filtering samples, BridgeDS automatically filters out corresponding annotations:"
    ]
   },
@@ -177,7 +179,7 @@
    "id": "14",
    "metadata": {},
    "source": [
-    "## Assign\n",
+    "### Assign\n",
     "We can assign new columns to either `ds.samples` or `ds.annotations` using familiar syntax. Let's assign the value `n_bboxes` to every sample:"
    ]
   },
@@ -202,7 +204,7 @@
    "id": "16",
    "metadata": {},
    "source": [
-    "## Sorting\n",
+    "### Sorting\n",
     "We can sort the tables using familiar Pandas syntax:"
    ]
   },
@@ -222,7 +224,7 @@
    "id": "18",
    "metadata": {},
    "source": [
-    "Note that if we sort the samples table, we can change the positional index used by the Sample API (ds.iget). The next cell will show the dataset in order from most bboxes per image to least:"
+    "Note that if we sort the samples table, we can change the positional index used by the Sample API (ds.iget), which dictates the order of the samples below. The next cell will show the dataset in order from most bboxes per image to least:"
    ]
   },
   {