Skip to content

Commit

Permalink
Merge pull request #72 from nasaharvest/importable-datasets
Browse files Browse the repository at this point in the history
Importable datasets
  • Loading branch information
ivanzvonkov authored Jul 22, 2022
2 parents 9e6cd86 + a18c08a commit 8cb93f0
Show file tree
Hide file tree
Showing 19 changed files with 230 additions and 148 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
exclude = venv,build,dist
exclude = venv,build,dist,openmapflow/datasets.py
max-line-length = 100
28 changes: 25 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,29 @@ gcloud auth login
gsutil mb -l <YOUR_OPENMAPFLOW_YAML_GCLOUD_LOCATION> gs://<YOUR_OPENMAPFLOW_YAML_BUCKET_LABELED_EO>
```

## Adding data [![cb]](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/openmapflow/notebooks/new_data.ipynb)
## Adding data

#### Adding already existing data
**Prerequisites:**
- [ ] [Generated OpenMapFlow project](#generating-a-project-)

Add a reference to an already existing dataset in your datasets.py:
```python
from openmapflow.datasets import geowiki_landcover_2017, togo_crop_2019

datasets = [geowiki_landcover_2017, togo_crop_2019]
```
Download and push datasets
```bash
openmapflow create-dataset # Download datasets
dvc commit && dvc push # Push data to version control

git add .
git commit -m'Created new dataset'
git push
```

#### Adding custom data [![cb]](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/openmapflow/notebooks/new_data.ipynb)

Data can be added either by following the documentation below or by running the Colab notebook linked above.

Expand All @@ -133,10 +155,10 @@ export RAW_LABEL_DIR=$(openmapflow datapath RAW_LABELS)
mkdir $RAW_LABEL_DIR/<my dataset name>
cp -r <path to my raw data files> $RAW_LABEL_DIR/<my dataset name>
```
Add reference to data using a `LabeledDataset` object in datasets.py, example:
Add a reference to the data using a `CustomLabeledDataset` object in datasets.py, for example:
```python
datasets = [
LabeledDataset(
CustomLabeledDataset(
dataset="example_dataset",
country="Togo",
raw_labels=(
Expand Down
10 changes: 7 additions & 3 deletions buildings-example/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
"""
from typing import List

from openmapflow.labeled_dataset import LabeledDataset, create_datasets
from openmapflow.labeled_dataset import (
CustomLabeledDataset,
LabeledDataset,
create_datasets,
)
from openmapflow.raw_labels import RawLabels

datasets: List[LabeledDataset] = [
LabeledDataset(
CustomLabeledDataset(
dataset="Uganda_buildings_2020",
country="Uganda",
raw_labels=(
Expand All @@ -22,7 +26,7 @@
),
),
),
LabeledDataset(
CustomLabeledDataset(
dataset="geowiki_landcover_2017",
country="global",
raw_labels=(
Expand Down
2 changes: 1 addition & 1 deletion buildings-example/openmapflow.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 0.1.0
version: 0.1.1
project: buildings-example
description: OpenMapFlow buildings example
gcloud:
Expand Down
62 changes: 2 additions & 60 deletions crop-mask-example/datasets.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,9 @@
from typing import List

from openmapflow.datasets import geowiki_landcover_2017, togo_crop_2019
from openmapflow.labeled_dataset import LabeledDataset, create_datasets
from openmapflow.raw_labels import RawLabels

datasets: List[LabeledDataset] = [
LabeledDataset(
dataset="geowiki_landcover_2017",
country="global",
raw_labels=(
RawLabels(
filename="loc_all_2.txt",
longitude_col="loc_cent_X",
latitude_col="loc_cent_Y",
class_prob=lambda df: df.sumcrop / 100,
start_year=2017,
x_y_from_centroid=False,
),
),
),
LabeledDataset(
dataset="Togo_2019",
country="Togo",
raw_labels=(
RawLabels(
filename="crop_merged_v2.zip",
class_prob=1.0,
train_val_test=(0.8, 0.2, 0.0),
start_year=2019,
),
RawLabels(
filename="noncrop_merged_v2.zip",
class_prob=0.0,
train_val_test=(0.8, 0.2, 0.0),
start_year=2019,
),
RawLabels(
filename="random_sample_hrk.zip",
class_prob=lambda df: df["hrk-label"],
transform_crs_from=32631,
train_val_test=(0.0, 0.0, 1.0),
start_year=2019,
),
RawLabels(
filename="random_sample_cn.zip",
class_prob=lambda df: df["cn_labels"],
train_val_test=(0.0, 0.0, 1.0),
start_year=2019,
),
RawLabels(
filename="BB_random_sample_1k.zip",
class_prob=lambda df: df["bb_label"],
train_val_test=(0.0, 0.0, 1.0),
start_year=2019,
),
RawLabels(
filename="random_sample_bm.zip",
class_prob=lambda df: df["bm_labels"],
train_val_test=(0.0, 0.0, 1.0),
start_year=2019,
),
),
),
]
datasets: List[LabeledDataset] = [geowiki_landcover_2017, togo_crop_2019]

if __name__ == "__main__":
create_datasets(datasets)
2 changes: 1 addition & 1 deletion crop-mask-example/openmapflow.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 0.1.0
version: 0.1.1
project: crop-mask-example
description: OpenMapFlow crop mask example
gcloud:
Expand Down
10 changes: 7 additions & 3 deletions maize-example/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
"""
from typing import List

from openmapflow.labeled_dataset import LabeledDataset, create_datasets
from openmapflow.labeled_dataset import (
CustomLabeledDataset,
LabeledDataset,
create_datasets,
)
from openmapflow.raw_labels import RawLabels

datasets: List[LabeledDataset] = [
LabeledDataset(
CustomLabeledDataset(
dataset="Kenya_non_crop_2019",
country="Kenya",
raw_labels=(
Expand Down Expand Up @@ -45,7 +49,7 @@
),
),
),
LabeledDataset(
CustomLabeledDataset(
dataset="ref_african_crops_kenya_01_labels",
country="Kenya",
raw_labels=tuple(
Expand Down
2 changes: 1 addition & 1 deletion maize-example/openmapflow.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 0.1.0
version: 0.1.1
project: maize-example
description: OpenMapFlow maize example
gcloud:
Expand Down
2 changes: 1 addition & 1 deletion openmapflow/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
TEMPLATE_REQUIREMENTS = TEMPLATES_DIR / "requirements.txt"
TEMPLATE_DEPLOY_YML = TEMPLATES_DIR / "github-deploy.yaml"
TEMPLATE_TEST_YML = TEMPLATES_DIR / "github-test.yaml"
VERSION = "0.1.0"
VERSION = "0.1.1"

# -------------- Dataframe column names --------------------------------------
SOURCE = "source"
Expand Down
12 changes: 0 additions & 12 deletions openmapflow/data_instance.py

This file was deleted.

27 changes: 27 additions & 0 deletions openmapflow/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from openmapflow.labeled_dataset import ExistingLabeledDataset

# Geo-Wiki 2017 crowdsourced cropland reference data (global coverage), exposed
# as a ready-made dataset object that a project's datasets.py can import.
geowiki_landcover_2017 = ExistingLabeledDataset(
    country="global",
    dataset="geowiki_landcover_2017",
    # The URL is kept on a single line; this file is excluded from flake8 so the
    # line-length limit does not apply here.
    download_url="https://storage.googleapis.com/harvest-public-assets/openmapflow/datasets/crop/geowiki_landcover_2017.csv",
    label_type="binary",
    license="CC BY-3.0",
    # Citation for the original data publication.
    source=(
        "Linda See. A global reference database of crowdsourced cropland data collected using the "
        "Geo-Wiki platform, 2017."
    ),
)

# Togo 2019 binary crop labels, exposed as a ready-made dataset object that a
# project's datasets.py can import.
togo_crop_2019 = ExistingLabeledDataset(
    country="Togo",
    dataset="Togo_2019",
    # The URL is kept on a single line; this file is excluded from flake8 so the
    # line-length limit does not apply here.
    download_url="https://storage.googleapis.com/harvest-public-assets/openmapflow/datasets/crop/Togo_2019.csv",
    label_type="binary",
    license="CC BY-4.0",
    # Citation for the original data publication. Each fragment ends with its own
    # trailing space, so no fragment may start with one (the original had a
    # doubled space before "Brian Barker").
    source=(
        "Hannah Kerner, Gabriel Tseng, Inbal Becker-Reshef, Catherine Nakalembe, "
        "Brian Barker, Blake Munshell, Madhava Paliyam, and Mehdi Hosseini. Rapid response "
        "crop maps in data sparse regions. In ACM SIGKDD Conference on Data Mining and "
        "Knowledge Discovery Workshops, 2020."
    ),
)
Loading

0 comments on commit 8cb93f0

Please sign in to comment.