fix second line addresses being included in parsed_address; fixes #9 (#10)

* add second line address failing test * remove old test files * change how parsed address is created from parsed address components * update version to 0.1.2
degauss-org · Oct 18, 2022 · 23e0cc5 · 23e0cc5
1 parent e8e0b47
commit 23e0cc5
Show file tree

Hide file tree

Showing 10 changed files with 281 additions and 273 deletions.
diff --git a/.github/workflows/build-deploy-pr.yaml b/.github/workflows/build-deploy-pr.yaml
@@ -29,6 +29,8 @@ jobs:
         run: |
           docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address.csv
           docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address.csv expand
+          docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address_stub.csv
+          docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address_stub.csv expand
       - name: login to ghcr
         uses: docker/login-action@v1
         with:

diff --git a/.github/workflows/build-deploy-release.yaml b/.github/workflows/build-deploy-release.yaml
@@ -30,6 +30,8 @@ jobs:
         run: |
           docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address.csv
           docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address.csv expand
+          docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address_stub.csv
+          docker run --rm -v "${PWD}/test":/tmp ${{ env.container }} address_stub.csv expand
       - name: login to ghcr
         uses: docker/login-action@v1
         with:

diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,7 @@ FROM rocker/r-ver:4.1.3
 
 # DeGAUSS container metadata
 ENV degauss_name="postal"
-ENV degauss_version="0.1.1"
+ENV degauss_version="0.1.2"
 ENV degauss_description="normalized and parsed addresses"
 ENV degauss_argument="expand [default: '']"
 

diff --git a/README.md b/README.md
@@ -8,10 +8,10 @@
 If `my_address_file.csv` is a file in the current working directory with an address column named `address`, then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands):
 
 ```sh
-docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.1 my_address_file.csv
+docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.2 my_address_file.csv
 ```
 
-will produce `my_address_file_postal_0.1.1.csv` with added columns:
+will produce `my_address_file_postal_0.1.2.csv` with added columns:
 
 - **`cleaned_address`**: `address` with non-alphanumeric characters and excess whitespace removed (with `dht::clean_address()`)
 - **`parsed.{address_component}`**: multiple columns, one for each [parsed address component](https://github.com/openvenues/libpostal#parser-labels) (e.g., `parsed.road`, `parsed.state`, `parsed.house_number`)
@@ -24,10 +24,10 @@ After parsing, the parsed addresses can be expanded into [several possible norma
 If any value is provided as an argument (e.g., "expand"), then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands):
 
 ```sh
-docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.1 my_address_file.csv expand
+docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.2 my_address_file.csv expand
 ```
 
-will produce `my_address_file_postal_0.1.1_expand.csv` with the above columns *plus*:
+will produce `my_address_file_postal_0.1.2_expand.csv` with the above columns *plus*:
 
 - **`expanded_addresses`**: the expanded addresses for `parsed_address`
 

diff --git a/entrypoint.R b/entrypoint.R
@@ -39,15 +39,17 @@ parsed_address_components <-
   purrr::modify(tibble::as_tibble) |>
   dplyr::bind_rows() |>
   dplyr::rename_with(~ paste("parsed", .x, sep = "."))
-# TODO make sure this doesn't remove rows with all NAs
 
 d <- dplyr::bind_cols(d, parsed_address_components)
 
-d <- tidyr::unite(d, col = "parsed_address", starts_with("parsed."), sep = " ", na.rm = TRUE, remove = FALSE)
+d <- tidyr::unite(d,
+                  col = "parsed_address",
+                  tidyselect::any_of(paste0("parsed.", c("house_number", "road", "city", "state", "postcode"))),
+                  sep = " ", na.rm = TRUE, remove = FALSE)
 
 ## expanding addresses
 if (!is.null(opt$expand)) {
-  cli::cli_alert_info("the {.field expand} argument is set to {.val {opt$expand}}; expanding addresses...")
+  cli::cli_alert_info("the {.field expand} argument is set to {.val {opt$expand}}; expanding the parsed addresses...")
   cli::cli_alert_warning("more than one address row will likely be returned for each input address row")
 
   d$expanded_addresses <-

diff --git a/test/address.csv b/test/address.csv
@@ -2,6 +2,7 @@ id,address
 55000100280,"2854 ROSEANN LN GREEN TOWNSHIP, OH 45239"
 55000235280,"2854 ROSEANN LANE GREEN TOWNSHIP, OH 45239"
 9800060045,"407 SOUTHVIEW AV CINCINNATI, OH 45219"
+9800060045,"407 SOUTHVIEW AV Apartment #2 CINCINNATI, OH 45219"
 59100090241,"909 GRETNA LN FOREST PARK, OH 45240"
 55071310120,"P.O. BOX 12345 GREEN TOWNSHIP, OH 45238"
 4100010061,"PO 12345 CINCINNATI, OH 45208"

diff --git a/test/address_postal_0.1.0.csv → test/address_postal_0.1.1.csv b/test/address_postal_0.1.0.csv → test/address_postal_0.1.1.csv
diff --git a/test/address_postal_0.1.0_expand.csv → test/address_postal_0.1.1_expand.csv b/test/address_postal_0.1.0_expand.csv → test/address_postal_0.1.1_expand.csv
diff --git a/test/address_stub_postal_0.1.0.csv → test/address_stub_postal_0.1.1.csv b/test/address_stub_postal_0.1.0.csv → test/address_stub_postal_0.1.1.csv
diff --git a/test/address_stub_postal_0.1.0_expand.csv → test/address_stub_postal_0.1.1_expand.csv b/test/address_stub_postal_0.1.0_expand.csv → test/address_stub_postal_0.1.1_expand.csv