From c8bb75d18d2d6233eb8f1321789809e285eec3c4 Mon Sep 17 00:00:00 2001 From: Cole Brokamp Date: Thu, 20 Oct 2022 14:58:26 -0400 Subject: [PATCH] Fix error caused by multiple components; closes #11 (#12) * add example address with duplicated components to cause test to fail * fix problem with duplicated address components * 0.1.3 release --- Dockerfile | 2 +- README.md | 8 ++++---- entrypoint.R | 6 ++++-- test/address.csv | 1 + ...{address_postal_0.1.1.csv => address_postal_0.1.2.csv} | 1 + ...l_0.1.1_expand.csv => address_postal_0.1.2_expand.csv} | 1 + ...tub_postal_0.1.1.csv => address_stub_postal_0.1.2.csv} | 0 ....1_expand.csv => address_stub_postal_0.1.2_expand.csv} | 0 8 files changed, 12 insertions(+), 7 deletions(-) rename test/{address_postal_0.1.1.csv => address_postal_0.1.2.csv} (98%) rename test/{address_postal_0.1.1_expand.csv => address_postal_0.1.2_expand.csv} (99%) rename test/{address_stub_postal_0.1.1.csv => address_stub_postal_0.1.2.csv} (100%) rename test/{address_stub_postal_0.1.1_expand.csv => address_stub_postal_0.1.2_expand.csv} (100%) diff --git a/Dockerfile b/Dockerfile index 66fe3db..955f880 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM rocker/r-ver:4.1.3 # DeGAUSS container metadata ENV degauss_name="postal" -ENV degauss_version="0.1.2" +ENV degauss_version="0.1.3" ENV degauss_description="normalized and parsed addresses" ENV degauss_argument="expand [default: '']" diff --git a/README.md b/README.md index be7e52f..a62ef97 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,10 @@ If `my_address_file.csv` is a file in the current working directory with an address column named `address`, then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands): ```sh -docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.2 my_address_file.csv +docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.3 my_address_file.csv ``` -will produce `my_address_file_postal_0.1.2.csv` with added columns: +will produce `my_address_file_postal_0.1.3.csv` with added columns: - **`cleaned_address`**: `address` with non-alphanumeric characterics and excess whitespace removed (with `dht::clean_address()`) - **`parsed.{address_component}`**: multiple columns, one for each [parsed address component](https://github.com/openvenues/libpostal#parser-labels) (e.g., `parsed.road`, `parsed.state`, `parsed.house_number`) @@ -24,10 +24,10 @@ After parsing, the parsed addresses can be expanded into [several possible norma If any value is provided as an argument (e.g., "expand"), then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands): ```sh -docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.2 my_address_file.csv expand +docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/postal:0.1.3 my_address_file.csv expand ``` -will produce `my_address_file_postal_0.1.2_expand.csv` with the above columns *plus*: +will produce `my_address_file_postal_0.1.3_expand.csv` with the above columns *plus*: - **`expanded_addresses`**: the expanded addresses for `parsed_address` diff --git a/entrypoint.R b/entrypoint.R index c26c59f..3d956de 100755 --- a/entrypoint.R +++ b/entrypoint.R @@ -36,9 +36,11 @@ parsed_address_components <- purrr::transpose() |> purrr::modify(unlist) |> purrr::modify(jsonlite::fromJSON) |> - purrr::modify(tibble::as_tibble) |> + purrr::modify(tibble::as_tibble, .name_repair = "unique") |> dplyr::bind_rows() |> - dplyr::rename_with(~ paste("parsed", .x, sep = ".")) + dplyr::select(-contains("...")) |> + dplyr::rename_with(~ paste("parsed", .x, sep = ".")) |> + suppressMessages() d <- dplyr::bind_cols(d, parsed_address_components) diff --git a/test/address.csv b/test/address.csv index 2f7a294..c235e94 100644 --- a/test/address.csv +++ b/test/address.csv @@ -17,6 +17,7 @@ id,address 87190048084," " 97124042024," " 5100020177,"4506 CAMBERWELL RD CINCINNATI, OH 45209" +5100020177,"4506 CAMBERWELL RD CINCINNATI, OH CINCINNATI, OH 45209" 55000100212,"5585 FAIRWOOD RD GREEN TOWNSHIP, OH 45239" 51000810328,"6628 JULY CT COLERAIN TOWNSHIP, OH 45239" 61201400371,"5126 BRASHER AV BLUE ASH, OH 45242" diff --git a/test/address_postal_0.1.1.csv b/test/address_postal_0.1.2.csv similarity index 98% rename from test/address_postal_0.1.1.csv rename to test/address_postal_0.1.2.csv index b130ce2..3fd1469 100644 --- a/test/address_postal_0.1.1.csv +++ b/test/address_postal_0.1.2.csv @@ -17,6 +17,7 @@ id,address,cleaned_address,parsed_address,parsed.house_number,parsed.road,parsed 87190048084,NA,NA,na,NA,NA,NA,na,NA,NA,NA,NA 97124042024,NA,NA,na,NA,NA,NA,na,NA,NA,NA,NA 5100020177,"4506 CAMBERWELL RD CINCINNATI, OH 45209",4506 CAMBERWELL RD CINCINNATI OH 45209,4506 camberwell rd cincinnati oh 45209,4506,camberwell rd,cincinnati,oh,45209,NA,NA,NA +5100020177,"4506 CAMBERWELL RD CINCINNATI, OH CINCINNATI, OH 45209",4506 CAMBERWELL RD CINCINNATI OH CINCINNATI OH 45209,4506 camberwell rd 45209,4506,camberwell rd,NA,NA,45209,NA,NA,NA 55000100212,"5585 FAIRWOOD RD GREEN TOWNSHIP, OH 45239",5585 FAIRWOOD RD GREEN TOWNSHIP OH 45239,5585 fairwood rd green township oh 45239,5585,fairwood rd,green township,oh,45239,NA,NA,NA 51000810328,"6628 JULY CT COLERAIN TOWNSHIP, OH 45239",6628 JULY CT COLERAIN TOWNSHIP OH 45239,6628 july ct colerain township oh 45239,6628,july ct,colerain township,oh,45239,NA,NA,NA 61201400371,"5126 BRASHER AV BLUE ASH, OH 45242",5126 BRASHER AV BLUE ASH OH 45242,5126 brasher av blue ash oh 45242,5126,brasher av,blue ash,oh,45242,NA,NA,NA diff --git a/test/address_postal_0.1.1_expand.csv b/test/address_postal_0.1.2_expand.csv similarity index 99% rename from test/address_postal_0.1.1_expand.csv rename to test/address_postal_0.1.2_expand.csv index 8b4f312..417854c 100644 --- a/test/address_postal_0.1.1_expand.csv +++ b/test/address_postal_0.1.2_expand.csv @@ -42,6 +42,7 @@ id,address,cleaned_address,parsed_address,parsed.house_number,parsed.road,parsed 97124042024,NA,NA,na,NA,NA,NA,na,NA,NA,NA,NA,national association 5100020177,"4506 CAMBERWELL RD CINCINNATI, OH 45209",4506 CAMBERWELL RD CINCINNATI OH 45209,4506 camberwell rd cincinnati oh 45209,4506,camberwell rd,cincinnati,oh,45209,NA,NA,NA,4506 camberwell road cincinnati ohio 45209 5100020177,"4506 CAMBERWELL RD CINCINNATI, OH 45209",4506 CAMBERWELL RD CINCINNATI OH 45209,4506 camberwell rd cincinnati oh 45209,4506,camberwell rd,cincinnati,oh,45209,NA,NA,NA,4506 camberwell road cincinnati oh 45209 +5100020177,"4506 CAMBERWELL RD CINCINNATI, OH CINCINNATI, OH 45209",4506 CAMBERWELL RD CINCINNATI OH CINCINNATI OH 45209,4506 camberwell rd 45209,4506,camberwell rd,NA,NA,45209,NA,NA,NA,4506 camberwell road 45209 55000100212,"5585 FAIRWOOD RD GREEN TOWNSHIP, OH 45239",5585 FAIRWOOD RD GREEN TOWNSHIP OH 45239,5585 fairwood rd green township oh 45239,5585,fairwood rd,green township,oh,45239,NA,NA,NA,5585 fairwood road green township ohio 45239 55000100212,"5585 FAIRWOOD RD GREEN TOWNSHIP, OH 45239",5585 FAIRWOOD RD GREEN TOWNSHIP OH 45239,5585 fairwood rd green township oh 45239,5585,fairwood rd,green township,oh,45239,NA,NA,NA,5585 fairwood road green township oh 45239 51000810328,"6628 JULY CT COLERAIN TOWNSHIP, OH 45239",6628 JULY CT COLERAIN TOWNSHIP OH 45239,6628 july ct colerain township oh 45239,6628,july ct,colerain township,oh,45239,NA,NA,NA,6628 july ct colerain township ohio 45239 diff --git a/test/address_stub_postal_0.1.1.csv b/test/address_stub_postal_0.1.2.csv similarity index 100% rename from test/address_stub_postal_0.1.1.csv rename to test/address_stub_postal_0.1.2.csv diff --git a/test/address_stub_postal_0.1.1_expand.csv b/test/address_stub_postal_0.1.2_expand.csv similarity index 100% rename from test/address_stub_postal_0.1.1_expand.csv rename to test/address_stub_postal_0.1.2_expand.csv