Skip to content

Commit

Permalink
usaddress: add Similarity function
Browse files Browse the repository at this point in the history
  • Loading branch information
adamdecaf committed Sep 26, 2024
1 parent 7738805 commit d5ee4d4
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
90 changes: 90 additions & 0 deletions pkg/usaddress/similarity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package usaddress

import (
"github.com/xrash/smetrics"
)

// stringSimilarity returns the score smetrics.JaroWinkler assigns to the pair
// (s1, s2), using the fixed tuning arguments below for every comparison.
func stringSimilarity(s1, s2 string) float64 {
	const (
		// Third and fourth arguments handed to smetrics.JaroWinkler.
		boostThreshold = 0.1
		prefixSize     = 4
	)

	score := smetrics.JaroWinkler(s1, s2, boostThreshold, prefixSize)
	return score
}

// weights holds one multiplier per address component. When two addresses are
// compared, a component's similarity score is scaled by its multiplier before
// being added to the overall score.
type weights struct {
	// Building number, street name and ZIP code.
	PrimaryNumber, StreetName, ZIPCode float64

	// Locality.
	City, State float64

	// Street qualifiers: suffix (e.g. "ST") and pre/post directionals.
	StreetSuffix, StreetPredir, StreetPostdir float64

	// Finer-grained refinements: unit designator and ZIP+4 extension.
	SecondaryUnit, Plus4 float64

	// Alternative address types compared in place of the street components.
	POBox, RuralRoute, HighwayContract float64
}

// defaultWeights holds the per-component weights used by Address.Similarity.
//
// Rationale for the values:
//   - PrimaryNumber, StreetName and ZIPCode carry the most weight because they
//     are critical for pinning down a specific location.
//   - City and State matter, but are less specific than the ZIP code.
//   - StreetSuffix, StreetPredir and StreetPostdir are weighted low since they
//     often have less impact on mail delivery.
//   - SecondaryUnit and Plus4 are the least critical components.
//   - POBox, RuralRoute and HighwayContract describe alternative address types;
//     when both addresses carry one, it is compared instead of the street
//     address components.
var defaultWeights = weights{
	// Alternative address types.
	POBox:           0.25,
	RuralRoute:      0.25,
	HighwayContract: 0.25,

	// Street address components.
	PrimaryNumber: 0.25,
	StreetName:    0.25,
	StreetSuffix:  0.05,
	StreetPredir:  0.025,
	StreetPostdir: 0.025,
	SecondaryUnit: 0.025,

	// Locality components.
	City:    0.10,
	State:   0.05,
	ZIPCode: 0.20,
	Plus4:   0.025,
}

// Similarity scores how closely other matches a as a weighted average of
// per-component string similarities, using the weights in defaultWeights.
//
// If both addresses have a POBox, the PO boxes are compared in place of the
// street address; failing that, the same rule is tried for RuralRoute and then
// HighwayContract. Otherwise the street components (PrimaryNumber,
// StreetPredir, StreetName, StreetSuffix, StreetPostdir, SecondaryUnit) are
// compared one by one. City, State and ZIPCode are always compared, and Plus4
// only when at least one of the two addresses has one.
//
// The result is the sum of weight*similarity over the compared components
// divided by the sum of the weights that took part.
func (a Address) Similarity(other Address) float64 {
	// comparison is one weighted pair of component values awaiting scoring.
	type comparison struct {
		weight      float64
		mine, their string
	}

	w := defaultWeights
	// At most six street components, three locality components and Plus4.
	pending := make([]comparison, 0, 10)

	// Pick the delivery-line comparison: an alternative address type when both
	// sides have the same kind, otherwise the regular street components.
	switch {
	case a.POBox != "" && other.POBox != "":
		pending = append(pending, comparison{w.POBox, a.POBox, other.POBox})
	case a.RuralRoute != "" && other.RuralRoute != "":
		pending = append(pending, comparison{w.RuralRoute, a.RuralRoute, other.RuralRoute})
	case a.HighwayContract != "" && other.HighwayContract != "":
		pending = append(pending, comparison{w.HighwayContract, a.HighwayContract, other.HighwayContract})
	default:
		pending = append(pending,
			comparison{w.PrimaryNumber, a.PrimaryNumber, other.PrimaryNumber},
			comparison{w.StreetPredir, a.StreetPredir, other.StreetPredir},
			comparison{w.StreetName, a.StreetName, other.StreetName},
			comparison{w.StreetSuffix, a.StreetSuffix, other.StreetSuffix},
			comparison{w.StreetPostdir, a.StreetPostdir, other.StreetPostdir},
			comparison{w.SecondaryUnit, a.SecondaryUnit, other.SecondaryUnit},
		)
	}

	// Locality components are compared unconditionally.
	pending = append(pending,
		comparison{w.City, a.City, other.City},
		comparison{w.State, a.State, other.State},
		comparison{w.ZIPCode, a.ZIPCode, other.ZIPCode},
	)

	// Plus4 is left out entirely when neither address carries one.
	if !(a.Plus4 == "" && other.Plus4 == "") {
		pending = append(pending, comparison{w.Plus4, a.Plus4, other.Plus4})
	}

	// Fold the comparisons in the order they were queued.
	var weighted, used float64
	for _, c := range pending {
		sim := stringSimilarity(c.mine, c.their)
		weighted += sim * c.weight
		used += c.weight
	}

	if used == 0 {
		return 0.0
	}
	return weighted / used
}
171 changes: 171 additions & 0 deletions pkg/usaddress/similarity_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
package usaddress

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestAddressSimilarity compares a fixed base address against copies that
// differ in a single component (plus one PO-box-only pair) and checks that
// Address.Similarity lands within 0.01 of the expected score for each case.
func TestAddressSimilarity(t *testing.T) {
	base := Address{
		PrimaryNumber: "123",
		StreetPredir:  "N",
		StreetName:    "MAIN",
		StreetSuffix:  "ST",
		StreetPostdir: "",
		SecondaryUnit: "APT 4B",
		City:          "ANYTOWN",
		State:         "CA",
		ZIPCode:       "90210",
		Plus4:         "1234",
	}

	// derive copies base, applies edit to the copy and returns it.
	derive := func(edit func(addr *Address)) Address {
		addr := base
		edit(&addr)
		return addr
	}

	// poBoxOnly is base with a PO box set and every street component cleared.
	poBoxOnly := derive(func(addr *Address) {
		addr.POBox = "PO BOX 789"
		addr.PrimaryNumber = ""
		addr.StreetName = ""
		addr.StreetSuffix = ""
		addr.StreetPredir = ""
		addr.StreetPostdir = ""
		addr.SecondaryUnit = ""
	})

	cases := []struct {
		name         string
		addr1, addr2 Address
		expected     float64
	}{
		{
			name:     "Identical Addresses",
			addr1:    base,
			addr2:    base,
			expected: 1.0, // full similarity
		},
		{
			name:     "Different Primary Number",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.PrimaryNumber = "124" }),
			expected: 0.956,
		},
		{
			name:     "Different Street Name",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.StreetName = "MAINN" }),
			expected: 0.99,
		},
		{
			name:     "Different ZIPCode",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.ZIPCode = "90211" }),
			expected: 0.984,
		},
		{
			name:     "Different City",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.City = "OTHERTOWN" }),
			expected: 0.967,
		},
		{
			name:     "Different State",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.State = "NY" }),
			expected: 0.95,
		},
		{
			name:     "Different Street Suffix",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.StreetSuffix = "AVE" }),
			expected: 0.95,
		},
		{
			name:     "Different StreetPredir",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.StreetPredir = "S" }),
			expected: 0.975,
		},
		{
			name:     "Different StreetPostdir",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.StreetPostdir = "NW" }),
			expected: 0.975,
		},
		{
			name:     "Different Secondary Unit",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.SecondaryUnit = "APT 5C" }),
			expected: 0.997,
		},
		{
			name:     "Different Plus4",
			addr1:    base,
			addr2:    derive(func(addr *Address) { addr.Plus4 = "5678" }),
			expected: 0.975,
		},
		{
			name:     "POBox Instead of Street Address",
			addr1:    poBoxOnly,
			addr2:    poBoxOnly,
			expected: 1.0,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.addr1.Similarity(tc.addr2)
			require.InDelta(t, tc.expected, got, 0.01)
		})
	}
}

0 comments on commit d5ee4d4

Please sign in to comment.