Skip to content

Commit

Permalink
fix: fetch UK sanctions .ods from HTML
Browse files Browse the repository at this point in the history
  • Loading branch information
adamdecaf committed Dec 20, 2024
1 parent c7c8346 commit 7e6b0a0
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 7 deletions.
2 changes: 1 addition & 1 deletion cmd/server/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ func ukCSLRecords(logger log.Logger, initialDir string) ([]*csl.UKCSLRecord, err

func ukSanctionsListRecords(logger log.Logger, initialDir string) ([]*csl.UKSanctionsListRecord, error) {
file, err := csl.DownloadUKSanctionsList(logger, initialDir)
if err != nil {
if file == nil || err != nil {
logger.Warn().Logf("skipping UK Sanctions List download: %v", err)
// no error to return because we skip the download
return nil, nil
Expand Down
4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.20

require (
github.com/abadojack/whatlanggo v1.0.1
github.com/antchfx/htmlquery v1.3.3
github.com/antihax/optional v1.0.0
github.com/bbalet/stopwords v1.0.0
github.com/go-kit/kit v0.13.0
Expand All @@ -23,18 +24,21 @@ require (
)

require (
github.com/antchfx/xpath v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.6.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rickar/cal/v2 v2.1.13 // indirect
golang.org/x/net v0.18.0 // indirect
golang.org/x/sys v0.15.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/protobuf v1.33.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym
github.com/VividCortex/gohistogram v1.0.0 h1:6+hBz+qvs0JOrrNhhmR7lFxo5sINxBCGXrdtl/UvroE=
github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4=
github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
github.com/antchfx/htmlquery v1.3.3 h1:x6tVzrRhVNfECDaVxnZi1mEGrQg3mjE/rxbH2Pe6dNE=
github.com/antchfx/htmlquery v1.3.3/go.mod h1:WeU3N7/rL6mb6dCwtE30dURBnBieKDC/fR8t6X+cKjU=
github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U=
github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/antihax/optional v1.0.0 h1:xK2lYat7ZLaVVcIuj82J8kIro4V6kDe0AUDFboUCwcg=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo=
Expand Down Expand Up @@ -50,6 +54,8 @@ github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfU
github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
Expand Down Expand Up @@ -180,6 +186,8 @@ golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg=
golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
Expand Down
64 changes: 59 additions & 5 deletions pkg/csl/download_uk.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,22 @@
package csl

import (
"fmt"
"io"
"os"
"strings"

"github.com/moov-io/base/log"
"github.com/moov-io/base/strx"
"github.com/moov-io/watchman/pkg/download"

"github.com/antchfx/htmlquery"
)

var (
// taken from https://www.gov.uk/government/publications/financial-sanctions-consolidated-list-of-targets/consolidated-list-of-targets#contents
publicUKCSLDownloadURL = "https://ofsistorage.blob.core.windows.net/publishlive/2022format/ConList.csv"
ukCSLDownloadURL = strx.Or(os.Getenv("UK_CSL_DOWNLOAD_URL"), publicUKCSLDownloadURL)

// https://www.gov.uk/government/publications/the-uk-sanctions-list
publicUKSanctionsListURL = "https://assets.publishing.service.gov.uk/media/6756dae7f96f5424a4b87791/UK_Sanctions_List.ods"
ukSanctionsListURL = strx.Or(os.Getenv("UK_SANCTIONS_LIST_URL"), publicUKSanctionsListURL)
)

func DownloadUKCSL(logger log.Logger, initialDir string) (map[string]io.ReadCloser, error) {
Expand All @@ -36,7 +36,61 @@ func DownloadUKSanctionsList(logger log.Logger, initialDir string) (map[string]i
dl := download.New(logger, download.HTTPClient)

ukSanctionsNameAndSource := make(map[string]string)
ukSanctionsNameAndSource["UK_Sanctions_List.ods"] = ukSanctionsListURL

latestURL, err := fetchLatestUKSanctionsListURL(logger, initialDir)
if err != nil {
return nil, err
}
logger.Info().Logf("downloading UK sanctions from %s", latestURL)

ukSanctionsNameAndSource["UK_Sanctions_List.ods"] = latestURL

return dl.GetFiles(initialDir, ukSanctionsNameAndSource)
}

var (
// defaultUKSanctionsListHTML is the address of the HTML index page that is
// searched for a link to the latest UK Sanctions List .ods file. The value is
// taken from the UK_CSL_HTML_INDEX_URL environment variable, with the public
// gov.uk publication page as the alternative.
// NOTE(review): presumably strx.Or picks the first non-empty value -- confirm.
defaultUKSanctionsListHTML = strx.Or(os.Getenv("UK_CSL_HTML_INDEX_URL"), "https://www.gov.uk/government/publications/the-uk-sanctions-list")
)

// fetchLatestUKSanctionsListURL returns the URL of the current UK Sanctions
// List .ods file.
//
// When the UK_SANCTIONS_LIST_URL environment variable holds a non-blank value,
// that value (trimmed) is returned and the index page is not fetched. Otherwise
// the HTML page at defaultUKSanctionsListHTML is retrieved and the href of the
// first anchor whose class contains "govuk-link" and whose href ends in ".ods"
// is returned.
//
// An error is returned when the index page cannot be retrieved or parsed, when
// the xpath query fails, or when no matching link is present, so a nil error
// always comes with a non-empty URL.
func fetchLatestUKSanctionsListURL(logger log.Logger, initialDir string) (string, error) {
	if fromEnv := strings.TrimSpace(os.Getenv("UK_SANCTIONS_LIST_URL")); fromEnv != "" {
		return fromEnv, nil
	}

	// The index page is requested and looked up under this key.
	const indexName = "UK_Sanctions_List.ods"

	// Fetch the HTML page and look for the latest link
	dl := download.New(logger, download.HTTPClient)
	pages, err := dl.GetFiles(initialDir, map[string]string{indexName: defaultUKSanctionsListHTML})
	if err != nil {
		return "", fmt.Errorf("getting UK Sanctions html index: %w", err)
	}
	defer func() {
		for _, page := range pages {
			if page != nil {
				// Best-effort cleanup: the page has already been consumed (or
				// rejected) by the time this runs, so a Close error is not actionable.
				_ = page.Close()
			}
		}
	}()

	indexContents, exists := pages[indexName]
	if !exists {
		return "", fmt.Errorf("UK sanctions index page %s not found", defaultUKSanctionsListHTML)
	}

	index, err := htmlquery.Parse(indexContents)
	if err != nil {
		return "", fmt.Errorf("parsing UK sanctions index page: %w", err)
	}

	links, err := htmlquery.QueryAll(index, `//a[contains(@class, 'govuk-link') and contains(@href, '.ods')]`)
	if err != nil {
		return "", fmt.Errorf("html xpath failed: %w", err)
	}

	// The xpath only requires ".ods" somewhere in the href; insist on the suffix
	// so the returned URL points at the spreadsheet itself.
	for _, link := range links {
		for _, attr := range link.Attr {
			if attr.Key == "href" && strings.HasSuffix(attr.Val, ".ods") {
				return attr.Val, nil
			}
		}
	}

	// Report the miss explicitly instead of handing back an empty URL.
	return "", fmt.Errorf("no .ods link found on UK sanctions index page %s", defaultUKSanctionsListHTML)
}
16 changes: 15 additions & 1 deletion pkg/csl/download_uk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"testing"

"github.com/moov-io/base/log"
"github.com/stretchr/testify/require"
)

func TestUKCSLDownload(t *testing.T) {
Expand Down Expand Up @@ -76,12 +77,25 @@ func TestUKCSLDownload_initialDir(t *testing.T) {
}
}

// TestUKSanctionsListIndex checks that fetchLatestUKSanctionsListURL resolves a
// URL that names the UK_Sanctions_List.ods file. Unless UK_SANCTIONS_LIST_URL
// is set, the lookup downloads the index page, so the test is skipped in
// -short mode.
func TestUKSanctionsListIndex(t *testing.T) {
	if testing.Short() {
		// Skip (rather than return) so the run is reported as skipped, not passed.
		t.Skip("skipping UK sanctions index lookup in short mode")
	}

	logger := log.NewTestLogger()
	foundURL, err := fetchLatestUKSanctionsListURL(logger, "")
	require.NoError(t, err)

	require.Contains(t, foundURL, "UK_Sanctions_List.ods")
}

func TestUKSanctionsListDownload(t *testing.T) {
if testing.Short() {
return
}

file, err := DownloadUKSanctionsList(log.NewNopLogger(), "")
logger := log.NewTestLogger()
file, err := DownloadUKSanctionsList(logger, "")
if err != nil {
t.Fatal(err)
}
Expand Down

0 comments on commit 7e6b0a0

Please sign in to comment.