From 7e6b0a0963107bd70641d8a829620576a1943ff0 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Fri, 20 Dec 2024 12:42:01 -0600 Subject: [PATCH] fix: fetch UK sanctions .ods from HTML --- cmd/server/download.go | 2 +- go.mod | 4 +++ go.sum | 8 +++++ pkg/csl/download_uk.go | 64 ++++++++++++++++++++++++++++++++++--- pkg/csl/download_uk_test.go | 16 +++++++++- 5 files changed, 87 insertions(+), 7 deletions(-) diff --git a/cmd/server/download.go b/cmd/server/download.go index 24e70b59..923f2dcc 100644 --- a/cmd/server/download.go +++ b/cmd/server/download.go @@ -235,7 +235,7 @@ func ukCSLRecords(logger log.Logger, initialDir string) ([]*csl.UKCSLRecord, err func ukSanctionsListRecords(logger log.Logger, initialDir string) ([]*csl.UKSanctionsListRecord, error) { file, err := csl.DownloadUKSanctionsList(logger, initialDir) - if err != nil { + if file == nil || err != nil { logger.Warn().Logf("skipping UK Sanctions List download: %v", err) // no error to return because we skip the download return nil, nil diff --git a/go.mod b/go.mod index 07489416..45611c15 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.20 require ( github.com/abadojack/whatlanggo v1.0.1 + github.com/antchfx/htmlquery v1.3.3 github.com/antihax/optional v1.0.0 github.com/bbalet/stopwords v1.0.0 github.com/go-kit/kit v0.13.0 @@ -23,11 +24,13 @@ require ( ) require ( + github.com/antchfx/xpath v1.3.2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/go-kit/log v0.2.1 // indirect github.com/go-logfmt/logfmt v0.6.0 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -35,6 +38,7 @@ require ( github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect github.com/rickar/cal/v2 v2.1.13 // indirect + golang.org/x/net v0.18.0 // indirect golang.org/x/sys v0.15.0 // indirect google.golang.org/appengine v1.6.8 // indirect google.golang.org/protobuf v1.33.0 // indirect diff --git a/go.sum b/go.sum index c2eb583e..4f9d06d1 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,10 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/VividCortex/gohistogram v1.0.0 h1:6+hBz+qvs0JOrrNhhmR7lFxo5sINxBCGXrdtl/UvroE= github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4= github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc= +github.com/antchfx/htmlquery v1.3.3 h1:x6tVzrRhVNfECDaVxnZi1mEGrQg3mjE/rxbH2Pe6dNE= +github.com/antchfx/htmlquery v1.3.3/go.mod h1:WeU3N7/rL6mb6dCwtE30dURBnBieKDC/fR8t6X+cKjU= +github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U= +github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antihax/optional v1.0.0 h1:xK2lYat7ZLaVVcIuj82J8kIro4V6kDe0AUDFboUCwcg= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo= @@ -50,6 +54,8 @@ github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfU github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= @@ -180,6 +186,8 @@ golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg= +golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= diff --git a/pkg/csl/download_uk.go b/pkg/csl/download_uk.go index 0eaee4df..a4ecc0b3 100644 --- a/pkg/csl/download_uk.go +++ b/pkg/csl/download_uk.go @@ -5,22 +5,22 @@ package csl import ( + "fmt" "io" "os" + "strings" "github.com/moov-io/base/log" "github.com/moov-io/base/strx" "github.com/moov-io/watchman/pkg/download" + + "github.com/antchfx/htmlquery" ) var ( // taken from https://www.gov.uk/government/publications/financial-sanctions-consolidated-list-of-targets/consolidated-list-of-targets#contents publicUKCSLDownloadURL = "https://ofsistorage.blob.core.windows.net/publishlive/2022format/ConList.csv" ukCSLDownloadURL = strx.Or(os.Getenv("UK_CSL_DOWNLOAD_URL"), publicUKCSLDownloadURL) - - // https://www.gov.uk/government/publications/the-uk-sanctions-list - publicUKSanctionsListURL = "https://assets.publishing.service.gov.uk/media/6756dae7f96f5424a4b87791/UK_Sanctions_List.ods" - ukSanctionsListURL = strx.Or(os.Getenv("UK_SANCTIONS_LIST_URL"), publicUKSanctionsListURL) ) func DownloadUKCSL(logger log.Logger, initialDir string) (map[string]io.ReadCloser, error) { @@ -36,7 +36,61 @@ func DownloadUKSanctionsList(logger log.Logger, initialDir string) (map[string]i dl := download.New(logger, download.HTTPClient) ukSanctionsNameAndSource := make(map[string]string) - ukSanctionsNameAndSource["UK_Sanctions_List.ods"] = ukSanctionsListURL + + latestURL, err := fetchLatestUKSanctionsListURL(logger, initialDir) + if err != nil { + return nil, err + } + logger.Info().Logf("downloading UK sanctions from %s", latestURL) + + ukSanctionsNameAndSource["UK_Sanctions_List.ods"] = latestURL return dl.GetFiles(initialDir, ukSanctionsNameAndSource) } + +var ( + defaultUKSanctionsListHTML = strx.Or(os.Getenv("UK_CSL_HTML_INDEX_URL"), "https://www.gov.uk/government/publications/the-uk-sanctions-list") +) + +func fetchLatestUKSanctionsListURL(logger log.Logger, initialDir string) (string, error) { + fromEnv := strings.TrimSpace(os.Getenv("UK_SANCTIONS_LIST_URL")) + if fromEnv != "" { + return fromEnv, nil + } + + // Fetch the HTML page and look for the latest link + ukSanctionsNameAndSource := make(map[string]string) + ukSanctionsNameAndSource["UK_Sanctions_List.ods"] = defaultUKSanctionsListHTML + + dl := download.New(logger, download.HTTPClient) + + pages, err := dl.GetFiles(initialDir, ukSanctionsNameAndSource) + if err != nil { + return "", fmt.Errorf("getting UK Sanctions html index: %w", err) + } + + indexContents, exists := pages["UK_Sanctions_List.ods"] + if !exists { + return "", fmt.Errorf("UK sanctions index page %s not found", defaultUKSanctionsListHTML) + } + + index, err := htmlquery.Parse(indexContents) + if err != nil { + return "", fmt.Errorf("parsing UK sanctions index page: %w", err) + } + + links, err := htmlquery.QueryAll(index, `//a[contains(@class, 'govuk-link') and contains(@href, '.ods')]`) + if err != nil { + return "", fmt.Errorf("html xpath failed: %w", err) + } + + for _, link := range links { + for _, attr := range link.Attr { + if attr.Key == "href" && strings.HasSuffix(attr.Val, ".ods") { + return attr.Val, nil + } + } + } + + return "", nil +} diff --git a/pkg/csl/download_uk_test.go b/pkg/csl/download_uk_test.go index fd4432c4..156e2118 100644 --- a/pkg/csl/download_uk_test.go +++ b/pkg/csl/download_uk_test.go @@ -13,6 +13,7 @@ import ( "testing" "github.com/moov-io/base/log" + "github.com/stretchr/testify/require" ) func TestUKCSLDownload(t *testing.T) { @@ -76,12 +77,25 @@ func TestUKCSLDownload_initialDir(t *testing.T) { } } +func TestUKSanctionsListIndex(t *testing.T) { + if testing.Short() { + return + } + + logger := log.NewTestLogger() + foundURL, err := fetchLatestUKSanctionsListURL(logger, "") + require.NoError(t, err) + + require.Contains(t, foundURL, "UK_Sanctions_List.ods") +} + func TestUKSanctionsListDownload(t *testing.T) { if testing.Short() { return } - file, err := DownloadUKSanctionsList(log.NewNopLogger(), "") + logger := log.NewTestLogger() + file, err := DownloadUKSanctionsList(logger, "") if err != nil { t.Fatal(err) }