diff --git a/apps/transport/lib/jobs/new_datagouv_datasets_job.ex b/apps/transport/lib/jobs/new_datagouv_datasets_job.ex index a47a8756ca..6f593f6ebe 100644 --- a/apps/transport/lib/jobs/new_datagouv_datasets_job.ex +++ b/apps/transport/lib/jobs/new_datagouv_datasets_job.ex @@ -88,19 +88,35 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do DateTime.compare(datetime, dt_limit) == :gt end + @doc """ + Useful to ignore specific datasets/organizations. + + iex> ignore_dataset?(%{"organization" => %{"id" => "5a83f81fc751df6f8573eb8a"}, "title" => "BDTOPO© - Chefs-Lieux pour le département de l'Eure-et-Loir"}) + true + """ + def ignore_dataset?(%{"organization" => %{"id" => "5a83f81fc751df6f8573eb8a"}, "title" => title}) do + String.contains?(title, "BDTOPO") + end + + def ignore_dataset?(%{}), do: false + def dataset_is_relevant?(%{} = dataset) do - match_on_dataset = - [&tags_is_relevant?/1, &description_is_relevant?/1, &title_is_relevant?/1] - |> Enum.map(& &1.(dataset)) - |> Enum.any?() - - match_on_resources = - dataset - |> Map.fetch!("resources") - |> Enum.map(&resource_is_relevant?/1) - |> Enum.any?() - - match_on_dataset or match_on_resources + if ignore_dataset?(dataset) do + false + else + match_on_dataset = + [&tags_is_relevant?/1, &description_is_relevant?/1, &title_is_relevant?/1] + |> Enum.map(& &1.(dataset)) + |> Enum.any?() + + match_on_resources = + dataset + |> Map.fetch!("resources") + |> Enum.map(&resource_is_relevant?/1) + |> Enum.any?() + + match_on_dataset or match_on_resources + end end defp title_is_relevant?(%{"title" => title}), do: string_matches?(title) @@ -133,7 +149,7 @@ defmodule Transport.Jobs.NewDatagouvDatasetsJob do MapSet.member?(@relevant_formats, String.downcase(format)) end - defp resource_schema_is_relevant?(%{"schema" => %{"name" => "etalab/schema-irve"}}), do: false + defp resource_schema_is_relevant?(%{"schema" => %{"name" => "etalab/schema-irve-statique"}}), do: false defp resource_schema_is_relevant?(%{"schema" => %{"name" => schema_name}}) do schema_name in Map.keys(Schemas.transport_schemas()) diff --git a/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs b/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs index 0ee8558f46..44cea262c3 100644 --- a/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs +++ b/apps/transport/test/transport/jobs/new_datagouv_datasets_job_test.exs @@ -40,8 +40,23 @@ defmodule Transport.Test.Transport.Jobs.NewDatagouvDatasetsJobTest do refute NewDatagouvDatasetsJob.dataset_is_relevant?(%{ base - | "resources" => [%{"format" => "csv", "schema" => %{"name" => "etalab/schema-irve"}, "description" => ""}] + | "resources" => [ + %{"format" => "csv", "schema" => %{"name" => "etalab/schema-irve-statique"}, "description" => ""} + ] }) + + # Uses `ignore_dataset?/1` to ignore specific datasets + bdtopo_args = + Map.merge(base, %{ + "title" => "BDTOPO© - Chefs-Lieux pour le département de l'Eure-et-Loir", + "tags" => ["transport"] + }) + + assert NewDatagouvDatasetsJob.dataset_is_relevant?(bdtopo_args) + + refute NewDatagouvDatasetsJob.dataset_is_relevant?( + Map.merge(bdtopo_args, %{"organization" => %{"id" => "5a83f81fc751df6f8573eb8a"}}) + ) end test "filtered_datasets" do