
Analyze the structure of existing NeTEx files for GT7 (#4259)
* Trailing whitespace in readme

* Modest README for the scripts directory

* Refactor the netex script

* NeTEx file structure analyzer

Produces a CSV.

* Hierarchy level of each file

* Please the linter
ptitfred authored Oct 21, 2024
1 parent 1a5b798 commit fd5f974
Showing 4 changed files with 137 additions and 47 deletions.
README.md: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ mix cmd --app transport mix test --color
 mix cmd --app unlock mix test --color
 # or, for a single file, or single test
 mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs
-mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs
+mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs:8
 ```

scripts/README.md: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# Scripts

An unstructured collection of scripts to explore or patch data. They can be used to
fix production data, run one-off analyses, or test a piece of code.

## Usage

Assuming you've sourced the required environment variables (mostly the database connection settings), you can launch a given script with `mix run`:

```
mix run scripts/my-script.exs
```
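
For example, a minimal script in the same spirit as the ones added in this commit could count the NeTEx resources in the database. This is a hypothetical sketch (the file name is illustrative; `DB.Resource` and `DB.Repo` are the schema and repo used by the scripts below):

```elixir
# scripts/count-netex-resources.exs (hypothetical example)
DB.Resource
|> DB.Repo.all()
# Keep only resources declared with the NeTEx format
|> Enum.count(&(&1.format == "NeTEx"))
|> IO.inspect(label: "NeTEx resources")
```

It would then be run with `mix run scripts/count-netex-resources.exs`.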
scripts/netex_analyzer.exs: 42 additions & 46 deletions
@@ -20,57 +20,53 @@ df =
   end)
   |> Enum.filter(&(&1.unverified_format == "NeTEx"))
 
-netex =
-  df
-  |> Task.async_stream(
-    fn r ->
-      url = r.url
-      file = Path.join("cache-dir", "resource-#{r.id}.dat")
-      status_file = file <> ".status.json"
+download_resource = fn r ->
+  url = r.url
+  file = Path.join("cache-dir", "resource-#{r.id}.dat")
+  status_file = file <> ".status.json"
 
-      unless File.exists?(status_file) do
-        IO.puts("Saving #{url}")
-        url = if String.contains?(url, "|"), do: URI.encode(url), else: url
+  unless File.exists?(status_file) do
+    IO.puts("Saving #{url}")
+    url = if String.contains?(url, "|"), do: URI.encode(url), else: url
 
-        %{status: status} =
-          Transport.HTTPClient.get!(url,
-            decode_body: false,
-            compressed: false,
-            into: File.stream!(file)
-          )
+    %{status: status} =
+      Transport.HTTPClient.get!(url,
+        decode_body: false,
+        compressed: false,
+        into: File.stream!(file)
+      )
 
-        File.write!(status_file, %{status: status} |> Jason.encode!())
-      end
+    File.write!(status_file, %{status: status} |> Jason.encode!())
+  end
 
-      %{"status" => status} = File.read!(status_file) |> Jason.decode!()
+  %{"status" => status} = File.read!(status_file) |> Jason.decode!()
 
-      r
-      |> Map.put(:http_status, status)
-      |> Map.put(:local_path, file)
-    end,
-    max_concurrency: 10,
-    timeout: 120_000
-  )
-  |> Stream.map(fn {:ok, result} -> result end)
-  |> Stream.reject(&is_nil(&1))
-  |> Task.async_stream(
-    fn r ->
-      IO.puts("Processing file #{r.id}")
+  r
+  |> Map.put(:http_status, status)
+  |> Map.put(:local_path, file)
+end
+
+count_relevant_stop_places_per_resource = fn r ->
+  IO.puts("Processing file #{r.id}")
 
-      try do
-        count =
-          Transport.NeTEx.read_all_stop_places(r.local_path)
-          |> Enum.flat_map(fn {_file, stops} -> stops end)
-          # some stop places have no latitude in NeTEx
-          |> Enum.reject(fn p -> is_nil(p[:latitude]) end)
-          |> Enum.count()
+  try do
+    count =
+      Transport.NeTEx.read_all_stop_places(r.local_path)
+      |> Enum.flat_map(fn {_file, stops} -> stops end)
+      # some stop places have no latitude in NeTEx
+      |> Enum.reject(fn p -> is_nil(p[:latitude]) end)
+      |> Enum.count()
 
-        IO.puts("#{count} StopPlaces detected")
-      rescue
-        e -> IO.puts("Som'thing bad happened")
-      end
-    end,
-    max_concurrency: 5,
-    timeout: 60_000 * 5
-  )
+    IO.puts("#{count} StopPlaces detected")
+  rescue
+    _ -> IO.puts("Som'thing bad happened")
+  end
+end
 
+netex =
+  df
+  |> Task.async_stream(download_resource, max_concurrency: 10, timeout: 120_000)
+  |> Stream.map(fn {:ok, result} -> result end)
+  |> Stream.reject(&is_nil(&1))
+  |> Task.async_stream(count_relevant_stop_places_per_resource, max_concurrency: 5, timeout: 60_000 * 5)
+  |> Stream.run()
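
A subtlety in both pipelines above: `Task.async_stream/3` emits an `{:ok, result}` tuple for each task that completes, which is why the results are unwrapped with `Stream.map(fn {:ok, result} -> result end)`. A minimal, self-contained sketch of the pattern, on toy data rather than real resources:

```elixir
1..4
|> Task.async_stream(fn i -> i * 10 end, max_concurrency: 2, timeout: 5_000)
# Each element arrives as {:ok, result}; unwrap before further processing
|> Stream.map(fn {:ok, result} -> result end)
|> Enum.to_list()
# => [10, 20, 30, 40]
```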
scripts/netex_layout_analyzer.exs: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
resources =
  DB.Resource
  |> DB.Repo.all()

# Log how many resources exist in total
resources
|> Enum.count()
|> IO.inspect()

df =
  resources
  |> Enum.map(fn r ->
    %{
      id: r.id,
      url: r.url,
      title: r.title,
      unverified_format: r.format,
      description: r.description
    }
  end)
  |> Enum.filter(&(&1.unverified_format == "NeTEx"))

# Download each resource once; the HTTP status is cached next to the
# payload so that re-runs skip already-downloaded files.
download_resource = fn r ->
  url = r.url
  file = Path.join("cache-dir", "resource-#{r.id}.dat")
  status_file = file <> ".status.json"

  unless File.exists?(status_file) do
    IO.puts("Saving #{url}")
    url = if String.contains?(url, "|"), do: URI.encode(url), else: url

    %{status: status} =
      Transport.HTTPClient.get!(url,
        decode_body: false,
        compressed: false,
        into: File.stream!(file)
      )

    File.write!(status_file, %{status: status} |> Jason.encode!())
  end

  %{"status" => status} = File.read!(status_file) |> Jason.decode!()

  r
  |> Map.put(:http_status, status)
  |> Map.put(:local_path, file)
end

# Depth of an entry inside its archive: "a/b/c.xml" has level 3
hierarchy_level = fn file -> file |> String.split("/") |> Enum.count() end

# One CSV row per XML entry found in the archive (directory entries are skipped)
dump_netex_files = fn r ->
  IO.puts("Processing file #{r.id}")

  url = "https://transport.data.gouv.fr/resources/#{r.id}"

  result =
    try do
      Transport.NeTEx.read_all_stop_places(r.local_path)
      |> Enum.map(fn {file, _stops} -> file end)
      |> Enum.reject(fn file -> String.ends_with?(file, "/") end)
      |> Enum.map(fn file -> [url, r.title, r.url, file, hierarchy_level.(file)] end)
    rescue
      _ ->
        IO.puts("Som'thing bad happened")
        []
    end

  NimbleCSV.RFC4180.dump_to_iodata(result)
end

output_file = "netex_layout_analysis.csv"

# Write the header row first; each task below appends its rows
File.write(output_file, NimbleCSV.RFC4180.dump_to_iodata([~w(resource title url file hierarchy)]))

df
|> Task.async_stream(download_resource, max_concurrency: 10, timeout: 120_000)
|> Stream.map(fn {:ok, result} -> result end)
|> Stream.reject(&is_nil(&1))
|> Task.async_stream(dump_netex_files, max_concurrency: 5, timeout: 60_000 * 5)
|> Stream.map(fn {:ok, result} -> result end)
|> Stream.into(File.stream!(output_file, [:append, :utf8]))
|> Stream.run()
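
The `hierarchy` column is computed by `hierarchy_level`, which simply counts the `/`-separated segments of each entry's path inside the archive. A quick illustration with made-up file names:

```elixir
hierarchy_level = fn file -> file |> String.split("/") |> Enum.count() end

hierarchy_level.("arrets.xml")                # => 1: entry at the archive root
hierarchy_level.("reseau/ligne_1/arrets.xml") # => 3: nested two directories deep
```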
