From b0cc0c36ecbd0bb015284f3e1898831e89a306f5 Mon Sep 17 00:00:00 2001 From: Benjamin Piouffle Date: Wed, 17 Jul 2024 17:31:52 +0200 Subject: [PATCH] fix(Captions): use proper XML parsing --- .../cf/lib/videos/captions_fetcher_youtube.ex | 39 +++++++++---------- apps/cf/mix.exs | 1 + .../lib/schema/types/video_caption.ex | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/apps/cf/lib/videos/captions_fetcher_youtube.ex b/apps/cf/lib/videos/captions_fetcher_youtube.ex index 699e6ca4..346e37f4 100644 --- a/apps/cf/lib/videos/captions_fetcher_youtube.ex +++ b/apps/cf/lib/videos/captions_fetcher_youtube.ex @@ -7,6 +7,7 @@ defmodule CF.Videos.CaptionsFetcherYoutube do @behaviour CF.Videos.CaptionsFetcher require Logger + import SweetXml @impl true def fetch(%{youtube_id: youtube_id, language: language}) do @@ -69,31 +70,29 @@ defmodule CF.Videos.CaptionsFetcherYoutube do defp process_transcript(transcript) do transcript - |> String.replace(~r/^<\?xml version="1.0" encoding="utf-8"\?>/, "") - |> String.replace("", "") - |> String.split("") - |> Enum.filter(&(String.trim(&1) != "")) - |> Enum.map(&process_line/1) + |> SweetXml.xpath( + ~x"//transcript/text"l, + text: ~x"./text()"s |> transform_by(&clean_text/1), + start: ~x"./@start"s |> transform_by(&parse_float/1), + duration: ~x"./@dur"os |> transform_by(&parse_float/1) + ) + |> Enum.filter(fn %{text: text, start: start} -> + start != nil and text != nil and text != "" + end) end - defp process_line(line) do - %{"start" => start} = Regex.named_captures(~r/start="(?[\d.]+)"/, line) - %{"dur" => dur} = Regex.named_captures(~r/dur="(?[\d.]+)"/, line) - - text = - line - |> String.replace("&", "&") - |> String.replace(~r//, "") - |> String.replace(~r"]+(>|$)", "") - |> HtmlEntities.decode() - |> String.trim() - - %{start: parse_float(start), duration: parse_float(dur), text: text} + defp clean_text(text) do + text + |> String.replace("&", "&") + |> HtmlEntities.decode() + |> String.trim() end defp parse_float(val) do - {num, _} = Float.parse(val) - num + case Float.parse(val) do + {num, _} -> num + _ -> nil + end end # Below is an implementation using the official YouTube API, but it requires OAuth2 authentication. diff --git a/apps/cf/mix.exs b/apps/cf/mix.exs index e539642e..bba8d07c 100644 --- a/apps/cf/mix.exs +++ b/apps/cf/mix.exs @@ -60,6 +60,7 @@ defmodule CF.Mixfile do {:yaml_elixir, "~> 2.9.0"}, {:jason, "~> 1.4"}, {:openai, "~> 0.6.1"}, + {:sweet_xml, "~> 0.7.4"}, # ---- Internal ---- {:db, in_umbrella: true}, diff --git a/apps/cf_graphql/lib/schema/types/video_caption.ex b/apps/cf_graphql/lib/schema/types/video_caption.ex index 29b5eba6..6a964a4f 100644 --- a/apps/cf_graphql/lib/schema/types/video_caption.ex +++ b/apps/cf_graphql/lib/schema/types/video_caption.ex @@ -13,6 +13,6 @@ defmodule CF.Graphql.Schema.Types.VideoCaption do @desc "Caption start time (in seconds)" field(:start, non_null(:float)) @desc "Caption duration (in seconds)" - field(:duration, non_null(:float)) + field(:duration, :float) end end