Skip to content

Commit

Permalink
feat: Use LLM to extract statements (#465)
Browse files Browse the repository at this point in the history
* feat: Use LLM to extract statements

* Use OpenAI API
  • Loading branch information
Betree authored Jul 17, 2024
1 parent f8d23cb commit 47f5840
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 2 deletions.
7 changes: 7 additions & 0 deletions apps/cf/config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,10 @@ config :algoliax,

# Default LLM used to extract statements from captions
config :cf,
  openai_model: "gpt-4o"

config :openai,
  beta: "assistants=v2",
  http_options: [recv_timeout: 30_000, timeout: 30_000]

# Import environment specific config. This must stay at the bottom of the
# file so that values from the environment file override the defaults above.
import_config "#{Mix.env()}.exs"
168 changes: 168 additions & 0 deletions apps/cf/lib/llms/statements_creator.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
defmodule CF.LLMs.StatementsCreator do
  @moduledoc """
  Functions to create statements from a video that has captions using LLMs
  """

  import Ecto.Query
  require EEx
  require Logger

  # Number of caption entries sent to the LLM in a single request
  @captions_chunk_size 300

  # Load prompt messages templates
  EEx.function_from_file(
    :defp,
    :generate_system_prompt,
    Path.join(__DIR__, "templates/statements_extractor_system_prompt.eex")
  )

  EEx.function_from_file(
    :defp,
    :generate_user_prompt,
    Path.join(__DIR__, "templates/statements_extractor_user_prompt.eex"),
    [
      :video,
      :captions
    ]
  )

  @doc """
  Create statements from a video that has captions using LLMs.

  Loads the video together with its most recently inserted captions, splits
  the parsed captions into chunks and, for each chunk, asks the LLM for
  statements, drops the ones that look like statements the video already has,
  inserts the remaining ones and broadcasts them on the video's statements
  channel.

  Returns a list with one `:ok` per processed chunk.

  Raises if the video or its captions cannot be found, if the LLM request
  fails, or if the LLM response does not have the expected format.
  """
  def process_video!(video_id) do
    DB.Schema.Video
    |> join(:inner, [v], vc in DB.Schema.VideoCaption, on: v.id == vc.video_id)
    |> where([v, vc], v.id == ^video_id)
    |> order_by([v, vc], desc: vc.inserted_at)
    |> limit(1)
    |> select([v, vc], {v, vc})
    |> DB.Repo.one()
    |> case do
      nil ->
        raise "Video or captions not found"

      {video, video_caption} ->
        video_caption.parsed
        |> chunk_captions()
        |> Enum.map(fn captions ->
          video
          |> get_llm_suggested_statements(captions)
          |> filter_known_statements(video)
          |> create_statements_from_inputs(video)
          |> broadcast_statements(video)

          # Pause between chunks before sending the next LLM request
          Process.sleep(500)
        end)
    end
  end

  # Chunk captions every time we reach the max caption length.
  # (Plain comment: `@doc` is discarded, with a warning, on private functions.)
  defp chunk_captions(captions) do
    # TODO: Add last captions from previous batch to preserve context
    Enum.chunk_every(captions, @captions_chunk_size)
  end

  # Asks the LLM for statements found in `captions` and returns the decoded
  # and validated list of `%{"text" => binary, "time" => integer}` maps.
  #
  # `retries` is the number of additional attempts made when the request
  # returns `{:error, _}`. It defaults to 0, so callers that do not pass it
  # (as `process_video!/1` does) never retry.
  defp get_llm_suggested_statements(video, captions, retries \\ 0) do
    OpenAI.chat_completion(
      model: Application.get_env(:cf, :openai_model),
      response_format: %{type: "json_object"},
      stream: false,
      messages: [
        %{
          role: "system",
          content: generate_system_prompt()
        },
        %{
          role: "user",
          content: generate_user_prompt(video, captions)
        }
      ]
    )
    |> case do
      {:ok, %{choices: choices}} ->
        choices
        |> List.first()
        |> get_in(["message", "content"])
        |> get_json_str_from_content!()
        |> Jason.decode!()
        |> Map.get("statements")
        |> check_statements_input_format!()

      {:error, error} when retries > 0 ->
        Logger.warn("Failed to get LLM suggested statements: #{inspect(error)}. Retrying...")
        Process.sleep(1000)
        get_llm_suggested_statements(video, captions, retries - 1)

      {:error, error} ->
        Logger.error(inspect(error))
        raise_llm_error!(error)
    end
  end

  # Re-raises real exceptions untouched. Any other error term (for example a
  # decoded API error body) cannot be given to `raise/1` directly, so it is
  # wrapped in a RuntimeError that carries the inspected term.
  defp raise_llm_error!(%{__exception__: true} = exception), do: raise(exception)

  defp raise_llm_error!(error) do
    raise "Failed to get LLM suggested statements: #{inspect(error)}"
  end

  # Ensures the LLM returned a list in which every entry has a binary "text"
  # and an integer "time". Returns the list unchanged, raises otherwise.
  #
  # Every entry is checked explicitly: a `for` generator with a pattern would
  # silently skip entries that do not match it, letting them through.
  defp check_statements_input_format!(statements_inputs) when is_list(statements_inputs) do
    Enum.each(statements_inputs, fn
      %{"text" => text, "time" => time} when is_binary(text) and is_integer(time) ->
        :ok

      _invalid ->
        raise "Invalid statement input format"
    end)

    statements_inputs
  end

  # Anything that is not a list (e.g. `nil` when the "statements" key is missing)
  defp check_statements_input_format!(_statements_inputs) do
    raise "Invalid statement input format"
  end

  # Remove statements when we already have a similar one at time/text:
  # within 5 seconds of each other and with a Jaro distance above 0.80.
  defp filter_known_statements(statements, video) do
    existing_statements =
      DB.Schema.Statement
      |> where([s], s.video_id == ^video.id)
      |> DB.Repo.all()

    Enum.reject(statements, fn %{"text" => text, "time" => time} ->
      Enum.any?(existing_statements, fn s ->
        s.time >= time - 5 and s.time <= time + 5 and String.jaro_distance(s.text, text) > 0.80
      end)
    end)
  end

  # Bulk-inserts the given statements for `video` and returns the inserted rows
  defp create_statements_from_inputs(statements_inputs, video) do
    inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.truncate(:second)

    {_nb_statements, statements} =
      DB.Repo.insert_all(
        DB.Schema.Statement,
        Enum.map(statements_inputs, fn %{"text" => text, "time" => time} ->
          %{
            video_id: video.id,
            text: text,
            time: time,
            inserted_at: inserted_at,
            updated_at: inserted_at
          }
        end),
        returning: true
      )

    statements
  end

  # Broadcasts a "statement_added" event for each statement on the video's
  # statements channel
  defp broadcast_statements(statements, video) do
    statements
    |> Enum.map(fn statement ->
      CF.RestApi.Endpoint.broadcast(
        "statements:video:#{DB.Type.VideoHashId.encode(video.id)}",
        "statement_added",
        CF.RestApi.StatementView.render("show.json", statement: statement)
      )
    end)
  end

  # JSON content can optionally be wrapped in a ```json ... ``` block
  defp get_json_str_from_content!(content) do
    case Regex.scan(~r/```json\n(.+)\n```/mis, content) do
      [[_, json_str]] -> json_str
      _ -> content
    end
  end
end
56 changes: 56 additions & 0 deletions apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Mission

Ta tâche est d'extraire des citations intéressantes à vérifier depuis les sous-titres d'une vidéo, ainsi que le timecode du 1er mot de la citation. Le texte peut contenir des fautes ou des mots mal reconnus, tu les corrigeras. Tu peux aussi résumer ou remplacer certaines parties non-essentielles par "[...]" pour raccourcir la citation.

Renvoie uniquement le résultat en JSON, **sans aucun commentaire ni conclusion**.

# Comment choisir les extraits à vérifier

Pour être pertinente, une citation doit :
- être vérifiable grâce à l'exposition de faits
- faire mention d'une source ou d'un contenu que l'on peut vérifier
- présenter une information unique (découpe les citations qui présentent plusieurs éléments)
Et remplir au moins un des critères suivants :
- présenter des éléments incomplets ou approximatifs
- présenter un argument fallacieux, trompeur ou mensonger
- présenter des informations intéressantes à vérifier

Ne méritent pas de travail de vérification :
- les évidences comme "Le ciel est bleu !"
- les anecdotes personnelles (ex: "ça a changé ma vie")
- les figures de style et l'humour (comme les hyperboles, les métaphores, etc)
- les erreurs mineures
- les opinions personnelles ("j'aime ça")

# Exemple

## Input

```json
{
"video": {
"title": "Thinkerview - La diagonale du vide en France"
},
"captions": [
{ "start": 10, "text": "Cette mesure sociale a été un désastre de la pensée ça ne m'évoque que du dégoût elle n'a fait que créer une augmentation du chômage et a provoqué de nombreuses critiques de l'UE c'était pour moi une pure folie" },
{ "start": 85, "text": "mais parlons de la diagonnale du vite il y a d'autres zones en France qui sont très peuplées elle s'affiche ici et juste et oui je sais effectivement je pense que je peux tenter une" },
{ "start": 89, "text": "reconversion à devenir présentateur météo" },
{ "start": 94, "text": "dans les zones que vous voyez ici on compte seulement 6,5% de la population française métropolitaine pourtant et bien ces espaces" },
{ "start": 102, "text": "représentent 42% du territoire national mais alors pourquoi la diagonale du vide comme" },
{ "start": 110, "text": "nom? Ça a changé ma vie quand je l'ai découvert" }
]
}
```

## Output

```json
{
"statements": [
{ "time": 10, "text": "Cette mesure sociale [...] n'a fait que créer une augmentation du chômage" },
{ "time": 10, "text": "Cette mesure sociale [...] a provoqué de nombreuses critiques de l'UE" },
{ "time": 94, "text": "ici on compte seulement 6,5% de la population française métropolitaine" },
{ "time": 94, "text": "ces espaces représentent 42% du territoire national" }
]
}
```
11 changes: 11 additions & 0 deletions apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
```json
{
  "video": {
    "title": <%= Jason.encode!(video.title) %>
  },
  "captions": <%=
    # The title above is JSON-encoded so quotes or backslashes in it cannot
    # break the payload. For captions, only the start time (floored to an
    # integer) and the trimmed text are sent to the LLM.
    captions
    |> Enum.map(fn caption ->
      %{
        start: floor(caption["start"]),
        text: String.trim(caption["text"])
      }
    end)
    |> Jason.encode!()
  %>
}
```
2 changes: 2 additions & 0 deletions apps/cf/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ defmodule CF.Mixfile do
{:sweet_xml, "~> 0.6"},
{:burnex, "~> 3.1"},
{:yaml_elixir, "~> 2.9.0"},
{:jason, "~> 1.4"},
{:openai, "~> 0.6.1"},

# ---- Internal ----
{:db, in_umbrella: true},
Expand Down
6 changes: 6 additions & 0 deletions apps/cf_graphql/lib/resolvers/videos.ex
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,10 @@ defmodule CF.Graphql.Resolvers.Videos do
|> Repo.all()
|> Enum.group_by(& &1.video_id)
end

# Runs the LLM statements extraction for the given video and resolves to the
# video itself.
#
# Returns `{:error, "Video not found"}` when no video has this id, so the
# client gets a regular GraphQL error instead of a crashed request.
# The extraction runs synchronously inside the request and
# `process_video!/1` still raises on failure (e.g. when the video has no
# captions).
def start_automatic_statements_extraction(_root, %{video_id: video_id}, _info) do
  case DB.Repo.get(DB.Schema.Video, video_id) do
    nil ->
      {:error, "Video not found"}

    video ->
      CF.LLMs.StatementsCreator.process_video!(video.id)
      {:ok, video}
  end
end
end
32 changes: 32 additions & 0 deletions apps/cf_graphql/lib/schema/middleware/require_reputation.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
defmodule CF.Graphql.Schema.Middleware.RequireReputation do
  @moduledoc """
  Absinthe middleware that rejects a resolution when the current user's
  reputation is below the threshold given as the middleware option.
  """

  @behaviour Absinthe.Middleware

  @doc false
  def call(resolution, reputation) do
    case resolution.context[:user] do
      nil ->
        Absinthe.Resolution.put_result(resolution, {:error, "unauthorized"})

      user ->
        enforce_minimum(resolution, user.reputation, reputation)
    end
  end

  # Puts an "unauthorized" error on the resolution when the user's reputation
  # is strictly below the required one; otherwise returns it untouched.
  #
  # NOTE(review): a user whose reputation is nil is let through without any
  # check — confirm this fail-open behaviour is intended.
  defp enforce_minimum(resolution, user_reputation, required_reputation) do
    if user_reputation && user_reputation < required_reputation do
      error = %{
        code: "unauthorized",
        message: "You do not have the required reputation to perform this action.",
        details: %{
          user_reputation: user_reputation,
          required_reputation: required_reputation
        }
      }

      Absinthe.Resolution.put_result(resolution, {:error, error})
    else
      resolution
    end
  end
end
10 changes: 10 additions & 0 deletions apps/cf_graphql/lib/schema/schema.ex
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,15 @@ defmodule CF.Graphql.Schema do

resolve(&Resolvers.Notifications.update_subscription/3)
end

# Field that triggers the LLM-based statements extraction for one video.
# Resolves to the video whose id was given.
@desc "Use this to start the automatic statements extraction job. Requires elevated permissions."
field :start_automatic_statements_extraction, :video do
# NOTE(review): presumably rejects requests without a logged-in user — confirm in RequireAuthentication
middleware(Middleware.RequireAuthentication)
# 450 is the threshold passed to RequireReputation: users with a lower reputation get an "unauthorized" error
middleware(Middleware.RequireReputation, 450)

# Id of the video to extract statements from
arg(:video_id, non_null(:id))

resolve(&Resolvers.Videos.start_automatic_statements_extraction/3)
end
end
end
4 changes: 2 additions & 2 deletions apps/cf_jobs/config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ config :cf_jobs, CF.Jobs.Scheduler,
],
# Captions
download_captions: [
# every 10 minutes
schedule: "*/10 * * * *",
# every minute
schedule: "*/1 * * * *",
task: {CF.Jobs.DownloadCaptions, :update, []},
overlap: false
]
Expand Down
5 changes: 5 additions & 0 deletions config/releases.exs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ config :cf,
hard_limitations_period: load_int.({"hard_limitations_period", 3 * 60 * 60}),
invitation_system: load_bool.({"invitation_system", "false"}),
youtube_api_key: load_secret.({"youtube_api_key", nil}),
openai_model: load_secret.("openai_model"),
oauth: [
facebook: [
client_id: load_secret.("facebook_app_id"),
Expand All @@ -95,6 +96,10 @@ config :cf,
]
]

# OpenAI credentials, read from secrets at release start.
# `load_secret` is an anonymous function, so it must be called with `.()`.
config :openai,
  api_key: load_secret.("openai_api_key"),
  organization_key: load_secret.("openai_organization_key")

config :cf, CF.Authenticator.GuardianImpl, secret_key: load_secret.("secret_key_base")

config :cf, CF.Mailer,
Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"not_qwerty123": {:hex, :not_qwerty123, "2.2.1", "656e940159517f2d2f07ea0bb14e4ad376d176b5f4de07115e7a64902b5e13e3", [:mix], [{:gettext, "~> 0.13", [hex: :gettext, repo: "hexpm", optional: false]}], "hexpm", "7637173b09eb7b26b29925039d5b92f7107c94a27cbe4d2ba8efb8b84d060c4b"},
"oauth2": {:hex, :oauth2, "0.9.4", "632e8e8826a45e33ac2ea5ac66dcc019ba6bb5a0d2ba77e342d33e3b7b252c6e", [:mix], [{:hackney, "~> 1.7", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "407c6b9f60aa0d01b915e2347dc6be78adca706a37f0c530808942da3b62e7af"},
"openai": {:hex, :openai, "0.6.1", "ad86b5b253969fe6d59896d295b1a573cbe44d586fd00bfa8cf3f440d800b4d6", [:mix], [{:httpoison, "~> 2.0", [hex: :httpoison, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "aea82953ea82fcbf91d0474125943becf5d8318af53081ed722a0f26d4346353"},
"parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"phoenix": {:hex, :phoenix, "1.5.14", "2d5db884be496eefa5157505ec0134e66187cb416c072272420c5509d67bf808", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.13 or ~> 3.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.0", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.10", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.2", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.1.2 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "207f1aa5520320cbb7940d7ff2dde2342162cf513875848f88249ea0ba02fef7"},
Expand Down

0 comments on commit 47f5840

Please sign in to comment.