-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Use LLM to extract statements (#465)
* feat: Use LLM to extract statements * Use OpenAI API
- Loading branch information
Showing
11 changed files
with
300 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
defmodule CF.LLMs.StatementsCreator do
  @moduledoc """
  Functions to create statements from a video that has captions using LLMs
  """

  import Ecto.Query
  require EEx
  require Logger

  # Maximum number of caption entries sent to the LLM in a single request.
  @captions_chunk_size 300

  # Load prompt messages templates. Each call compiles the template file into
  # a private function of this module.
  EEx.function_from_file(
    :defp,
    :generate_system_prompt,
    Path.join(__DIR__, "templates/statements_extractor_system_prompt.eex")
  )

  EEx.function_from_file(
    :defp,
    :generate_user_prompt,
    Path.join(__DIR__, "templates/statements_extractor_user_prompt.eex"),
    [
      :video,
      :captions
    ]
  )

  @doc """
  Create statements from a video that has captions using LLMs.

  Loads the video together with its most recently inserted captions, splits
  the parsed captions into chunks and, for each chunk: asks the LLM for
  suggested statements, drops the ones that look like statements already
  stored for the video, inserts the remaining ones and broadcasts them.

  Raises a `RuntimeError` ("Video or captions not found") when no video with
  captions matches `video_id`. Errors raised while calling the LLM or while
  validating its answer are propagated to the caller.
  """
  def process_video!(video_id) do
    DB.Schema.Video
    |> join(:inner, [v], vc in DB.Schema.VideoCaption, on: v.id == vc.video_id)
    |> where([v, vc], v.id == ^video_id)
    |> order_by([v, vc], desc: vc.inserted_at)
    |> limit(1)
    |> select([v, vc], {v, vc})
    |> DB.Repo.one()
    |> case do
      nil ->
        raise "Video or captions not found"

      {video, video_caption} ->
        video_caption.parsed
        |> chunk_captions()
        |> Enum.map(fn captions ->
          video
          |> get_llm_suggested_statements(captions)
          |> filter_known_statements(video)
          |> create_statements_from_inputs(video)
          |> broadcast_statements(video)

          # Pause between chunks (presumably to throttle calls to the LLM API)
          Process.sleep(500)
        end)
    end
  end

  # Chunk captions every time we reach the max number of captions per request.
  # (Plain comment on purpose: `@doc` is discarded, with a compiler warning,
  # on private functions.)
  defp chunk_captions(captions) do
    # TODO: Add last captions from previous batch to preserve context
    Enum.chunk_every(captions, @captions_chunk_size)
  end

  # Ask the LLM for statements worth checking in `captions`.
  #
  # Returns the validated list of `%{"text" => binary, "time" => integer}` maps
  # found under the "statements" key of the JSON answer.
  #
  # `retries` is the number of extra attempts made when the API call returns
  # `{:error, _}`. It defaults to 0, and `process_video!/1` relies on that
  # default, so no retry happens unless a caller passes a positive value.
  # Once retries are exhausted the error is logged and raised.
  defp get_llm_suggested_statements(video, captions, retries \\ 0) do
    OpenAI.chat_completion(
      model: Application.get_env(:cf, :openai_model),
      response_format: %{type: "json_object"},
      stream: false,
      messages: [
        %{
          role: "system",
          content: generate_system_prompt()
        },
        %{
          role: "user",
          content: generate_user_prompt(video, captions)
        }
      ]
    )
    |> case do
      {:ok, %{choices: choices}} ->
        choices
        |> List.first()
        |> get_in(["message", "content"])
        |> get_json_str_from_content!()
        |> Jason.decode!()
        |> Map.get("statements")
        |> check_statements_input_format!()

      {:error, error} ->
        if retries > 0 do
          Logger.warn("Failed to get LLM suggested statements: #{inspect(error)}. Retrying...")
          Process.sleep(1000)
          get_llm_suggested_statements(video, captions, retries - 1)
        else
          Logger.error(inspect(error))
          raise_llm_error!(error)
        end
    end
  end

  # `raise/1` only accepts a module, a string or an exception struct. The error
  # term returned by the API client may be something else (a map, an atom...),
  # so anything that is not already an exception is wrapped in a RuntimeError
  # instead of triggering an unrelated ArgumentError/UndefinedFunctionError.
  defp raise_llm_error!(%{__exception__: true} = error), do: raise(error)

  defp raise_llm_error!(error) do
    raise "Failed to get LLM suggested statements: #{inspect(error)}"
  end

  # Ensure the LLM answer is a list of maps holding a binary "text" and an
  # integer "time". Returns the list unchanged, raises a RuntimeError otherwise.
  #
  # Every entry is checked explicitly: a `for` generator pattern would silently
  # skip entries that miss one of the keys and let them crash later on.
  defp check_statements_input_format!(statements_inputs) when is_list(statements_inputs) do
    Enum.each(statements_inputs, fn
      %{"text" => text, "time" => time} when is_binary(text) and is_integer(time) ->
        :ok

      _invalid ->
        raise "Invalid statement input format"
    end)

    statements_inputs
  end

  # Covers a missing "statements" key (nil) or any non-list value
  defp check_statements_input_format!(_not_a_list) do
    raise "Invalid statement input format"
  end

  # Remove statements when we already have a similar one at time/text:
  # an existing statement within 5 units of `time` whose text has a Jaro
  # similarity above 0.80 with the suggested text.
  defp filter_known_statements(statements, video) do
    existing_statements =
      DB.Schema.Statement
      |> where([s], s.video_id == ^video.id)
      |> DB.Repo.all()

    Enum.reject(statements, fn %{"text" => text, "time" => time} ->
      Enum.any?(existing_statements, fn s ->
        s.time >= time - 5 and s.time <= time + 5 and String.jaro_distance(s.text, text) > 0.80
      end)
    end)
  end

  # Insert all suggested statements for `video` in a single query and return
  # the inserted records.
  defp create_statements_from_inputs(statements_inputs, video) do
    inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.truncate(:second)

    {_nb_statements, statements} =
      DB.Repo.insert_all(
        DB.Schema.Statement,
        Enum.map(statements_inputs, fn %{"text" => text, "time" => time} ->
          %{
            video_id: video.id,
            text: text,
            time: time,
            inserted_at: inserted_at,
            updated_at: inserted_at
          }
        end),
        returning: true
      )

    statements
  end

  # Broadcast a "statement_added" event on the video's statements channel for
  # each statement. Called for its side effects only.
  defp broadcast_statements(statements, video) do
    Enum.each(statements, fn statement ->
      CF.RestApi.Endpoint.broadcast(
        "statements:video:#{DB.Type.VideoHashId.encode(video.id)}",
        "statement_added",
        CF.RestApi.StatementView.render("show.json", statement: statement)
      )
    end)
  end

  # JSON content can optionally be wrapped in a ```json ... ``` block.
  # Returns the inner JSON string when exactly one such block is found,
  # otherwise the content untouched.
  defp get_json_str_from_content!(content) do
    case Regex.scan(~r/```json\n(.+)\n```/mis, content) do
      [[_, json_str]] -> json_str
      _ -> content
    end
  end
end
56 changes: 56 additions & 0 deletions
56
apps/cf/lib/llms/templates/statements_extractor_system_prompt.eex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# Mission | ||
|
||
Ta tâche est d'extraire des citations intéressantes à vérifier depuis les sous-titres d'une vidéo, ainsi que le timecode du 1er mot de la citation. Le texte peut contenir des fautes ou des mots mal reconnus, tu les corrigeras. Tu peux aussi résumer ou remplacer certaines parties non-essentielles par "[...]" pour raccourcir la citation. | ||
|
||
Renvoie uniquement le résultat en JSON, **sans aucun commentaire ni conclusion**. | ||
|
||
# Comment choisir les extraits à vérifier | ||
|
||
Pour être pertinente, une citation doit : | ||
- être vérifiable grâce à l'exposition de faits | ||
- faire mention d'une source ou d'un contenu que l'on peut vérifier | ||
- présenter une information unique (découpe les citations qui présentent plusieurs éléments) | ||
Et remplir au moins un des critères suivants : | ||
- présenter des éléments incomplets ou approximatifs | ||
- présenter un argument fallacieux, trompeur ou mensonger | ||
- présenter des informations intéressantes à vérifier | ||
|
||
Ne méritent pas de travail de vérification : | ||
- les évidences comme "Le ciel est bleu !" | ||
- les anecdotes personnelles (ex: "ça a changé ma vie") | ||
- les figures de style et l'humour (comme les hyperboles, les métaphores, etc) | ||
- les erreurs mineures | ||
- les opinions personnelles ("j'aime ça") | ||
|
||
# Exemple | ||
|
||
## Input | ||
|
||
```json | ||
{ | ||
"video": { | ||
"title": "Thinkerview - La diagonale du vide en France" | ||
}, | ||
"captions": [ | ||
{ "start": 10, "text": "Cette mesure sociale a été un désastre de la pensée ça ne m'évoque que du dégoût elle n'a fait que créer une augmentation du chômage et a provoqué de nombreuses critiques de l'UE c'était pour moi une pure folie" }, | ||
{ "start": 85, "text": "mais parlons de la diagonnale du vite il y a d'autres zones en France qui sont très peuplées elle s'affiche ici et juste là et oui je sais effectivement je pense que je peux tenter une" }, | ||
{ "start": 89, "text": "reconversion à devenir présentateur météo" }, | ||
{ "start": 94, "text": "dans les zones que vous voyez ici on compte seulement 6,5% de la population française métropolitaine pourtant et bien ces espaces" }, | ||
{ "start": 102, "text": "représentent 42% du territoire national mais alors pourquoi la diagonale du vide comme" }, | ||
{ "start": 110, "text": "nom? Ça a changé ma vie quand je l'ai découvert" } | ||
] | ||
} | ||
``` | ||
|
||
## Output | ||
|
||
```json | ||
{ | ||
"statements": [ | ||
{ "time": 10, "text": "Cette mesure sociale [...] n'a fait que créer une augmentation du chômage" }, | ||
{ "time": 10, "text": "Cette mesure sociale [...] a provoqué de nombreuses critiques de l'UE" }, | ||
{ "time": 94, "text": "ici on compte seulement 6,5% de la population française métropolitaine" }, | ||
{ "time": 94, "text": "ces espaces représentent 42% du territoire national" } | ||
] | ||
} | ||
``` |
11 changes: 11 additions & 0 deletions
11
apps/cf/lib/llms/templates/statements_extractor_user_prompt.eex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
```json
{
  "video": {
    "title": <%= Jason.encode!(video.title) %><%# JSON-encoded (quotes included) so special characters in the title cannot break the payload %>
  },
  "captions": <%= captions |> Enum.map(fn caption -> %{
    start: floor(caption["start"]),
    text: String.trim(caption["text"])
  } end) |> Jason.encode! %>
}
```
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
apps/cf_graphql/lib/schema/middleware/require_reputation.ex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
defmodule CF.Graphql.Schema.Middleware.RequireReputation do
  @moduledoc """
  A middleware to ensure the user has a certain reputation.

  The middleware argument is the minimum reputation required. The resolution
  is left untouched when the user's reputation is at least that value (or is
  not set); otherwise an error result is put on the resolution.
  """

  @behaviour Absinthe.Middleware

  @doc false
  @impl true
  def call(resolution, reputation) do
    user = resolution.context[:user]

    cond do
      # No authenticated user in the context
      is_nil(user) ->
        Absinthe.Resolution.put_result(resolution, {:error, "unauthorized"})

      # The `&&` skips the comparison when the user has no reputation value
      user.reputation && user.reputation < reputation ->
        Absinthe.Resolution.put_result(
          resolution,
          {:error,
           %{
             code: "unauthorized",
             message: "You do not have the required reputation to perform this action.",
             details: %{
               user_reputation: user.reputation,
               required_reputation: reputation
             }
           }}
        )

      true ->
        resolution
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters