Skip to content

Commit

Permalink
ai search works again
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Sep 14, 2024
1 parent 06379c9 commit c3f018c
Show file tree
Hide file tree
Showing 17 changed files with 143 additions and 105 deletions.
13 changes: 1 addition & 12 deletions core/config/runtime.exs
Original file line number Diff line number Diff line change
Expand Up @@ -122,18 +122,7 @@ config :canary, :openai_api_base, System.get_env("OPENAI_API_BASE")
config :canary, :openai_api_key, System.get_env("OPENAI_API_KEY")

config :canary, :text_embedding_model, System.get_env("TEXT_EMBEDDING_MODEL")

config :canary,
:chat_completion_model_understanding,
System.get_env("CHAT_COMPLETION_MODEL_UNDERSTANDING")

config :canary,
:chat_completion_model_response,
System.get_env("CHAT_COMPLETION_MODEL_RESPONSE")

config :canary,
:chat_completion_model_background,
System.get_env("CHAT_COMPLETION_MODEL_BACKGROUND")
config :canary, :chat_completion_model, System.get_env("CHAT_COMPLETION_MODEL")

if config_env() != :test do
if System.get_env("GITHUB_CLIENT_ID") && System.get_env("GITHUB_CLIENT_SECRET") do
Expand Down
7 changes: 7 additions & 0 deletions core/lib/canary/crawler.ex
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# TODO: to support multi-start-urls, we need to provide store PID from the outside
# Not sure how to handle "Sitemap" and "Fallback" at the same time

# One clever way of doing it is to pass Store to the Sitemap-based thing? Do not do duplicates.
# To support the forum use-case, we need a lot more config. (split sitemap <> start-urls..?)
# Do not over-complicate it for now

defmodule Canary.Crawler do
@callback run(String.t(), opts :: keyword()) :: {:ok, map()} | {:error, any()}
@modules [Canary.Crawler.Sitemap, Canary.Crawler.Fallback]
Expand Down
8 changes: 5 additions & 3 deletions core/lib/canary/index/index.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defmodule Canary.Index do
alias Canary.Sources.Source
alias Canary.Sources.Webpage
alias Canary.Sources.GithubIssue
alias Canary.Sources.GithubDiscussion
Expand Down Expand Up @@ -66,14 +67,14 @@ defmodule Canary.Index do
Client.delete_document(source_type, id)
end

def search(sources, query, opts \\ []) do
def search(sources, queries, opts \\ []) when is_list(queries) do
tags = opts[:tags]
embedding = opts[:embedding]
embedding_alpha = opts[:embedding_alpha] || 0.3

args =
sources
|> Enum.map(fn %Canary.Sources.Source{id: source_id, config: %Ash.Union{type: type}} ->
for(source <- sources, query <- queries, do: {source, query})
|> Enum.map(fn {%Source{id: source_id, config: %Ash.Union{type: type}}, query} ->
filter_by =
[
"source_id:=[#{source_id}]",
Expand Down Expand Up @@ -131,6 +132,7 @@ defmodule Canary.Index do
excerpt: hit["highlight"]["content"]["snippet"] || hit["document"]["content"]
}
end)
|> Enum.uniq_by(& &1.id)

%{
source_id: hits |> Enum.at(0) |> Map.get(:source_id),
Expand Down
2 changes: 1 addition & 1 deletion core/lib/canary/interactions/responder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ defmodule Canary.Interactions.Responder.Default do
require Ash.Query

def run(query, pattern, %{account: account, sources: sources}, handle_delta) do
model = Application.fetch_env!(:canary, :chat_completion_model_response)
# Atoms are case-sensitive: the key configured in runtime.exs is the lowercase
# :chat_completion_model, so fetching :CHAT_COMPLETION_MODEL would always raise.
model = Application.fetch_env!(:canary, :chat_completion_model)
source = sources |> Enum.at(0)
{:ok, %{search: docs}} = Canary.Searcher.run(source, query)

Expand Down
31 changes: 0 additions & 31 deletions core/lib/canary/overview.ex

This file was deleted.

2 changes: 1 addition & 1 deletion core/lib/canary/prompts/prompt.ex
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ defmodule Canary.Prompt do
|> String.trim()
end

def format("understander_user", %{query: _, keywords: _} = inputs) do
# Renders the "understander_user" prompt template.
# `inputs` must contain the :query and :sources keys; the whole map is handed
# to EEx as assigns and the rendered text is returned with surrounding
# whitespace removed.
def format("understander_user", %{query: _, sources: _} = inputs) do
  rendered = EEx.eval_string(@understander_user_prompt, assigns: inputs)
  String.trim(rendered)
end
1 change: 1 addition & 0 deletions core/lib/canary/prompts/understander_assistant.eex
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<query><%= Enum.join(@queries, ",") %></query>
10 changes: 3 additions & 7 deletions core/lib/canary/prompts/understander_system.eex
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
You are a world class technical support engineer.
Your job is to analyze the user's query and return a structured response like below:

<analysis>
<keywords>KEYWORD_1,KEYWORD_2,KEYWORD_3</keywords>
</analysis>
<query>KEYWORD_1,KEYWORD_2,KEYWORD_3</query>

IMPORTANT NOTES:
- <keywords></keywords> should contain comma separated list of keywords. MAX 3 keywords are allowed.
- Each "keyword" must be a single word. It will be used to run keyword based search. User '#{@keywords_section}' section for inspiration.
- <query></query> should contain comma separated list of keywords. MAX 3 keywords are allowed.
- Each "keyword" must be a single word. It will be used to run keyword based search.

Do not include any other text, just respond with the XML-like format that I provided.
If user's query is totally nonsense, just return <analysis></analysis>.
14 changes: 10 additions & 4 deletions core/lib/canary/prompts/understander_user.eex
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
## Keywords extracted from documents
Here, I provide titles and some messy keywords extracted from documents.

<%= @keywords %>
<%= for source <- @sources do %>
### <%= source.name %>
#### Titles
<%= source.titles |> Enum.join("\n") %>
#### Keywords
<%= source.keywords |> Enum.join(", ") %>
<% end %>

## User query
<%= @query %>
Based on the above information, come up with plausible keywords to fulfill the user's query below.
Query: "<%= @query %>"
50 changes: 24 additions & 26 deletions core/lib/canary/query/understander.ex
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
defmodule Canary.Query.UnderstanderResult do
  @moduledoc """
  Plain data container pairing a query string with a list of keywords.
  """

  defstruct query: nil, keywords: nil

  @type t :: %__MODULE__{
          query: String.t(),
          keywords: [String.t()]
        }
end

defmodule Canary.Query.Understander do
@callback run(String.t(), String.t()) ::
{:ok, Canary.Query.UnderstanderResult.t()} | {:error, any()}
@callback run(list(any()), String.t()) :: {:ok, list(String.t())} | {:error, any()}

def run(query, keywords), do: impl().run(query, keywords)
def run(sources, query), do: impl().run(sources, query)
defp impl(), do: Canary.Query.Understander.LLM
end

defmodule Canary.Query.Understander.LLM do
@behaviour Canary.Query.Understander

def run(query, keywords) do
chat_model = Application.fetch_env!(:canary, :chat_completion_model_understanding)
alias Canary.Sources.Source
alias Canary.Sources.SourceOverview

def run(sources, query) do
chat_model = Application.fetch_env!(:canary, :chat_completion_model)

overviews =
sources
|> Enum.map(fn %Source{name: name, overview: %SourceOverview{} = overview} ->
%{name: name, titles: overview.titles, keywords: overview.keywords}
end)

messages = [
%{
Expand All @@ -24,30 +27,25 @@ defmodule Canary.Query.Understander.LLM do
},
%{
role: "user",
content: Canary.Prompt.format("understander_user", %{query: query, keywords: keywords})
content: Canary.Prompt.format("understander_user", %{query: query, sources: overviews})
}
]

case Canary.AI.chat(%{model: chat_model, messages: messages}, timeout: 2_000) do
{:ok, analysis} -> {:ok, parse(query, analysis)}
{:ok, completion} -> {:ok, parse(completion)}
error -> error
end
end

defp parse(original_query, completion) do
keywords =
~r/<keywords>(.*?)<\/keywords>/s
|> Regex.scan(completion, capture: :all_but_first)
|> Enum.flat_map(fn [keywords] ->
keywords |> String.split(",") |> Enum.map(&String.trim/1)
end)

query =
~r/<query>(.*?)<\/query>/s
|> Regex.scan(completion, capture: :all_but_first)
|> Enum.map(fn [query] -> String.trim(query) end)
|> Enum.at(0, nil)
# Extracts the comma separated keywords from the model's completion.
#
# The completion is expected to contain `<query>k1,k2,k3</query>`. Every entry
# is trimmed, and blank entries (as produced by `<query></query>` or `a,,b`)
# are dropped so that no empty search term is handed to the caller.
# Returns `[]` when the completion has no `<query>` tag.
defp parse(completion) do
  case Regex.run(~r/<query>(.*?)<\/query>/s, completion) do
    [_, match] ->
      match
      |> String.split(",")
      |> Enum.map(&String.trim/1)
      |> Enum.reject(&(&1 == ""))

    nil ->
      []
  end
end
end
12 changes: 10 additions & 2 deletions core/lib/canary/scraper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,16 @@ defmodule Canary.Scraper do
|> Enum.map(&String.trim/1)
end

defp to_text(node) when is_binary(node), do: String.trim(node)
defp to_text(node), do: Floki.text(node)
# Converts a scraped node to cleaned text. Binaries are used as-is, anything
# else is first flattened with Floki.text/1; the result always goes through trim/1.
defp to_text(node) do
  text = if is_binary(node), do: node, else: Floki.text(node)
  trim(text)
end

# Keeps only the codepoints in 0..127 and then strips leading/trailing
# whitespace. Note that this discards every non-ASCII character in `s`.
defp trim(s) do
  ascii_only = for codepoint <- String.to_charlist(s), codepoint in 0..127, do: codepoint

  ascii_only
  |> List.to_string()
  |> String.trim()
end

# Applies `fun` to the first element of `list` and returns the updated list.
# An empty list is treated as if it held a single blank %Item{}, so the result
# is a one-element list. Matching on `[]` avoids the O(n) `length/1` call the
# guard would otherwise perform on every invocation.
defp update_first([], fun), do: [fun.(%Item{title: "", content: ""})]
defp update_first(list, fun), do: List.update_at(list, 0, fun)
Expand Down
35 changes: 21 additions & 14 deletions core/lib/canary/searcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ defmodule Canary.Searcher.Default do

def run(sources, query) do
if ai?(query) do
Appsignal.instrument("normal_search", fn ->
normal_search(sources, query)
Appsignal.instrument("ai_search", fn ->
ai_search(sources, query)
end)
else
Appsignal.instrument("normal_search", fn ->
Expand All @@ -54,20 +54,27 @@ defmodule Canary.Searcher.Default do
|> Enum.count() > 2
end

defp normal_search(sources, query) do
{:ok, results} = Canary.Index.search(sources, query)
# Searches using keywords derived from the user's query by the understander.
#
# Both steps are matched with `<-` so that an `{:error, reason}` from either
# the understander or the index is returned to the caller as-is; a plain `=`
# match would raise MatchError on such a result instead.
# Returns `{:ok, results}` with one entry per source (see transform/2).
defp ai_search(sources, query) do
  with {:ok, queries} <- Canary.Query.Understander.run(sources, query),
       {:ok, results} <- Canary.Index.search(sources, queries) do
    {:ok, transform(sources, results)}
  end
end

ret =
results
|> Enum.map(fn %{source_id: source_id, hits: hits} ->
%Canary.Sources.Source{
name: name,
config: %Ash.Union{type: type}
} = sources |> Enum.find(&(&1.id == source_id))
# Searches the index with the raw user query as the only search term.
# Crashes with MatchError if the index does not answer with `{:ok, _}`.
defp normal_search(sources, query) do
  {:ok, found} = sources |> Canary.Index.search([query])
  {:ok, sources |> transform(found)}
end

%{name: name, type: type, hits: hits}
end)
# Reshapes per-source search results into `%{name, type, hits}` maps.
# Each result's `source_id` is looked up in `sources`; the match below raises
# if no source with that id is present.
defp transform(sources, search_results) do
  Enum.map(search_results, fn %{source_id: source_id, hits: hits} ->
    matched = Enum.find(sources, fn source -> source.id == source_id end)
    %Canary.Sources.Source{name: name, config: %Ash.Union{type: type}} = matched

    %{name: name, type: type, hits: hits}
  end)
end
end
34 changes: 34 additions & 0 deletions core/lib/canary/sources/source.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ defmodule Canary.Sources.Source do
domain: Canary.Sources,
data_layer: AshPostgres.DataLayer

require Ash.Query

attributes do
uuid_primary_key :id
create_timestamp :created_at
Expand Down Expand Up @@ -46,6 +48,38 @@ defmodule Canary.Sources.Source do
accept [:name, :config, :overview]
end

update :update_overview do
  # NOTE(review): presumably required because the change below is a plain
  # function that reads from the data layer - confirm against Ash docs.
  require_atomic? false

  # Rebuilds the :overview attribute from the documents whose source_id
  # matches the record being updated.
  change fn changeset, _ ->
    id = Ash.Changeset.get_data(changeset, :id)

    documents =
      Canary.Sources.Document
      |> Ash.Query.filter(source_id == ^id)
      |> Ash.read!()

    # Unwrap every chunk union of every document into one flat list.
    chunks =
      Enum.flat_map(documents, fn %Canary.Sources.Document{chunks: doc_chunks} ->
        Enum.map(doc_chunks, fn %Ash.Union{value: chunk} -> chunk end)
      end)

    titles = Enum.map(chunks, fn %{title: title} -> title end)

    # Second argument grows with the number of documents (20 per document);
    # presumably the maximum number of keywords to extract - TODO confirm.
    keyword_limit = length(documents) * 20

    keywords =
      chunks
      |> Enum.map_join("\n", fn %{content: content} -> content end)
      |> Canary.Native.extract_keywords(keyword_limit)

    Ash.Changeset.change_attribute(
      changeset,
      :overview,
      %Canary.Sources.SourceOverview{titles: titles, keywords: keywords}
    )
  end
end

destroy :destroy do
primary? true

Expand Down
4 changes: 4 additions & 0 deletions core/lib/canary/workers/github_discussion_processor.ex
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ defmodule Canary.Workers.GithubDiscussionProcessor do
message: "github discussion fetcher ended"
})

source
|> Ash.Changeset.for_update(:update_overview, %{})
|> Ash.update()

:ok
end
end
5 changes: 5 additions & 0 deletions core/lib/canary/workers/github_issue_processor.ex
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ defmodule Canary.Workers.GithubIssueProcessor do
with {:ok, incomings} <- GithubIssue.Fetcher.run(source),
:ok <- GithubIssue.Syncer.run(source_id, incomings) do
notify_event_end(source_id)

source
|> Ash.Changeset.for_update(:update_overview, %{})
|> Ash.update()

:ok
end
end
Expand Down
4 changes: 4 additions & 0 deletions core/lib/canary/workers/webpage_processor.ex
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ defmodule Canary.Workers.WebpageProcessor do
message: "webpage fetcher ended"
})

source
|> Ash.Changeset.for_update(:update_overview, %{})
|> Ash.update()

:ok
end
end
Expand Down
Loading

0 comments on commit c3f018c

Please sign in to comment.