From 544768525114f481af2e3f18b6b9e00d821e818b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Sat, 27 Jul 2024 12:12:03 +0200 Subject: [PATCH 1/6] feat: add modules for paperless tags, document types, and correspondents Introduced new modules for handling paperless tags, document types, and correspondents, including basic structures and functions for creating and retrieving data. Also added utility functions for response parsing and error handling. --- src/error.rs | 33 +++++++++++++++++++++++ src/main.rs | 5 ++++ src/paperless_correspondents.rs | 25 ++++++++++++++++++ src/paperless_documenttypes.rs | 46 +++++++++++++++++++++++++++++++++ src/paperless_tags.rs | 26 +++++++++++++++++++ src/util.rs | 39 ++++++++++++++++++++++++++++ 6 files changed, 174 insertions(+) create mode 100644 src/error.rs create mode 100644 src/paperless_correspondents.rs create mode 100644 src/paperless_documenttypes.rs create mode 100644 src/paperless_tags.rs create mode 100644 src/util.rs diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..b900cfa --- /dev/null +++ b/src/error.rs @@ -0,0 +1,33 @@ +use std::fmt; + +#[derive(Debug)] +pub enum ResponseError { + Io(std::io::Error), + ParseBody(std::num::ParseIntError), + RequestError(std::io::Error), + Other(String), +} + +// Step 2: Implement std::fmt::Display +impl fmt::Display for ResponseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + ResponseError::Io(ref err) => write!(f, "IO error: {}", err), + ResponseError::ParseBody(ref err) => write!(f, "Parse error: {}", err), + ResponseError::RequestError(ref err) => write!(f, "Parse error: {}", err), + ResponseError::Other(ref err) => write!(f, "Other error: {}", err), + } + } +} + +impl std::error::Error for ResponseError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + ResponseError::Io(ref err) => Some(err), + ResponseError::ParseBody(ref err) => Some(err), + 
ResponseError::RequestError(ref err) => Some(err), + ResponseError::Other(_) => None, + } + } +} + diff --git a/src/main.rs b/src/main.rs index c45a7b9..529f363 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,11 @@ mod llm_api; mod paperless; mod logger; +mod paperless_tags; +mod paperless_documenttypes; +mod paperless_correspondents; +mod util; +mod error; use ollama_rs::{ Ollama, diff --git a/src/paperless_correspondents.rs b/src/paperless_correspondents.rs new file mode 100644 index 0000000..1ccf415 --- /dev/null +++ b/src/paperless_correspondents.rs @@ -0,0 +1,25 @@ +use reqwest::Client; +use crate::paperless::CreateField; +struct DocumentType { + id: u32, + slug: String, + name: String, + matching_algorithm: u8 +} +pub fn create_correspondent( + correspondent_name: &str, + client: &Client, + base_url: &str, +) { + +} + + +pub fn get_correspondents( + client: &Client, + base_url: &str, +) { + + let url = format!("{}/api/correspondents/", base_url); + let res= client.get(url).send(); +} diff --git a/src/paperless_documenttypes.rs b/src/paperless_documenttypes.rs new file mode 100644 index 0000000..3d2bb26 --- /dev/null +++ b/src/paperless_documenttypes.rs @@ -0,0 +1,46 @@ +use reqwest::Client; + + + +struct DocumentType { + id: u32, + slug: String, + name: String, + matching_algorithm: u8 +} +pub fn create_document_type( + document_types: &str, + client: &Client, + base_url: &str, +) { + + + +} + + +pub fn get_document_types( + client: &Client, + base_url: &str, +) { + + let url = format!("{}/api/document_types/", base_url); + let res= client.get(url).send(); + let body = match { + Ok(data) => { + + }, + Err(e) => { + slog_scope::error!("Error getting document types: {}", {}) + } + }; +} + + + +pub fn determine_if_type_exists( + client: &Client, + base_url: &str, +) { + +} \ No newline at end of file diff --git a/src/paperless_tags.rs b/src/paperless_tags.rs new file mode 100644 index 0000000..e46ab35 --- /dev/null +++ b/src/paperless_tags.rs @@ -0,0 +1,26 
@@ +use reqwest::Client; + + +struct Tags { + id: u32, + slug: String, + name: String, + matching_algorithm: u8 +} +pub fn create_tag( + correspondent_name: &str, + client: &Client, + base_url: &str, +) { + +} + + +pub fn get_tags( + client: &Client, + base_url: &str, +) { + + let url = format!("{}/api/tags/", base_url); + let res= client.get(url).send(); +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..cbe732c --- /dev/null +++ b/src/util.rs @@ -0,0 +1,39 @@ +use reqwest::Response; +use serde::Deserialize; +use crate::error::ResponseError; +use crate::{Response as PaperlessResponse}; + + +pub async fn parse_response(response: Response) -> Result where T: Deserialize<'static> { + + let response_result = response.error_for_status(); + match response_result { + Ok(data) => { + let body = data.text().await?; + slog_scope::trace!("Response from server while fetching documents: {}", body); + + let json = body.trim_start_matches("Document content: "); + + let data: Result, _> = serde_json::from_str(json); + match data { + Ok(data) => { + slog_scope::info!("Successfully retrieved {} Documents", data.results.len()); + Ok(data.results) + } + Err(e) => { + let column = e.column(); + let start = (column as isize - 30).max(0) as usize; + let end = (column + 30).min(json.len()); + slog_scope::error!("Error while creating json of document response from paperless {}", e); + slog_scope::error!("Error at column {}: {}", column, &json[start..end]); + slog_scope::trace!("Error occured in json {}", &json); + Err(e.into()) // Remove the semicolon here + } + } + } + Err(e) => { + slog_scope::error!("Error while fetching documents from paperless: {}",e); + Err(e.into()) + } + } +} \ No newline at end of file From 6396551917d9fee7c644dc81b71157898740d78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Thu, 19 Sep 2024 21:46:45 +0200 Subject: [PATCH 2/6] feat: refactor file structure and improve LLM integration Refactor the structure of 
Paperless-related modules by merging functionalities and removing redundant files. Improved LLM response handling by separating the prompt construction and response data extraction into distinct functions, leading to more modular and maintainable code. --- src/llm_api.rs | 4 +- src/main.rs | 56 +++++----- src/paperless.rs | 184 +++++++++++++++++++++++++++++++- src/paperless_correspondents.rs | 25 ----- src/paperless_defaultfields.rs | 66 ++++++++++++ src/paperless_documenttypes.rs | 46 -------- src/paperless_tags.rs | 26 ----- 7 files changed, 278 insertions(+), 129 deletions(-) delete mode 100644 src/paperless_correspondents.rs create mode 100644 src/paperless_defaultfields.rs delete mode 100644 src/paperless_documenttypes.rs delete mode 100644 src/paperless_tags.rs diff --git a/src/llm_api.rs b/src/llm_api.rs index d76d9c4..09dfa99 100644 --- a/src/llm_api.rs +++ b/src/llm_api.rs @@ -6,10 +6,8 @@ use crate::Document; pub async fn generate_response( ollama: &Ollama, model: &String, - prompt_base: &String, - document: &Document, + prompt: String, ) -> std::result::Result> { - let prompt = format!("{} {}", document.content, prompt_base); let res = ollama .generate(GenerationRequest::new(model.clone(), prompt)) .await; diff --git a/src/main.rs b/src/main.rs index 529f363..6494eba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,7 @@ mod llm_api; mod paperless; mod logger; -mod paperless_tags; -mod paperless_documenttypes; -mod paperless_correspondents; +mod paperless_defaultfields; mod util; mod error; @@ -129,7 +127,7 @@ async fn process_documents(client: &Client, ollama: &Ollama, model: &str, base_u Analyze the document to find the values for these fields and format the response as a \ JSON object. Use the most likely answer for each field. \ The response should contain only JSON data where the key and values are all in simple string \ - format(no nested object) for direct parsing by another program. 
So now additional text or \ + format(no nested object) for direct parsing by another program. So no additional text or \ explanation, no introtext, the answer should start and end with curly brackets \ delimiting the json object ".to_string() }; @@ -166,38 +164,44 @@ async fn process_documents(client: &Client, ollama: &Ollama, model: &str, base_u } async fn process_documents_batch(documents: &Vec, ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, mode: Mode) -> Result<(), Box> { + Ok(for document in documents { slog_scope::trace!("Document Content: {}", document.content); slog_scope::info!("Generate Response with LLM {}", "model"); slog_scope::debug!("with Prompt: {}", prompt_base); - match generate_response(ollama, &model.to_string(), &prompt_base.to_string(), &document).await { - Ok(res) => { - // Log the response from the generate_response call - slog_scope::debug!("LLM Response: {}", res.response); - - match extract_json_object(&res.response) { - Ok(json_str) => { - // Log successful JSON extraction - slog_scope::debug!("Extracted JSON Object: {}", json_str); - - match serde_json::from_str(&json_str) { - Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await?, - Err(e) => { - slog_scope::error!("Error parsing llm response json {}", e.to_string()); - slog_scope::debug!("JSON String was: {}", &json_str); - } + generate_response_and_extract_data(ollama, &model, &prompt_base, client, &fields, base_url, mode, &document).await; + }) +} + +async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, mode: Mode, document: &Document) { + let prompt = format!("{} {}", prompt_base, document.content); + + match generate_response(ollama, &model.to_string(), prompt).await { + Ok(res) => { + // Log the response from the generate_response call + slog_scope::debug!("LLM Response: {}", res.response); + + match 
extract_json_object(&res.response) { + Ok(json_str) => { + // Log successful JSON extraction + slog_scope::debug!("Extracted JSON Object: {}", json_str); + + match serde_json::from_str(&json_str) { + Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await?, + Err(e) => { + slog_scope::error!("Error parsing llm response json {}", e.to_string()); + slog_scope::debug!("JSON String was: {}", &json_str); } } - Err(e) => slog_scope::error!("{}", e), } - } - Err(e) => { - slog_scope::error!("Error generating llm response: {}", e); - continue; + Err(e) => slog_scope::error!("{}", e), } } - }) + Err(e) => { + slog_scope::error!("Error generating llm response: {}", e); + } + } } #[tokio::main] diff --git a/src/paperless.rs b/src/paperless.rs index 4fc919a..161523e 100644 --- a/src/paperless.rs +++ b/src/paperless.rs @@ -1,11 +1,38 @@ use std::collections::HashMap; use std::fmt; +use std::fmt::Debug; use reqwest::Client; -use serde::de::StdError; +use serde::de::{DeserializeOwned, StdError}; use serde_json::{Map, Value}; use crate::{CustomField, Document, Field, Mode, Response}; use serde::{Deserialize, Serialize}; +#[derive(Clone, Copy)] +pub enum PaperlessDefaultFieldType { + Tag, + DocumentType, + Correspondent, +} + +impl PaperlessDefaultFieldType { + fn to_string(self) -> &'static str { + match self { + PaperlessDefaultFieldType::Tag => "tags", + PaperlessDefaultFieldType::DocumentType => "document_types", + PaperlessDefaultFieldType::Correspondent => "correspondents", + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +struct DefaultField { + #[serde(skip_serializing_if = "Option::is_none")] // Skip `id` if it's None + id: Option, + slug: String, + name: String, + matching_algorithm: u8, +} + pub async fn get_data_from_paperless( client: &Client, url: &str, @@ -34,7 +61,7 @@ pub async fn get_data_from_paperless( //let error_part = value.pointer("/results/0").unwrap(); //println!("Error part: {}", error_part); // Parse the 
JSON string into the Response struct - return parse_document_response(json); + parse_document_response(json) } Err(e) => { slog_scope::error!("Error while fetching documents from paperless: {}",e); @@ -68,7 +95,7 @@ pub async fn get_next_data_from_paperless(client: &Client, //let error_part = value.pointer("/results/0").unwrap(); //println!("Error part: {}", error_part); // Parse the JSON string into the Response struct - return parse_document_response(json); + parse_document_response(json) } Err(e) => { slog_scope::error!("Error while fetching documents from paperless: {}",e); @@ -139,6 +166,51 @@ pub async fn query_custom_fields( } } +pub async fn get_default_fields( + client: &Client, + base_url: &str, + endpoint: PaperlessDefaultFieldType, +) -> Result, Box> +where + T: DeserializeOwned + Debug, +{ + slog_scope::info!("Fetching custom fields from paperless at {}", base_url); + let res = client + .get(format!("{}/api/{}/", base_url, endpoint.to_string())) + .send() + .await?; + + let response_result = res.error_for_status(); + match response_result { + Ok(data) => { + let body = data.text().await?; + slog_scope::debug!("Response from server while fetching documents: {}", body); + + // Remove the "Field: " prefix if necessary + let json = body.trim_start_matches("Field: "); + let data: Result, _> = serde_json::from_str(json); + match data { + Ok(data) => { + slog_scope::info!("{}: {:?}", endpoint.to_string(), data.results); + Ok(data.results) + } + Err(e) => { + let column = e.column(); + let start = (column as isize - 30).max(0) as usize; + let end = (column + 30).min(json.len()); + slog_scope::error!("Error occurred parsing custom fields: {}", e); + slog_scope::error!("Error at column {}: {}", column, &json[start..end]); + slog_scope::debug!("Error occurred in json {}", &json); + Err(e.into()) + } + } + } + Err(e) => { + slog_scope::error!("Error retrieving custom fields: {}", e); + Err(e.into()) + } + } +} pub async fn update_document_fields( client: &Client, 
document_id: u32, @@ -230,6 +302,77 @@ pub async fn update_document_fields( } } +/// This function update the default fields like tags, correspondents and document_types in paperless +/// it is checked if a field exists on the server and if not, it is created +/// +pub async fn update_document_default_fields( + client: &Client, + document_id: u32, + fields: &Vec, + data: Vec, + base_url: &str, + endpoint: PaperlessDefaultFieldType, + mode: Mode, +) -> Result<(), Box> { + let mut default_field_ids = Vec::new(); + + for value in data { + + if let Some(field) = fields.iter().find(|&f| f.name == *value) { + let default_field_id = field.id; + default_field_ids.push(default_field_id); + } else { + if matches!(mode, Mode::Create) { + slog_scope::info!("Creating {}: {}", endpoint.to_string(), value); + let create_field = DefaultField { + id: None, + name: value.clone(), + slug: value.clone(), + matching_algorithm: 6, + }; + match create_default_field(client, &create_field, base_url, endpoint).await + { + Ok(new_field) => { + default_field_ids.push(new_field.id) + } + Err(e) => { + slog_scope::error!("Error: {} creating custom field: {}, skipping...",e, value) + } + } + } + } + } + + let mut payload = serde_json::Map::new(); + payload.insert(endpoint.to_string().to_string(), serde_json::json!(default_field_ids)); + + if payload.is_empty() { + slog_scope::warn!("{}", "payload is empty, not updating fields"); + return Err(Box::new(fmt::Error::default())); // Use a standard library error type like fmt::Error. 
+ } + let url = format!("{}/api/documents/{}/", base_url, document_id); + slog_scope::info!("Updating document with ID: {}", document_id); + slog_scope::debug!("Request Payload: {}", map_to_string(&payload)); + + for (key, value) in &payload { + slog_scope::debug!("{}: {}", key, value); + } + let res = client.patch(&url).json(&payload).send().await?; + let response_result = res.error_for_status(); + match response_result { + Ok(data) => { + let body = data.text().await?; + slog_scope::trace!("{}", body); + slog_scope::info!("Document with ID: {} successfully updated", document_id); + Ok(()) + } + Err(e) => { + slog_scope::error!("Error while updating document fields: {}", e); + Err(e.into()) + } + } +} + fn convert_field_to_custom_field(value: &Option, field: &Field) -> CustomField { let custom_field = CustomField { field: field.id.clone(), @@ -279,6 +422,41 @@ pub async fn create_custom_field( } } } +pub async fn create_default_field( + client: &Client, + field: &DefaultField, + base_url: &str, + endpoint: PaperlessDefaultFieldType, +) -> Result> { + // Define the URL for creating a custom field + let url = format!("{}/api/{}/", base_url, endpoint.to_string()); + + + // Send the request to create the custom field + let res = client.post(&url).json(&field).send().await?; + let response_result = res.error_for_status(); + match response_result { + Ok(data) => { + let body = data.text().await?; + slog_scope::trace!("{}", body); + let field: Result, _> = serde_json::from_str(&body); + match field { + Ok(field) => { + Ok(field.results[0].clone()) // TODO: improve + } + Err(e) => { + slog_scope::debug!("Creating field response: {}", body); + slog_scope::error!("Error parsing response from new field: {}", e); + Err(e.into()) + } + } + } + Err(e) => { + slog_scope::error!("Error creating custom field: {}", e); + Err(e.into()) + } + } +} fn map_to_string(map: &Map) -> String { map.iter() .map(|(key, value)| format!("{}: {}", key, value)) diff --git 
a/src/paperless_correspondents.rs b/src/paperless_correspondents.rs deleted file mode 100644 index 1ccf415..0000000 --- a/src/paperless_correspondents.rs +++ /dev/null @@ -1,25 +0,0 @@ -use reqwest::Client; -use crate::paperless::CreateField; -struct DocumentType { - id: u32, - slug: String, - name: String, - matching_algorithm: u8 -} -pub fn create_correspondent( - correspondent_name: &str, - client: &Client, - base_url: &str, -) { - -} - - -pub fn get_correspondents( - client: &Client, - base_url: &str, -) { - - let url = format!("{}/api/correspondents/", base_url); - let res= client.get(url).send(); -} diff --git a/src/paperless_defaultfields.rs b/src/paperless_defaultfields.rs new file mode 100644 index 0000000..c9fd148 --- /dev/null +++ b/src/paperless_defaultfields.rs @@ -0,0 +1,66 @@ +use ollama_rs::Ollama; +use reqwest::Client; +use serde::de::StdError; +use crate::{extract_json_object, Document, Field, Mode}; +use crate::llm_api::generate_response; +use crate::paperless::{get_default_fields, update_document_fields, PaperlessDefaultFieldType}; + +const ANSWER_INSTRUCTION: String = "The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. 
The document is:".to_string(); +async fn construct_document_type_prompt(client: &Client, base_url: &str) -> Result> { + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::DocumentType).await?; + let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); + Ok(base_prompt) +} + + +async fn construct_tag_prompt(client: &Client, base_url: &str) -> Result> { + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await?; + let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); + Ok(base_prompt) +} +async fn construct_correspondent_prompt(client: &Client, base_url: &str) -> Result> { + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Correspondent).await?; + let base_prompt = format!("Determine possible correspondents from this document from the following available correspondents: {:?}, if none of these fit the document, create a new one. The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. 
The document is: ", document_types); + Ok(base_prompt) +} + + + + + + +async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, mode: Mode, document: &Document) { + let prompt = format!("{} {}", prompt_base, document.content); + + match generate_response(ollama, &model.to_string(), prompt).await { + Ok(res) => { + // Log the response from the generate_response call + slog_scope::debug!("LLM Response: {}", res.response); + + match extract_json_object(&res.response) { + Ok(json_str) => { + // Log successful JSON extraction + slog_scope::debug!("Extracted JSON Object: {}", json_str); + + match serde_json::from_str(&json_str) { + Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await?, + Err(e) => { + slog_scope::error!("Error parsing llm response json {}", e.to_string()); + slog_scope::debug!("JSON String was: {}", &json_str); + } + } + } + Err(e) => slog_scope::error!("{}", e), + } + } + Err(e) => { + slog_scope::error!("Error generating llm response: {}", e); + } + } +} +pub fn determine_if_type_exists( + client: &Client, + base_url: &str, +) { + //TODO: +} \ No newline at end of file diff --git a/src/paperless_documenttypes.rs b/src/paperless_documenttypes.rs deleted file mode 100644 index 3d2bb26..0000000 --- a/src/paperless_documenttypes.rs +++ /dev/null @@ -1,46 +0,0 @@ -use reqwest::Client; - - - -struct DocumentType { - id: u32, - slug: String, - name: String, - matching_algorithm: u8 -} -pub fn create_document_type( - document_types: &str, - client: &Client, - base_url: &str, -) { - - - -} - - -pub fn get_document_types( - client: &Client, - base_url: &str, -) { - - let url = format!("{}/api/document_types/", base_url); - let res= client.get(url).send(); - let body = match { - Ok(data) => { - - }, - Err(e) => { - slog_scope::error!("Error getting document types: {}", {}) - } - }; -} - - - -pub fn determine_if_type_exists( - client: 
&Client, - base_url: &str, -) { - -} \ No newline at end of file diff --git a/src/paperless_tags.rs b/src/paperless_tags.rs deleted file mode 100644 index e46ab35..0000000 --- a/src/paperless_tags.rs +++ /dev/null @@ -1,26 +0,0 @@ -use reqwest::Client; - - -struct Tags { - id: u32, - slug: String, - name: String, - matching_algorithm: u8 -} -pub fn create_tag( - correspondent_name: &str, - client: &Client, - base_url: &str, -) { - -} - - -pub fn get_tags( - client: &Client, - base_url: &str, -) { - - let url = format!("{}/api/tags/", base_url); - let res= client.get(url).send(); -} From e1dd59f39bb510e1bc56b430bf041c7f063a891d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Fri, 4 Oct 2024 22:30:27 +0200 Subject: [PATCH 3/6] fix: error handling and flow of filling default fields --- src/main.rs | 2 +- src/paperless.rs | 25 ++++++-------- src/paperless_defaultfields.rs | 61 ++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6494eba..82f9aa5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -188,7 +188,7 @@ async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt slog_scope::debug!("Extracted JSON Object: {}", json_str); match serde_json::from_str(&json_str) { - Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await?, + Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await.unwrap_or_default(), //TODO: Fix unwrap Err(e) => { slog_scope::error!("Error parsing llm response json {}", e.to_string()); slog_scope::debug!("JSON String was: {}", &json_str); diff --git a/src/paperless.rs b/src/paperless.rs index 161523e..77fee1b 100644 --- a/src/paperless.rs +++ b/src/paperless.rs @@ -25,7 +25,7 @@ impl PaperlessDefaultFieldType { } #[derive(Serialize, Deserialize, Debug, Clone)] -struct DefaultField { +pub struct DefaultField { #[serde(skip_serializing_if = "Option::is_none")] // 
Skip `id` if it's None id: Option, slug: String, @@ -166,13 +166,11 @@ pub async fn query_custom_fields( } } -pub async fn get_default_fields( +pub async fn get_default_fields( client: &Client, base_url: &str, endpoint: PaperlessDefaultFieldType, -) -> Result, Box> -where - T: DeserializeOwned + Debug, +) -> Result, Box> { slog_scope::info!("Fetching custom fields from paperless at {}", base_url); let res = client @@ -188,7 +186,7 @@ where // Remove the "Field: " prefix if necessary let json = body.trim_start_matches("Field: "); - let data: Result, _> = serde_json::from_str(json); + let data: Result, _> = serde_json::from_str(json); match data { Ok(data) => { slog_scope::info!("{}: {:?}", endpoint.to_string(), data.results); @@ -313,7 +311,7 @@ pub async fn update_document_default_fields( base_url: &str, endpoint: PaperlessDefaultFieldType, mode: Mode, -) -> Result<(), Box> { +) -> Option> { let mut default_field_ids = Vec::new(); for value in data { @@ -348,7 +346,7 @@ pub async fn update_document_default_fields( if payload.is_empty() { slog_scope::warn!("{}", "payload is empty, not updating fields"); - return Err(Box::new(fmt::Error::default())); // Use a standard library error type like fmt::Error. 
+ return None } let url = format!("{}/api/documents/{}/", base_url, document_id); slog_scope::info!("Updating document with ID: {}", document_id); @@ -357,18 +355,17 @@ pub async fn update_document_default_fields( for (key, value) in &payload { slog_scope::debug!("{}: {}", key, value); } - let res = client.patch(&url).json(&payload).send().await?; - let response_result = res.error_for_status(); + let res = client.patch(&url).json(&payload).send().await; + let response_result = res; match response_result { Ok(data) => { - let body = data.text().await?; - slog_scope::trace!("{}", body); + let body = data.text().await; slog_scope::info!("Document with ID: {} successfully updated", document_id); - Ok(()) + None } Err(e) => { slog_scope::error!("Error while updating document fields: {}", e); - Err(e.into()) + Some(Box::new(e)) } } } diff --git a/src/paperless_defaultfields.rs b/src/paperless_defaultfields.rs index c9fd148..cf97e16 100644 --- a/src/paperless_defaultfields.rs +++ b/src/paperless_defaultfields.rs @@ -3,58 +3,69 @@ use reqwest::Client; use serde::de::StdError; use crate::{extract_json_object, Document, Field, Mode}; use crate::llm_api::generate_response; -use crate::paperless::{get_default_fields, update_document_fields, PaperlessDefaultFieldType}; +use crate::paperless::{get_default_fields, update_document_default_fields, update_document_fields, DefaultField, PaperlessDefaultFieldType}; const ANSWER_INSTRUCTION: String = "The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. 
The document is:".to_string(); async fn construct_document_type_prompt(client: &Client, base_url: &str) -> Result> { - let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::DocumentType).await?; + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::DocumentType).await; let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); Ok(base_prompt) } async fn construct_tag_prompt(client: &Client, base_url: &str) -> Result> { - let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await?; + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await; let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); Ok(base_prompt) } async fn construct_correspondent_prompt(client: &Client, base_url: &str) -> Result> { - let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Correspondent).await?; + let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Correspondent).await; let base_prompt = format!("Determine possible correspondents from this document from the following available correspondents: {:?}, if none of these fit the document, create a new one. The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. 
The document is: ", document_types); Ok(base_prompt) } +async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: Vec, base_url: &str, document: &Document, mode: Mode, field_type: PaperlessDefaultFieldType) -> Option> { + let prompt = match field_type { + PaperlessDefaultFieldType::Correspondent => construct_correspondent_prompt(client, base_url).await, + PaperlessDefaultFieldType::Tag => construct_tag_prompt(client, base_url).await, + PaperlessDefaultFieldType::DocumentType => construct_document_type_prompt(client, base_url).await, + }; + match prompt { + Ok(prompt) => { + match generate_response(ollama, &model.to_string(), prompt).await { + Ok(res) => { + // Log the response from the generate_response call + slog_scope::debug!("LLM Response: {}", res.response); + match extract_json_object(&res.response) { + Ok(json_str) => { + // Log successful JSON extraction + slog_scope::debug!("Extracted JSON Object: {}", json_str); - - -async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, mode: Mode, document: &Document) { - let prompt = format!("{} {}", prompt_base, document.content); - - match generate_response(ollama, &model.to_string(), prompt).await { - Ok(res) => { - // Log the response from the generate_response call - slog_scope::debug!("LLM Response: {}", res.response); - - match extract_json_object(&res.response) { - Ok(json_str) => { - // Log successful JSON extraction - slog_scope::debug!("Extracted JSON Object: {}", json_str); - - match serde_json::from_str(&json_str) { - Ok(json) => update_document_fields(client, document.id, &fields, &json, base_url, mode).await?, + match serde_json::from_str(&json_str) { + Ok(json) => update_document_default_fields(client, document.id, &fields, json, base_url, field_type, mode).await, + Err(e) => { + slog_scope::error!("Error parsing llm response json {}", e.to_string()); + 
slog_scope::debug!("JSON String was: {}", & json_str); + Some(Box::new(e)) + } + } + } Err(e) => { - slog_scope::error!("Error parsing llm response json {}", e.to_string()); - slog_scope::debug!("JSON String was: {}", &json_str); + slog_scope::error ! ("{}", e); + None } } } - Err(e) => slog_scope::error!("{}", e), + Err(e) => { + slog_scope::error ! ("Error generating llm response: {}", e); + None + } } } Err(e) => { - slog_scope::error!("Error generating llm response: {}", e); + Some(e) } } } From 42f43ca8564475f1c0074a0234255183e0683c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Sat, 5 Oct 2024 10:08:33 +0200 Subject: [PATCH 4/6] fix: change prompt to generate more accurate results --- src/main.rs | 10 ++++++++- src/paperless.rs | 4 ++-- src/paperless_defaultfields.rs | 13 ++++++------ src/util.rs | 39 ---------------------------------- 4 files changed, 18 insertions(+), 48 deletions(-) diff --git a/src/main.rs b/src/main.rs index 82f9aa5..3c0d3a8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,8 +19,9 @@ use serde::{Deserialize, Serialize}; use serde_json::{Value}; use std::env; use crate::llm_api::generate_response; -use crate::paperless::{get_data_from_paperless, get_next_data_from_paperless, query_custom_fields, update_document_fields}; +use crate::paperless::{get_data_from_paperless, get_default_fields, get_next_data_from_paperless, query_custom_fields, update_document_fields, PaperlessDefaultFieldType}; use substring::Substring; +use crate::paperless_defaultfields::extract_default_fields; #[derive(Serialize, Deserialize, Debug, Clone)] struct Document { @@ -171,6 +172,13 @@ async fn process_documents_batch(documents: &Vec, ollama: &Ollama, mod slog_scope::debug!("with Prompt: {}", prompt_base); generate_response_and_extract_data(ollama, &model, &prompt_base, client, &fields, base_url, mode, &document).await; + let default_fields = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await; + match default_fields { 
+ Ok(default_fields) => { + extract_default_fields(ollama, &model, &prompt_base, client, default_fields, base_url, &document, mode, PaperlessDefaultFieldType::Tag).await; + } + Err(e) => slog_scope::error!("Error while interacting with paperless: {}", e), + } }) } diff --git a/src/paperless.rs b/src/paperless.rs index 77fee1b..afe14d6 100644 --- a/src/paperless.rs +++ b/src/paperless.rs @@ -436,10 +436,10 @@ pub async fn create_default_field( Ok(data) => { let body = data.text().await?; slog_scope::trace!("{}", body); - let field: Result, _> = serde_json::from_str(&body); + let field: Result = serde_json::from_str(&body); match field { Ok(field) => { - Ok(field.results[0].clone()) // TODO: improve + Ok(field) // TODO: improve } Err(e) => { slog_scope::debug!("Creating field response: {}", body); diff --git a/src/paperless_defaultfields.rs b/src/paperless_defaultfields.rs index cf97e16..be3f659 100644 --- a/src/paperless_defaultfields.rs +++ b/src/paperless_defaultfields.rs @@ -5,27 +5,27 @@ use crate::{extract_json_object, Document, Field, Mode}; use crate::llm_api::generate_response; use crate::paperless::{get_default_fields, update_document_default_fields, update_document_fields, DefaultField, PaperlessDefaultFieldType}; -const ANSWER_INSTRUCTION: String = "The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. The document is:".to_string(); + const ANSWER_INSTRUCTION: &'static str = "The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. 
The document is: "; async fn construct_document_type_prompt(client: &Client, base_url: &str) -> Result> { let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::DocumentType).await; - let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); + let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one. ", document_types); Ok(base_prompt) } async fn construct_tag_prompt(client: &Client, base_url: &str) -> Result> { let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await; - let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one: ", document_types); + let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one. ", document_types); Ok(base_prompt) } async fn construct_correspondent_prompt(client: &Client, base_url: &str) -> Result> { let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::Correspondent).await; - let base_prompt = format!("Determine possible correspondents from this document from the following available correspondents: {:?}, if none of these fit the document, create a new one. The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. The document is: ", document_types); + let base_prompt = format!("Determine possible correspondents from this document from the following available correspondents: {:?}, if none of these fit the document, create a new one. The result should be a only a json array of string and nothing else. 
The answer should start and end with the square bracket. ", document_types); Ok(base_prompt) } -async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: Vec, base_url: &str, document: &Document, mode: Mode, field_type: PaperlessDefaultFieldType) -> Option> { +pub async fn extract_default_fields(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: Vec, base_url: &str, document: &Document, mode: Mode, field_type: PaperlessDefaultFieldType) -> Option> { let prompt = match field_type { PaperlessDefaultFieldType::Correspondent => construct_correspondent_prompt(client, base_url).await, PaperlessDefaultFieldType::Tag => construct_tag_prompt(client, base_url).await, @@ -33,7 +33,8 @@ async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt }; match prompt { Ok(prompt) => { - match generate_response(ollama, &model.to_string(), prompt).await { + let prompt_with_document = prompt + &*ANSWER_INSTRUCTION + &*document.content; + match generate_response(ollama, &model.to_string(), prompt_with_document).await { Ok(res) => { // Log the response from the generate_response call slog_scope::debug!("LLM Response: {}", res.response); diff --git a/src/util.rs b/src/util.rs index cbe732c..e69de29 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,39 +0,0 @@ -use reqwest::Response; -use serde::Deserialize; -use crate::error::ResponseError; -use crate::{Response as PaperlessResponse}; - - -pub async fn parse_response(response: Response) -> Result where T: Deserialize<'static> { - - let response_result = response.error_for_status(); - match response_result { - Ok(data) => { - let body = data.text().await?; - slog_scope::trace!("Response from server while fetching documents: {}", body); - - let json = body.trim_start_matches("Document content: "); - - let data: Result, _> = serde_json::from_str(json); - match data { - Ok(data) => { - slog_scope::info!("Successfully retrieved {} 
Documents", data.results.len()); - Ok(data.results) - } - Err(e) => { - let column = e.column(); - let start = (column as isize - 30).max(0) as usize; - let end = (column + 30).min(json.len()); - slog_scope::error!("Error while creating json of document response from paperless {}", e); - slog_scope::error!("Error at column {}: {}", column, &json[start..end]); - slog_scope::trace!("Error occured in json {}", &json); - Err(e.into()) // Remove the semicolon here - } - } - } - Err(e) => { - slog_scope::error!("Error while fetching documents from paperless: {}",e); - Err(e.into()) - } - } -} \ No newline at end of file From 2563c0c569812202e0e685ecb4e83cab16c207f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Tue, 22 Oct 2024 12:45:31 +0200 Subject: [PATCH 5/6] feat!: add support for creating default fields controlled by modes BREAKING-CHANGE: mode values have changed, value 0 now means NoAnalyze, prompting doclytics to do nothing for this field type; use value 1 for the previous behaviour instead. --- README.md | 29 +++++++++++++----------- src/llm_api.rs | 1 - src/main.rs | 41 ++++++++++++++++++++++++++++------ src/paperless.rs | 3 ++- src/paperless_defaultfields.rs | 2 +- src/util.rs | 3 +++ 6 files changed, 56 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 2fd1369..ebda439 100644 --- a/README.md +++ b/README.md @@ -46,19 +46,22 @@ With these prerequisites met, you are now ready to proceed with the installation The application requires setting environment variables for its configuration. 
Below is a table describing each environment variable, indicating whether it is required or optional, its default value (if any), and a brief description: -| Environment Variable | Required | Default Value | Description | -|--------------------------|---------|----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `PAPERLESS_TOKEN` | Yes | None | The authentication token for accessing the Paperless API. | -| `PAPERLESS_BASE_URL` | Yes | None | The base URL for the Paperless API. | -| `PAPERLESS_FILTER` | NO | "NOT tagged=true" | Filter string that filters the documents to be fetched from paperless | -| `LANGUAGE` | No | "EN" | Allow to use translated base prompts (Support: EN, DE) | -| `OLLAMA_HOST` | No | "localhost" | The hostname where the Ollama service is running. | -| `OLLAMA_PORT` | No | "11434" | The port on which the Ollama service is accessible. | -| `OLLAMA_SECURE_ENDPOINT` | No | "false" | Whether to use HTTPS (`true`) or HTTP (`false`) for Ollama. | -| `OLLAMA_MODEL` | No | "llama2:13b" | The specific Ollama model to be used for processing. | -| `BASE_PROMPT` | No | see [Example Prompt](example/example.prompt) | Prompt given to the model, for requesting metadata.
Should contain the custom fields in paperless that you want doclytics. | -| `LOG_LEVEL` | No | INFO | Log level | -| `MODE` | No | 0 | :warning: **Experimental**: Mode of operation.
0 = NoCreate (Doclytics does not create custom fields automatically in Paperless), 1 = Create (Doclytics automatically creates custom fields that do not exist in Paperless). All fields will be created as type "Text" at the moment. In stable support, the type will be inferred. | +| Environment Variable | Required | Default Value | Description | +|---------------------------|---------|----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `PAPERLESS_TOKEN` | Yes | None | The authentication token for accessing the Paperless API. | +| `PAPERLESS_BASE_URL` | Yes | None | The base URL for the Paperless API. | +| `PAPERLESS_FILTER` | NO | "NOT tagged=true" | Filter string that filters the documents to be fetched from paperless | +| `LANGUAGE` | No | "EN" | Allow to use translated base prompts (Support: EN, DE) | +| `OLLAMA_HOST` | No | "localhost" | The hostname where the Ollama service is running. | +| `OLLAMA_PORT` | No | "11434" | The port on which the Ollama service is accessible. | +| `OLLAMA_SECURE_ENDPOINT` | No | "false" | Whether to use HTTPS (`true`) or HTTP (`false`) for Ollama. | +| `OLLAMA_MODEL` | No | "llama2:13b" | The specific Ollama model to be used for processing. | +| `BASE_PROMPT` | No | see [Example Prompt](example/example.prompt) | Prompt given to the model, for requesting metadata.
Should contain the custom fields in paperless that you want doclytics. | +| `LOG_LEVEL` | No | INFO | Log level | +| `MODE` | No | 0 | :warning: **Experimental**: Mode of operation.
0 = NoAnalyze (Doclytics does nothing for this field type), 1 = NoCreate (Doclytics does not create custom fields automatically in Paperless), 2 = Create (Doclytics automatically creates custom fields that do not exist in Paperless). All fields will be created as type "Text" at the moment. In stable support, the type will be inferred. | +| `DOCLYTICS_TAGS` | No | 1 | :warning: **Experimental**: Mode of operation for tags.
0 = NoAnalyze (Doclytics does nothing for this field type), 1 = NoCreate (Doclytics does not create tags automatically in Paperless), 2 = Create (Doclytics automatically creates tags that do not exist in Paperless). All fields will be created as type "Text" at the moment. In stable support, the type will be inferred. | +| `DOCLYTICS_DOCTYPE` | No | 1 | :warning: **Experimental**: Mode of operation for document types.
0 = NoAnalyze (Doclytics does nothing for this field type), 1 = NoCreate (Doclytics does not create document types automatically in Paperless), 2 = Create (Doclytics automatically creates document types that do not exist in Paperless). All fields will be created as type "Text" at the moment. In stable support, the type will be inferred. | +| `DOCLYTICS_CORRESPONDENT` | No | 1 | :warning: **Experimental**: Mode of operation for correspondents.
0 = NoAnalyze (Doclytics does nothing for this field type), 1 = NoCreate (Doclytics does not create correspondents automatically in Paperless), 2 = Create (Doclytics automatically creates correspondents that do not exist in Paperless). All fields will be created as type "Text" at the moment. In stable support, the type will be inferred. | diff --git a/src/llm_api.rs b/src/llm_api.rs index 09dfa99..4539177 100644 --- a/src/llm_api.rs +++ b/src/llm_api.rs @@ -1,7 +1,6 @@ use ollama_rs::generation::completion::GenerationResponse; use ollama_rs::generation::completion::request::GenerationRequest; use ollama_rs::Ollama; -use crate::Document; pub async fn generate_response( ollama: &Ollama, diff --git a/src/main.rs b/src/main.rs index 3c0d3a8..955488b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,14 +69,16 @@ struct Field { #[derive(Clone, Copy)] enum Mode { + NoAnalyze, Create, NoCreate, } impl Mode { fn from_int(value: i32) -> Self { match value { - 1 => Mode::Create, - 0 => Mode::NoCreate, + 2 => Mode::Create, + 1 => Mode::NoCreate, + 0 => Mode::NoAnalyze, _ => Mode::NoCreate, } } @@ -165,7 +167,10 @@ async fn process_documents(client: &Client, ollama: &Ollama, model: &str, base_u } async fn process_documents_batch(documents: &Vec, ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, mode: Mode) -> Result<(), Box> { - + let tag_mode = create_mode_from_env("DOCLYTICS_TAGS"); + let doctype_mode = create_mode_from_env("DOCLYTICS_DOCTYPE"); + let correspondent_mode = create_mode_from_env("DOCLYTICS_CORRESPONDENT"); + Ok(for document in documents { slog_scope::trace!("Document Content: {}", document.content); slog_scope::info!("Generate Response with LLM {}", "model"); @@ -175,7 +180,26 @@ async fn process_documents_batch(documents: &Vec, ollama: &Ollama, mod let default_fields = get_default_fields(client, base_url, PaperlessDefaultFieldType::Tag).await; match default_fields { Ok(default_fields) => { - 
extract_default_fields(ollama, &model, &prompt_base, client, default_fields, base_url, &document, mode, PaperlessDefaultFieldType::Tag).await; + match tag_mode { + Mode::NoAnalyze => (), + _ => + if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, tag_mode, PaperlessDefaultFieldType::Tag).await { + return Err(err); + } + } + match doctype_mode { + Mode::NoAnalyze => (), + _ => + if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, doctype_mode, PaperlessDefaultFieldType::DocumentType).await { + return Err(err); + } + } + match correspondent_mode { + Mode::NoAnalyze => (), + _ => if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, correspondent_mode, PaperlessDefaultFieldType::Correspondent).await { + return Err(err); + } + } } Err(e) => slog_scope::error!("Error while interacting with paperless: {}", e), } @@ -211,7 +235,6 @@ async fn generate_response_and_extract_data(ollama: &Ollama, model: &str, prompt } } } - #[tokio::main] async fn main() -> Result<(), Box> { logger::init(); // Initializes the global logger @@ -271,7 +294,11 @@ fn extract_json_object(input: &str) -> Result { } } - +fn create_mode_from_env(env_key: &str) -> Mode { + let mode_env = env::var(env_key).unwrap_or_else(|_| "1".to_string()); + let mode_int = mode_env.parse::().unwrap_or(1); + Mode::from_int(mode_int) +} #[cfg(test)] mod tests { use super::*; @@ -293,4 +320,4 @@ mod tests { let empty_json_str = "No JSON object or array here"; assert!(extract_json_object(empty_json_str).is_err()); } -} \ No newline at end of file +} diff --git a/src/paperless.rs b/src/paperless.rs index afe14d6..bcf94ca 100644 --- a/src/paperless.rs +++ b/src/paperless.rs @@ -6,6 +6,7 @@ use serde::de::{DeserializeOwned, StdError}; use serde_json::{Map, Value}; use crate::{CustomField, Document, Field, Mode, Response}; use 
serde::{Deserialize, Serialize}; +use crate::util::normalize_string; #[derive(Clone, Copy)] pub enum PaperlessDefaultFieldType { @@ -316,7 +317,7 @@ pub async fn update_document_default_fields( for value in data { - if let Some(field) = fields.iter().find(|&f| f.name == *value) { + if let Some(field) = fields.iter().find(|&f| normalize_string(&*f.name) == normalize_string(&*value)) { let default_field_id = field.id; default_field_ids.push(default_field_id); } else { diff --git a/src/paperless_defaultfields.rs b/src/paperless_defaultfields.rs index be3f659..5abc6c3 100644 --- a/src/paperless_defaultfields.rs +++ b/src/paperless_defaultfields.rs @@ -25,7 +25,7 @@ async fn construct_correspondent_prompt(client: &Client, base_url: &str) -> Resu } -pub async fn extract_default_fields(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: Vec, base_url: &str, document: &Document, mode: Mode, field_type: PaperlessDefaultFieldType) -> Option> { +pub async fn extract_default_fields(ollama: &Ollama, model: &str, prompt_base: &String, client: &Client, fields: &Vec, base_url: &str, document: &Document, mode: Mode, field_type: PaperlessDefaultFieldType) -> Option> { let prompt = match field_type { PaperlessDefaultFieldType::Correspondent => construct_correspondent_prompt(client, base_url).await, PaperlessDefaultFieldType::Tag => construct_tag_prompt(client, base_url).await, diff --git a/src/util.rs b/src/util.rs index e69de29..c676f43 100644 --- a/src/util.rs +++ b/src/util.rs @@ -0,0 +1,3 @@ +pub fn normalize_string(s: &str) -> String { + s.replace("-", "").replace("_", "").to_lowercase() +} \ No newline at end of file From 4f92be2845516c275c45bfe3a2aa66ea95c74a01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Urban?= Date: Tue, 22 Oct 2024 20:55:24 +0200 Subject: [PATCH 6/6] fix: error handling and logging --- src/main.rs | 6 +++--- src/paperless_defaultfields.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.rs 
b/src/main.rs index 955488b..08ffc2a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -184,20 +184,20 @@ async fn process_documents_batch(documents: &Vec, ollama: &Ollama, mod Mode::NoAnalyze => (), _ => if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, tag_mode, PaperlessDefaultFieldType::Tag).await { - return Err(err); + slog_scope::error!("Error while getting tags: {:?}", err); } } match doctype_mode { Mode::NoAnalyze => (), _ => if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, doctype_mode, PaperlessDefaultFieldType::DocumentType).await { - return Err(err); + slog_scope::error!("Error while getting doctype: {:?}", err); } } match correspondent_mode { Mode::NoAnalyze => (), _ => if let Some(err) = extract_default_fields(ollama, &model, &prompt_base, client, &default_fields, base_url, &document, correspondent_mode, PaperlessDefaultFieldType::Correspondent).await { - return Err(err); + slog_scope::error!("Error while getting correspondents: {:?}", err); } } } diff --git a/src/paperless_defaultfields.rs b/src/paperless_defaultfields.rs index 5abc6c3..a010d58 100644 --- a/src/paperless_defaultfields.rs +++ b/src/paperless_defaultfields.rs @@ -5,7 +5,7 @@ use crate::{extract_json_object, Document, Field, Mode}; use crate::llm_api::generate_response; use crate::paperless::{get_default_fields, update_document_default_fields, update_document_fields, DefaultField, PaperlessDefaultFieldType}; - const ANSWER_INSTRUCTION: &'static str = "The result should be a only a json array of string and nothing else. The answer should start and end with the square bracket. The document is: "; + const ANSWER_INSTRUCTION: &'static str = "The result should be a only a non-nested one dimensional json array of correctly quoted strings and nothing else. The answer should start and end with the square bracket. 
The document is: "; async fn construct_document_type_prompt(client: &Client, base_url: &str) -> Result> { let document_types = get_default_fields(client, base_url, PaperlessDefaultFieldType::DocumentType).await; let base_prompt = format!("Determine the type of this document from the following available document types: {:?}, if none of these fit the document, create a new one. ", document_types);