From fddaf31633f93f93d7ebadb42f622ad2aaaf0437 Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner
Date: Fri, 18 Oct 2024 16:00:57 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll   |   2 +-
 r/NEWS.html | 733 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 search.json | 103 ++++----
 sitemap.xml |  50 ++--
 4 files changed, 816 insertions(+), 72 deletions(-)
 create mode 100644 r/NEWS.html

diff --git a/.nojekyll b/.nojekyll
index 95791cb..e4dfc06 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-9c1ae2f9
\ No newline at end of file
+92ff7ebd
\ No newline at end of file
diff --git a/r/NEWS.html b/r/NEWS.html
new file mode 100644
index 0000000..8d03681
--- /dev/null
+++ b/r/NEWS.html
@@ -0,0 +1,733 @@
+[733 lines of generated Quarto HTML for the new "news – mall" page; markup stripped in extraction, rendered text only:]
+mall 0.1.0
+
+  • Initial CRAN submission.
\ No newline at end of file
diff --git a/search.json b/search.json
index db920da..8c61f84 100644
--- a/search.json
+++ b/search.json
@@ -329,51 +329,37 @@
    "text": "Examples\n\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Use 'labels' to let the function know what to extract \nllm_extract(reviews, review, labels = \"product\") \n#> # A tibble: 3 × 2\n#> review .extract \n#> <chr> <chr> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Use 'pred_name' to customize the new column's name \nllm_extract(reviews, review, \"product\", pred_name = \"prod\") \n#> # A tibble: 3 × 2\n#> review prod \n#> <chr> <chr> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Pass a vector to request multiple things, the results will be pipe delimited \n# in a single column \nllm_extract(reviews, review, c(\"product\", \"feelings\")) \n#> # A tibble: 3 × 2\n#> review .extract \n#> <chr> <chr> \n#> 1 This has been the best TV I've ever used. Gr… tv | great \n#> 2 I regret buying this laptop. It is too slow … laptop|frustration \n#> 3 Not sure how to feel about my new washing ma… washing machine | confusion\n \n# To get multiple columns, use 'expand_cols' \nllm_extract(reviews, review, c(\"product\", \"feelings\"), expand_cols = TRUE) \n#> # A tibble: 3 × 3\n#> review product feelings \n#> <chr> <chr> <chr> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# Pass a named vector to set the resulting column names \nllm_extract( \n .data = reviews, \n col = review, \n labels = c(prod = \"product\", feels = \"feelings\"), \n expand_cols = TRUE \n) \n#> # A tibble: 3 × 3\n#> review prod feels \n#> <chr> <chr> <chr> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_extract(\"bob smith, 123 3rd street\", c(\"name\", \"address\")) \n#> [1] \"bob smith | 123 3rd street\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_extract( \n \"bob smith, 123 3rd street\", \n c(\"name\", \"address\"), \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful text extraction engine. Extract the name, address being referred to on the text. I expect 2 items exactly. No capitalization. No explanations. Return the response exclusively in a pipe separated list, and no headers. The answer is based on the following text:\\nbob smith, 123 3rd street\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)"
  },
  {
    "objectID": "r/cran-comments.html",
    "href": "r/cran-comments.html",
    "title": "mall",
    "section": "",
    "text": "This is a new package submission. Run multiple ‘Large Language Model’ predictions against a table. 
The predictions run row-wise over a specified column. It works using a one-shot prompt, along with the current row’s content. The prompt that is used will depend on the type of analysis needed.\nThe README file is very short because all the information about how to use it is this website: https://mlverse.github.io/mall/."
  },
  {
    "objectID": "r/cran-comments.html#new-submission",
    "href": "r/cran-comments.html#new-submission",
    "title": "mall",
    "section": "",
    "text": "This is a new package submission. Run multiple ‘Large Language Model’ predictions against a table. The predictions run row-wise over a specified column. It works using a one-shot prompt, along with the current row’s content. The prompt that is used will depend on the type of analysis needed.\nThe README file is very short because all the information about how to use it is this website: https://mlverse.github.io/mall/."
  },
  {
    "objectID": "r/cran-comments.html#r-cmd-check-environments",
    "href": "r/cran-comments.html#r-cmd-check-environments",
    "title": "mall",
    "section": "R CMD check environments",
    "text": "R CMD check environments\n\nMac OS M3 (aarch64-apple-darwin23), R 4.4.1 (Local)\nMac OS x86_64-apple-darwin20.0 (64-bit), R 4.4.1 (GH Actions)\nWindows x86_64-w64-mingw32 (64-bit), R 4.4.1 (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.4.1 (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.5.0 (dev) (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.3.3 (old release) (GH Actions)"
  },
  {
    "objectID": "r/cran-comments.html#r-cmd-check-results",
    "href": "r/cran-comments.html#r-cmd-check-results",
    "title": "mall",
    "section": "R CMD check results",
    "text": "R CMD check results\n0 errors ✔ | 0 warnings ✔ | 0 notes ✔"
  },
  {
    "objectID": "articles/databricks.html",
    "href": "articles/databricks.html",
    "title": "Databricks",
    "section": "",
    "text": "This brief example shows how seamless it is to use the same functions, but against a remote database connection. 
Today, it works with the following functions:",
    "crumbs": [
      "Databricks (R only)"
    ]
  },
  {
    "objectID": "articles/databricks.html#examples",
    "href": "articles/databricks.html#examples",
    "title": "Databricks",
    "section": "Examples",
    "text": "Examples\nWe will start by connecting to the Databricks Warehouse\n\nlibrary(mall)\nlibrary(DBI)\n\ncon <- dbConnect(\n odbc::databricks(),\n HTTPPath = Sys.getenv(\"DATABRICKS_PATH\")\n)\n\nNext, we will create a small reviews table\n\nlibrary(dplyr)\n\nreviews <- tribble(\n ~review,\n \"This has been the best TV I've ever used. Great screen, and sound.\",\n \"I regret buying this laptop. It is too slow and the keyboard is too noisy\",\n \"Not sure how to feel about my new washing machine. Great color, but hard to figure\"\n)\n\ntbl_reviews <- copy_to(con, reviews, overwrite = TRUE)\n\nUsing llm_sentiment() in Databricks will call that vendor’s SQL AI function directly:\n\ntbl_reviews |>\n llm_sentiment(review)\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .sentiment\n#> <chr> <chr> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… mixed\n\nThere are some differences in the arguments and output of the LLMs. Notice that instead of “neutral”, the prediction is “mixed”. The AI Sentiment function does not allow changing the possible options.\nNext, we will try llm_summarize(). The max_words argument maps to the same argument in the AI Summarize function:\n\ntbl_reviews |>\n llm_summarize(review, max_words = 5) |> \n show_query()\n#> <SQL>\n#> SELECT `reviews`.*, ai_summarize(`review`, CAST(5.0 AS INT)) AS `.summary`\n#> FROM `reviews`\n\nllm_classify() for this back-end will only accept unnamed options.\n\ntbl_reviews |> \n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .classify\n#> <chr> <chr> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. appliance\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too… computer \n#> 3 Not sure how to feel about my new washing machine. Great color, but… appliance",
    "crumbs": [
      "Databricks (R only)"
    ]
  },
  {
    "objectID": "articles/performance.html",
    "href": "articles/performance.html",
    "title": "Performance",
    "section": "",
    "text": "We will briefly cover this method’s performance from two perspectives:\n\nHow long the analysis takes to run locally\nHow well it predicts\n\nTo do so, we will use the data_bookReviews data set, provided by the classmap package. For this exercise, only the first 100 of the total 1,000 are going to be part of this analysis.\n\nlibrary(mall)\nlibrary(classmap)\nlibrary(dplyr)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review <chr> \"i got this as both a book and an audio file. i had waited t…\n#> $ sentiment <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n\nAs per the docs, sentiment is a factor indicating the sentiment of the review: negative (1) or positive (2).\n\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n\nJust to get an idea of how much data we’re processing, I’m using a very, very simple word count. So we’re analyzing a bit over 20 thousand words.\n\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n\nAs far as time, on my Apple M3 machine, it took about 1.5 minutes to process 100 rows containing 20 thousand words. Setting temp to 0 in llm_use() made the model run faster.\nThe package uses purrr to send each prompt individually to the LLM. But I did try a few different ways to speed up the process, unsuccessfully:\n\nUsed furrr to send multiple requests at a time. This did not work because either the LLM or Ollama processed all my requests serially. So there was no improvement.\nI also tried sending more than one row’s text at a time. This caused instability in the number of results. For example, sending 5 at a time sometimes returned 7 or 8. Even sending 2 was not stable.\n\nThis is what the new table looks like:\n\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> <chr> <fct> <dbl>\n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n\nI used yardstick to see how well the model performed. 
Of course, the accuracy will not be measured against the “truth”, but rather against the package’s results recorded in sentiment.\n\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> <chr> <chr> <dbl>\n#> 1 accuracy binary 0.980",
    "crumbs": [
      "Performance"
    ]
  },
  {
    "objectID": "articles/performance.html#performance",
    "href": "articles/performance.html#performance",
    "title": "Performance",
    "section": "",
    "text": "We will briefly cover this method’s performance from two perspectives:\n\nHow long the analysis takes to run locally\nHow well it predicts\n\nTo do so, we will use the data_bookReviews data set, provided by the classmap package. For this exercise, only the first 100 of the total 1,000 are going to be part of this analysis.\n\nlibrary(mall)\nlibrary(classmap)\nlibrary(dplyr)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review <chr> \"i got this as both a book and an audio file. 
i had waited t…\n#> $ sentiment <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n\nAs per the docs, sentiment is a factor indicating the sentiment of the review: negative (1) or positive (2).\n\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n\nJust to get an idea of how much data we’re processing, I’m using a very, very simple word count. So we’re analyzing a bit over 20 thousand words.\n\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n\nAs far as time, on my Apple M3 machine, it took about 1.5 minutes to process 100 rows containing 20 thousand words. Setting temp to 0 in llm_use() made the model run faster.\nThe package uses purrr to send each prompt individually to the LLM. But I did try a few different ways to speed up the process, unsuccessfully:\n\nUsed furrr to send multiple requests at a time. This did not work because either the LLM or Ollama processed all my requests serially. So there was no improvement.\nI also tried sending more than one row’s text at a time. This caused instability in the number of results. For example, sending 5 at a time sometimes returned 7 or 8. Even sending 2 was not stable.\n\nThis is what the new table looks like:\n\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> <chr> <fct> <dbl>\n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n\nI used yardstick to see how well the model performed. Of course, the accuracy will not be measured against the “truth”, but rather against the package’s results recorded in sentiment.\n\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> <chr> <chr> <dbl>\n#> 1 accuracy binary 0.980",
    "crumbs": [
      "Performance"
    ]
  },
  {
    "objectID": "r/NEWS.html",
    "href": "r/NEWS.html",
    "title": "mall 0.1.0",
    "section": "",
    "text": "mall 0.1.0\n\nInitial CRAN submission."
  },
  {
    "objectID": "r/LICENSE.html",
    "href": "r/LICENSE.html",
    "title": "MIT License",
    "section": "",
    "text": "MIT License\nCopyright (c) 2024 mall authors\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\nTHE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
  },
  {
    "objectID": "articles/performance.html",
    "href": "articles/performance.html",
    "title": "Performance",
    "section": "",
    "text": "We will briefly cover this method’s performance from two perspectives:\n\nHow long the analysis takes to run locally\nHow well it predicts\n\nTo do so, we will use the data_bookReviews data set, provided by the classmap package. For this exercise, only the first 100 of the total 1,000 are going to be part of this analysis.\n\nlibrary(mall)\nlibrary(classmap)\nlibrary(dplyr)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review <chr> \"i got this as both a book and an audio file. 
i had waited t…\n#> $ sentiment <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n\nAs per the docs, sentiment is a factor indicating the sentiment of the review: negative (1) or positive (2).\n\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n\nJust to get an idea of how much data we’re processing, I’m using a very, very simple word count. So we’re analyzing a bit over 20 thousand words.\n\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n\nAs far as time, on my Apple M3 machine, it took about 1.5 minutes to process 100 rows containing 20 thousand words. Setting temp to 0 in llm_use() made the model run faster.\nThe package uses purrr to send each prompt individually to the LLM. But I did try a few different ways to speed up the process, unsuccessfully:\n\nUsed furrr to send multiple requests at a time. This did not work because either the LLM or Ollama processed all my requests serially. So there was no improvement.\nI also tried sending more than one row’s text at a time. This caused instability in the number of results. For example, sending 5 at a time sometimes returned 7 or 8. Even sending 2 was not stable.\n\nThis is what the new table looks like:\n\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> <chr> <fct> <dbl>\n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n\nI used yardstick to see how well the model performed. Of course, the accuracy will not be measured against the “truth”, but rather against the package’s results recorded in sentiment.\n\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> <chr> <chr> <dbl>\n#> 1 accuracy binary 0.980",
    "crumbs": [
      "Performance"
    ]
  },
  {
    "objectID": "articles/performance.html#performance",
    "href": "articles/performance.html#performance",
    "title": "Performance",
    "section": "",
    "text": "We will briefly cover this method’s performance from two perspectives:\n\nHow long the analysis takes to run locally\nHow well it predicts\n\nTo do so, we will use the data_bookReviews data set, provided by the classmap package. For this exercise, only the first 100 of the total 1,000 are going to be part of this analysis.\n\nlibrary(mall)\nlibrary(classmap)\nlibrary(dplyr)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review <chr> \"i got this as both a book and an audio file. 
i had waited t…\n#> $ sentiment <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n\nAs per the docs, sentiment is a factor indicating the sentiment of the review: negative (1) or positive (2).\n\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n\nJust to get an idea of how much data we’re processing, I’m using a very, very simple word count. So we’re analyzing a bit over 20 thousand words.\n\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n\nAs far as time, on my Apple M3 machine, it took about 1.5 minutes to process 100 rows containing 20 thousand words. Setting temp to 0 in llm_use() made the model run faster.\nThe package uses purrr to send each prompt individually to the LLM. But I did try a few different ways to speed up the process, unsuccessfully:\n\nUsed furrr to send multiple requests at a time. This did not work because either the LLM or Ollama processed all my requests serially. So there was no improvement.\nI also tried sending more than one row’s text at a time. This caused instability in the number of results. For example, sending 5 at a time sometimes returned 7 or 8. Even sending 2 was not stable.\n\nThis is what the new table looks like:\n\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> <chr> <fct> <dbl>\n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n\nI used yardstick to see how well the model performed. Of course, the accuracy will not be measured against the “truth”, but rather against the package’s results recorded in sentiment.\n\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> <chr> <chr> <dbl>\n#> 1 accuracy binary 0.980",
    "crumbs": [
      "Performance"
    ]
  },
  {
    "objectID": "articles/databricks.html",
    "href": "articles/databricks.html",
    "title": "Databricks",
    "section": "",
    "text": "This brief example shows how seamless it is to use the same functions, but against a remote database connection. Today, it works with the following functions:",
    "crumbs": [
      "Databricks (R only)"
    ]
  },
  {
    "objectID": "articles/databricks.html#examples",
    "href": "articles/databricks.html#examples",
    "title": "Databricks",
    "section": "Examples",
    "text": "Examples\nWe will start by connecting to the Databricks Warehouse\n\nlibrary(mall)\nlibrary(DBI)\n\ncon <- dbConnect(\n odbc::databricks(),\n HTTPPath = Sys.getenv(\"DATABRICKS_PATH\")\n)\n\nNext, we will create a small reviews table\n\nlibrary(dplyr)\n\nreviews <- tribble(\n ~review,\n \"This has been the best TV I've ever used. Great screen, and sound.\",\n \"I regret buying this laptop. It is too slow and the keyboard is too noisy\",\n \"Not sure how to feel about my new washing machine. 
Great color, but hard to figure\"\n)\n\ntbl_reviews <- copy_to(con, reviews, overwrite = TRUE)\n\nUsing llm_sentiment() in Databricks will call that vendor’s SQL AI function directly:\n\ntbl_reviews |>\n llm_sentiment(review)\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .sentiment\n#> <chr> <chr> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… mixed\n\nThere are some differences in the arguments and output of the LLMs. Notice that instead of “neutral”, the prediction is “mixed”. The AI Sentiment function does not allow changing the possible options.\nNext, we will try llm_summarize(). The max_words argument maps to the same argument in the AI Summarize function:\n\ntbl_reviews |>\n llm_summarize(review, max_words = 5) |> \n show_query()\n#> <SQL>\n#> SELECT `reviews`.*, ai_summarize(`review`, CAST(5.0 AS INT)) AS `.summary`\n#> FROM `reviews`\n\nllm_classify() for this back-end will only accept unnamed options.\n\ntbl_reviews |> \n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .classify\n#> <chr> <chr> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. appliance\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too… computer \n#> 3 Not sure how to feel about my new washing machine. Great color, but… appliance",
    "crumbs": [
      "Databricks (R only)"
    ]
  },
  {
    "objectID": "r/cran-comments.html",
    "href": "r/cran-comments.html",
    "title": "mall",
    "section": "",
    "text": "This is a new package submission. Run multiple ‘Large Language Model’ predictions against a table. The predictions run row-wise over a specified column. It works using a one-shot prompt, along with the current row’s content. The prompt that is used will depend on the type of analysis needed.\nThe README file is very short because all the information about how to use it is this website: https://mlverse.github.io/mall/."
  },
  {
    "objectID": "r/cran-comments.html#new-submission",
    "href": "r/cran-comments.html#new-submission",
    "title": "mall",
    "section": "",
    "text": "This is a new package submission. Run multiple ‘Large Language Model’ predictions against a table. The predictions run row-wise over a specified column. It works using a one-shot prompt, along with the current row’s content. The prompt that is used will depend on the type of analysis needed.\nThe README file is very short because all the information about how to use it is this website: https://mlverse.github.io/mall/."
  },
  {
    "objectID": "r/cran-comments.html#r-cmd-check-environments",
    "href": "r/cran-comments.html#r-cmd-check-environments",
    "title": "mall",
    "section": "R CMD check environments",
    "text": "R CMD check environments\n\nMac OS M3 (aarch64-apple-darwin23), R 4.4.1 (Local)\nMac OS x86_64-apple-darwin20.0 (64-bit), R 4.4.1 (GH Actions)\nWindows x86_64-w64-mingw32 (64-bit), R 4.4.1 (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.4.1 (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.5.0 (dev) (GH Actions)\nLinux x86_64-pc-linux-gnu (64-bit), R 4.3.3 (old release) (GH Actions)"
  },
  {
    "objectID": "r/cran-comments.html#r-cmd-check-results",
    "href": "r/cran-comments.html#r-cmd-check-results",
    "title": "mall",
    "section": "R CMD check results",
    "text": "R CMD check results\n0 errors ✔ | 0 warnings ✔ | 0 notes ✔"
  },
  {
    "objectID": "index.html",

diff --git a/sitemap.xml b/sitemap.xml
index a3bc5e3..a1368d9 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,78 +2,82 @@
 
 https://mlverse.github.io/mall/reference/llm_summarize.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/m_backend_submit.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/llm_use.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/reviews.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/llm_classify.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/llm_sentiment.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/llm_extract.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
- https://mlverse.github.io/mall/r/cran-comments.html
- 2024-10-18T15:55:27.839Z
+ https://mlverse.github.io/mall/r/NEWS.html
+ 2024-10-18T16:00:28.431Z
 
 
- https://mlverse.github.io/mall/articles/databricks.html
- 2024-10-18T15:55:27.835Z
+ https://mlverse.github.io/mall/r/LICENSE.html
+ 2024-10-18T16:00:28.431Z
+ 
+ 
+ https://mlverse.github.io/mall/articles/performance.html
+ 2024-10-18T16:00:28.427Z
 
 
 https://mlverse.github.io/mall/articles/caching.html
- 2024-10-18T15:55:27.835Z
+ 2024-10-18T16:00:28.427Z
 
 
- https://mlverse.github.io/mall/articles/performance.html
- 2024-10-18T15:55:27.835Z
+ https://mlverse.github.io/mall/articles/databricks.html
+ 2024-10-18T16:00:28.427Z
 
 
- https://mlverse.github.io/mall/r/LICENSE.html
- 2024-10-18T15:55:27.839Z
+ https://mlverse.github.io/mall/r/cran-comments.html
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/index.html
- 2024-10-18T15:55:27.835Z
+ 2024-10-18T16:00:28.427Z
 
 
 https://mlverse.github.io/mall/reference/MallFrame.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
 
 https://mlverse.github.io/mall/reference/r_index.html
- 2024-10-18T15:55:27.843Z
+ 2024-10-18T16:00:28.431Z
 
https://mlverse.github.io/mall/reference/llm_custom.html - 2024-10-18T15:55:27.843Z + 2024-10-18T16:00:28.431Z https://mlverse.github.io/mall/reference/llm_verify.html - 2024-10-18T15:55:27.843Z + 2024-10-18T16:00:28.431Z https://mlverse.github.io/mall/reference/llm_translate.html - 2024-10-18T15:55:27.843Z + 2024-10-18T16:00:28.431Z https://mlverse.github.io/mall/reference/index.html - 2024-10-18T15:55:27.843Z + 2024-10-18T16:00:28.431Z