From 905b7af9a3b99e665251c300fd3e5d60d2981795 Mon Sep 17 00:00:00 2001
From: Samraj Moorjani <samraj.moorjani@databricks.com>
Date: Wed, 9 Oct 2024 11:15:30 -0700
Subject: [PATCH] Update online monitoring dashboard for human feedback (#36)

* Update dashboard for human feedback

* Update dashboard with human feedback charts.
---
 ..._online_monitoring_dashboard_template.json | 283 +++++++++++++++---
 1 file changed, 247 insertions(+), 36 deletions(-)

diff --git a/rag_app_sample_code/resources/agent_quality_online_monitoring_dashboard_template.json b/rag_app_sample_code/resources/agent_quality_online_monitoring_dashboard_template.json
index 0c1f3ee..05dc07e 100644
--- a/rag_app_sample_code/resources/agent_quality_online_monitoring_dashboard_template.json
+++ b/rag_app_sample_code/resources/agent_quality_online_monitoring_dashboard_template.json
@@ -2,8 +2,8 @@
     "datasets": [
         {
             "name": "b3e86a40",
-            "displayName": "{{eval_requests_log_table_name}}",
-            "query": "select \n    *,\n    execution_time_ms / 1000 as execution_time_s,\n    concat(\n        '/ml/experiments/',\n        experiment_id,\n        '/runs/',\n        run_id,\n        '/evaluations?searchQuery=',\n        databricks_request_id\n     ) as eval_details\nfrom {{eval_requests_log_table_name}}\nwhere \n    run_id is not NULL \n    and run_id != \"skipped\""
+            "displayName": "request_logs_eval",
+            "query": "with ranked_logs as (\n    select \n        `timestamp`,\n        request_id,\n        source.id as source_id,\n        text_assessment.ratings[\"answer_correct\"][\"value\"] as text_rating,\n        retrieval_assessment.ratings[\"answer_correct\"][\"value\"] as retrieval_rating,\n        retrieval_assessment.position as retrieval_position,\n        row_number() over (\n            partition by request_id, source.id, retrieval_assessment.position order by `timestamp` desc\n        ) as rank\n    from {{assessment_log_table_name}}\n), human_assessments as (\n    select \n        `timestamp`,\n        request_id as databricks_request_id,\n        source_id,\n        text_rating,\n        case\n            when retrieval_rating = \"false\" then \"negative\"\n            when retrieval_rating = \"true\" then \"positive\"\n            when retrieval_rating = \"i_dont_know\" then \"i_dont_know\"\n            else null\n        end as retrieval_rating,\n        retrieval_position\n    from ranked_logs\n    where rank = 1\n    order by `timestamp` desc\n), aggregated_human_assessments as (\n    select \n        databricks_request_id,\n        sum(case when text_rating = 'positive' then 1 else 0 end) as positive_text_ratings,\n        sum(case when text_rating = 'negative' or text_rating = 'i_dont_know' then 1 else 0 end) as negative_text_ratings,\n        sum(case when retrieval_rating = 'positive' then 1 else 0 end) as positive_retrieval_ratings,\n        sum(case when retrieval_rating = 'negative' or retrieval_rating = 'i_dont_know' then 1 else 0 end) as negative_retrieval_ratings\n    from human_assessments\n    group by databricks_request_id\n), eval_log as (\n    select \n        *,\n        execution_time_ms / 1000 as execution_time_s,\n        concat(\n            '/ml/experiments/',\n            experiment_id,\n            '/runs/',\n            run_id,\n            '/evaluations?searchQuery=',\n            databricks_request_id\n        ) as 
eval_details\n    from {{eval_requests_log_table_name}}\n    where \n        run_id is not null \n        and run_id != \"skipped\"\n)\nselect\n    eval_log.*,\n    case \n        when negative_text_ratings = 0 and positive_text_ratings = 0 then null\n        when negative_text_ratings >= positive_text_ratings then \"negative\" \n        when negative_text_ratings < positive_text_ratings then \"positive\" \n        else null\n    end as user_text_rating,\n    case \n        when negative_retrieval_ratings = 0 and positive_retrieval_ratings = 0 then null\n        when negative_retrieval_ratings >= positive_retrieval_ratings then \"negative\" \n        when negative_retrieval_ratings < positive_retrieval_ratings then \"positive\" \n        else null\n    end as user_retrieval_rating\nfrom eval_log\nleft join aggregated_human_assessments \non eval_log.databricks_request_id = aggregated_human_assessments.databricks_request_id"
         }
     ],
     "pages": [
@@ -76,7 +76,10 @@
                                             "fieldName": "percentile(execution_time_s, percentage=0_9)",
                                             "displayName": "90th Percentile of execution_time_s"
                                         }
-                                    ]
+                                    ],
+                                    "axis": {
+                                        "title": "Seconds"
+                                    }
                                 }
                             },
                             "frame": {
@@ -101,7 +104,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 11,
+                        "y": 15,
                         "width": 3,
                         "height": 5
                     }
@@ -113,9 +116,9 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 30,
+                        "y": 34,
                         "width": 6,
-                        "height": 2
+                        "height": 1
                     }
                 },
                 {
@@ -163,7 +166,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
                                 },
                                 "color": {
                                     "fieldName": "response/llm_judged/safety/rating",
@@ -203,7 +209,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 18,
+                        "y": 22,
                         "width": 3,
                         "height": 6
                     }
@@ -253,7 +259,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
                                 },
                                 "color": {
                                     "fieldName": "response/llm_judged/groundedness/rating",
@@ -293,7 +302,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 24,
+                        "y": 28,
                         "width": 3,
                         "height": 6
                     }
@@ -343,7 +352,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
                                 },
                                 "color": {
                                     "fieldName": "response/llm_judged/relevance_to_query/rating",
@@ -387,7 +399,7 @@
                     },
                     "position": {
                         "x": 3,
-                        "y": 18,
+                        "y": 22,
                         "width": 3,
                         "height": 6
                     }
@@ -437,7 +449,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
                                 },
                                 "color": {
                                     "fieldName": "status_code",
@@ -450,7 +465,10 @@
                                             }
                                         ]
                                     },
-                                    "displayName": "status_code"
+                                    "legend": {
+                                        "title": "Status Code"
+                                    },
+                                    "displayName": "Status Code"
                                 }
                             },
                             "frame": {
@@ -461,7 +479,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 6,
+                        "y": 5,
                         "width": 3,
                         "height": 5
                     }
@@ -507,7 +525,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Average retrieval/llm_judged/chunk_relevance/precision"
+                                    "axis": {
+                                        "title": "Precision"
+                                    },
+                                    "displayName": "Precision"
                                 }
                             },
                             "frame": {
@@ -534,7 +555,7 @@
                     },
                     "position": {
                         "x": 3,
-                        "y": 24,
+                        "y": 28,
                         "width": 3,
                         "height": 6
                     }
@@ -542,7 +563,7 @@
                 {
                     "widget": {
                         "name": "f3cd0aff",
-                        "textbox_spec": "\n## Top-level metrics\nThe following charts give a quick overview of incoming traffic, the pass/fail rate of output quality, the average execution time for the agent, and topic distribution. **Clicking on any data point inside a chart will filter all other charts to the requests covered by the selected data point.**"
+                        "textbox_spec": "\n## Top-level metrics\nThe following charts give a quick overview of incoming traffic, the pass/fail rate of output quality, user feedback 👍 / 👎, the average execution time for the agent, and topic distribution. **Clicking on any data point inside a chart will filter all other charts to the requests covered by the selected data point.**"
                     },
                     "position": {
                         "x": 0,
@@ -1717,7 +1738,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 32,
+                        "y": 35,
                         "width": 6,
                         "height": 6
                     }
@@ -1729,7 +1750,7 @@
                     },
                     "position": {
                         "x": 0,
-                        "y": 16,
+                        "y": 20,
                         "width": 6,
                         "height": 2
                     }
@@ -1776,7 +1797,10 @@
                                     "scale": {
                                         "type": "quantitative"
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "% of Requests"
+                                    },
+                                    "displayName": "% of Requests"
                                 },
                                 "color": {
                                     "fieldName": "response/overall_assessment/rating",
@@ -1814,7 +1838,7 @@
                     },
                     "position": {
                         "x": 3,
-                        "y": 6,
+                        "y": 5,
                         "width": 3,
                         "height": 5
                     }
@@ -1856,12 +1880,12 @@
                             "selection": {
                                 "defaultSelection": {
                                     "range": {
-                                        "dataType": "DATE",
+                                        "dataType": "DATETIME",
                                         "min": {
-                                            "value": "2024-09-17T00:00:00.000"
+                                            "value": "now-7d/d"
                                         },
                                         "max": {
-                                            "value": "2024-09-25T23:59:59.999"
+                                            "value": "now-1d/d"
                                         }
                                     }
                                 }
@@ -1876,8 +1900,8 @@
                     "position": {
                         "x": 0,
                         "y": 4,
-                        "width": 2,
-                        "height": 2
+                        "width": 3,
+                        "height": 1
                     }
                 },
                 {
@@ -1930,7 +1954,10 @@
                                             "places": 2
                                         }
                                     },
-                                    "displayName": "Count of Records"
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
                                 },
                                 "color": {
                                     "fieldName": "topic",
@@ -1968,7 +1995,7 @@
                     },
                     "position": {
                         "x": 3,
-                        "y": 11,
+                        "y": 15,
                         "width": 3,
                         "height": 5
                     }
@@ -2014,22 +2041,206 @@
                         }
                     },
                     "position": {
-                        "x": 2,
+                        "x": 3,
                         "y": 4,
-                        "width": 1,
-                        "height": 2
+                        "width": 3,
+                        "height": 1
                     }
                 },
                 {
                     "widget": {
-                        "name": "49eeddb4",
-                        "textbox_spec": "##### Mosaic Agent Framework & Evaluation can also help you track actual user feedback over your agent (e.g., thumbs up/down). Reach out to your Databricks representative if you are interested in enabling this feature.\n"
+                        "name": "44027211",
+                        "queries": [
+                            {
+                                "name": "main_query",
+                                "query": {
+                                    "datasetName": "b3e86a40",
+                                    "fields": [
+                                        {
+                                            "name": "user_text_rating",
+                                            "expression": "`user_text_rating`"
+                                        },
+                                        {
+                                            "name": "daily(timestamp)",
+                                            "expression": "DATE_TRUNC(\"DAY\", `timestamp`)"
+                                        },
+                                        {
+                                            "name": "count(user_text_rating)",
+                                            "expression": "COUNT(`user_text_rating`)"
+                                        }
+                                    ],
+                                    "disaggregated": false
+                                }
+                            }
+                        ],
+                        "spec": {
+                            "version": 3,
+                            "widgetType": "bar",
+                            "encodings": {
+                                "x": {
+                                    "fieldName": "daily(timestamp)",
+                                    "scale": {
+                                        "type": "temporal"
+                                    },
+                                    "displayName": "Timestamp"
+                                },
+                                "y": {
+                                    "fieldName": "count(user_text_rating)",
+                                    "scale": {
+                                        "type": "quantitative"
+                                    },
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
+                                },
+                                "color": {
+                                    "fieldName": "user_text_rating",
+                                    "scale": {
+                                        "type": "categorical",
+                                        "mappings": [
+                                            {
+                                                "value": null,
+                                                "color": "#077A9D"
+                                            },
+                                            {
+                                                "value": "no",
+                                                "color": "#FFAB00"
+                                            },
+                                            {
+                                                "value": "yes",
+                                                "color": "#00A972"
+                                            },
+                                            {
+                                                "value": "negative",
+                                                "color": "#FFAB00"
+                                            },
+                                            {
+                                                "value": "positive",
+                                                "color": "#00A972"
+                                            }
+                                        ]
+                                    },
+                                    "legend": {
+                                        "hideTitle": true
+                                    },
+                                    "displayName": "user_text_rating"
+                                }
+                            },
+                            "frame": {
+                                "title": "Human feedback on response quality",
+                                "showTitle": true,
+                                "showDescription": true,
+                                "description": "Each request is labeled by majority vote: negative when thumbs down (or don't-know) ratings are at least as many as thumbs up, positive otherwise"
+                            },
+                            "mark": {
+                                "layout": "stack"
+                            }
+                        }
+                    },
+                    "position": {
+                        "x": 0,
+                        "y": 10,
+                        "width": 3,
+                        "height": 5
+                    }
+                },
+                {
+                    "widget": {
+                        "name": "13a7e1aa",
+                        "queries": [
+                            {
+                                "name": "main_query",
+                                "query": {
+                                    "datasetName": "b3e86a40",
+                                    "fields": [
+                                        {
+                                            "name": "user_retrieval_rating",
+                                            "expression": "`user_retrieval_rating`"
+                                        },
+                                        {
+                                            "name": "daily(timestamp)",
+                                            "expression": "DATE_TRUNC(\"DAY\", `timestamp`)"
+                                        },
+                                        {
+                                            "name": "count(user_retrieval_rating)",
+                                            "expression": "COUNT(`user_retrieval_rating`)"
+                                        }
+                                    ],
+                                    "disaggregated": false
+                                }
+                            }
+                        ],
+                        "spec": {
+                            "version": 3,
+                            "widgetType": "bar",
+                            "encodings": {
+                                "x": {
+                                    "fieldName": "daily(timestamp)",
+                                    "scale": {
+                                        "type": "temporal"
+                                    },
+                                    "displayName": "Timestamp"
+                                },
+                                "y": {
+                                    "fieldName": "count(user_retrieval_rating)",
+                                    "scale": {
+                                        "type": "quantitative"
+                                    },
+                                    "axis": {
+                                        "title": "# of Requests"
+                                    },
+                                    "displayName": "# of Requests"
+                                },
+                                "color": {
+                                    "fieldName": "user_retrieval_rating",
+                                    "scale": {
+                                        "type": "categorical",
+                                        "mappings": [
+                                            {
+                                                "value": null,
+                                                "color": "#077A9D"
+                                            },
+                                            {
+                                                "value": "no",
+                                                "color": "#FFAB00"
+                                            },
+                                            {
+                                                "value": "yes",
+                                                "color": "#00A972"
+                                            },
+                                            {
+                                                "value": "negative",
+                                                "color": "#FFAB00"
+                                            },
+                                            {
+                                                "value": "positive",
+                                                "color": "#00A972"
+                                            }
+                                        ]
+                                    },
+                                    "legend": {
+                                        "hideTitle": true
+                                    },
+                                    "displayName": "user_retrieval_rating"
+                                }
+                            },
+                            "frame": {
+                                "title": "Human feedback on retrieval quality",
+                                "showTitle": true,
+                                "showDescription": true,
+                                "description": "Each request is labeled by majority vote: negative when thumbs down (or don't-know) ratings are at least as many as thumbs up, positive otherwise"
+                            },
+                            "mark": {
+                                "layout": "stack"
+                            }
+                        }
                     },
                     "position": {
                         "x": 3,
-                        "y": 4,
+                        "y": 10,
                         "width": 3,
-                        "height": 2
+                        "height": 5
                     }
                 }
             ]