diff --git a/300_Aggregations.asciidoc b/300_Aggregations.asciidoc deleted file mode 100644 index df309af6b..000000000 --- a/300_Aggregations.asciidoc +++ /dev/null @@ -1,21 +0,0 @@ -[[aggregations]] - -== Aggregations - -include::300_Aggregations/05_overview.asciidoc[] - -include::300_Aggregations/15_concepts_buckets.asciidoc[] - -include::300_Aggregations/20_basic_example.asciidoc[] - -include::300_Aggregations/25_basic_example_expanded.asciidoc[] - -include::300_Aggregations/28_bucket_metric_list.asciidoc[] - -include::300_Aggregations/30_histogram.asciidoc[] - -include::300_Aggregations/35_date_histogram.asciidoc[] - -include::300_Aggregations/40_scope.asciidoc[] - -include::300_Aggregations/45_filtering.asciidoc[] \ No newline at end of file diff --git a/300_Aggregations/21_add_metric.asciidoc b/300_Aggregations/21_add_metric.asciidoc new file mode 100644 index 000000000..4be3a670d --- /dev/null +++ b/300_Aggregations/21_add_metric.asciidoc @@ -0,0 +1,93 @@ + +=== Adding a metric to the mix + +The previous example told us how many documents were in each bucket, which is +useful. But often, our applications require more sophisticated _metrics_ about +the documents. For example, what is the average price of cars in each bucket? + +// "nesting"-> need to tell Elasticsearch which metrics to calculate, and on which fields. +To get this information, we need to start nesting metrics inside of the buckets. +Metrics will calculate some kind of mathematical statistic based on the values +in the documents residing within a particular bucket. 
+ +Let's go ahead and add an `avg` metric to our car example: + +[source,js] +-------------------------------------------------- +GET /cars/transactions/_search?search_type=count +{ + "aggs": { + "colors": { + "terms": { + "field": "color" + }, + "aggs": { <1> + "avg_price": { <2> + "avg": { + "field": "price" <3> + } + } + } + } + } +} +-------------------------------------------------- +// SENSE: 300_Aggregations/20_basic_example.json +<1> We add a new `aggs` level to hold the metric +<2> We then give the metric a name: "avg_price" +<3> And finally define it as an `avg` metric over the "price" field + +As you can see, we took the previous example and tacked on a new `aggs` level. +This new aggregation level allows us to nest the `avg` metric inside the +`terms` bucket. Effectively, this means we will generate an average for each +color. + +Just like the "colors" example, we need to name our metric ("avg_price") so we +can retrieve the values later. Finally, we specify the metric itself (`avg`) +and what field we want the average to be calculated on (`price`). + +// Delete this para +The response is, not surprisingly, nearly identical to the previous response...except +there is now a new "avg_price" element added to each color bucket: + +[source,js] +-------------------------------------------------- +{ +... + "aggregations": { + "colors": { + "buckets": [ + { + "key": "red", + "doc_count": 4, + "avg_price": { <1> + "value": 32500 + } + }, + { + "key": "blue", + "doc_count": 2, + "avg_price": { + "value": 20000 + } + }, + { + "key": "green", + "doc_count": 2, + "avg_price": { + "value": 21000 + } + } + ] + } + } +... +} +-------------------------------------------------- +<1> New "avg_price" element in response + +// Would love to have a graph under each example showing how the data can be displayed (later, i know) +Although the response has changed minimally, the data we get out of it has grown +substantially. Before, we knew there were four red cars. 
Now we know that the +average price of red cars is $32,500. This is something that you can plug directly +into reports or graphs. \ No newline at end of file diff --git a/300_Aggregations/22_nested_bucket.asciidoc b/300_Aggregations/22_nested_bucket.asciidoc new file mode 100644 index 000000000..fe276974d --- /dev/null +++ b/300_Aggregations/22_nested_bucket.asciidoc @@ -0,0 +1,101 @@ + +=== Buckets inside of buckets + +The true power of aggregations becomes apparent once you start playing with +different nesting schemes. In the previous examples, we saw how you could nest +a metric inside a bucket, which is already quite powerful. + +But the really exciting analytics come from nesting buckets inside _other buckets_. +This time, we want to find out the distribution of car manufacturers for each +color: + + +[source,js] +-------------------------------------------------- +GET /cars/transactions/_search?search_type=count +{ + "aggs": { + "colors": { + "terms": { + "field": "color" + }, + "aggs": { + "avg_price": { <1> + "avg": { + "field": "price" + } + }, + "make": { <2> + "terms": { + "field": "make" <3> + } + } + } + } + } +} +-------------------------------------------------- +// SENSE: 300_Aggregations/20_basic_example.json +<1> Notice that we can leave the previous "avg_price" metric in place +<2> Another aggregation named "make" is added to the "colors" bucket +<3> This aggregation is a `terms` bucket and will generate unique buckets for +each car make + +A few interesting things happened here. First, you'll notice that the previous +"avg_price" metric is left entirely intact. Each "level" of an aggregation can +have many metrics or buckets. The "avg_price" metric tells us the average price +for each car color. This is independent of other buckets and metrics which +are also being built. + +This is very important for your application, since there are often many related, +but entirely distinct, metrics which you need to collect. 
Aggregations allow +you to collect all of them in a single pass over the data. + +The other important thing to note is that the aggregation we added, "make", is +a `terms` bucket (nested inside the "colors" `terms` bucket). This means we will +generate a (color, make) tuple for every unique combination in your dataset. + +Let's take a look at the response (truncated for brevity, since it is now +growing quite long): + + +[source,js] +-------------------------------------------------- +{ +... + "aggregations": { + "colors": { + "buckets": [ + { + "key": "red", + "doc_count": 4, + "make": { <1> + "buckets": [ + { + "key": "honda", <2> + "doc_count": 3 + }, + { + "key": "bmw", + "doc_count": 1 + } + ] + }, + "avg_price": { + "value": 32500 <3> + } + }, + +... +} +-------------------------------------------------- +<1> Our new aggregation is nested under each color bucket, as expected +<2> We now see a breakdown of car makes for each color +<3> Finally, you can see that our previous "avg_price" metric is still intact + +The response tells us: + +- There are four red cars +- The average price of a red car is $32,500 +- Three of the red cars are made by Honda, and one is a BMW +- Similar analytics are generated for other colors and makes \ No newline at end of file diff --git a/300_Aggregations/23_extra_metrics.asciidoc b/300_Aggregations/23_extra_metrics.asciidoc new file mode 100644 index 000000000..e1edb14fd --- /dev/null +++ b/300_Aggregations/23_extra_metrics.asciidoc @@ -0,0 +1,97 @@ + + +==== One final modification + +Just to drive the point home, let's make one final modification to our example +before moving on to new topics. 
Let's add two metrics to calculate the min and +max price for each make: + + +[source,js] +-------------------------------------------------- +GET /cars/transactions/_search?search_type=count +{ + "aggs": { + "colors": { + "terms": { + "field": "color" + }, + "aggs": { + "avg_price": { "avg": { "field": "price" } + }, + "make" : { + "terms" : { + "field" : "make" + }, + "aggs" : { <1> + "min_price" : { "min": { "field": "price"} }, <2> + "max_price" : { "max": { "field": "price"} } <3> + } + } + } + } + } +} +-------------------------------------------------- +// SENSE: 300_Aggregations/20_basic_example.json + +// Careful with the "no surprise", it makes it sound like you're bored :) + +<1> No surprise...we need to add another "aggs" level for nesting +<2> Then we include a `min` metric +<3> And a `max` metric + +Which gives us the following output (again, truncated): + +[source,js] +-------------------------------------------------- +{ +... + "aggregations": { + "colors": { + "buckets": [ + { + "key": "red", + "doc_count": 4, + "make": { + "buckets": [ + { + "key": "honda", + "doc_count": 3, + "min_price": { + "value": 10000 <1> + }, + "max_price": { + "value": 20000 <1> + } + }, + { + "key": "bmw", + "doc_count": 1, + "min_price": { + "value": 80000 + }, + "max_price": { + "value": 80000 + } + } + ] + }, + "avg_price": { + "value": 32500 + } + }, +... +-------------------------------------------------- +<1> The `min` and `max` metrics that we added now appear under each "make" + +With those two metrics, we've expanded the information derived from this query +to include: + +// Nice, but "Similar analytics.." -> "etc."? 
+- There are four red cars +- The average price of a red car is $32,500 +- Three of the red cars are made by Honda, and one is a BMW +- The cheapest Honda is $10,000 +- The most expensive Honda is $20,000 +- Similar analytics are generated for all other colors and makes \ No newline at end of file diff --git a/300_Aggregations/25_basic_example_expanded.asciidoc b/300_Aggregations/25_basic_example_expanded.asciidoc index 68603bf6f..139597f9c 100644 --- a/300_Aggregations/25_basic_example_expanded.asciidoc +++ b/300_Aggregations/25_basic_example_expanded.asciidoc @@ -1,290 +1,2 @@ -=== Adding a metric to the mix -The previous example told us how many documents were in each bucket, which is -useful. But often, our applications require more sophisticated _metrics_ about -the documents. For example, what is the average price of cars in each bucket? - -// "nesting"-> need to tell Elasticsearch which metrics to calculate, and on which fields. -To get this information, we need to start nesting metrics inside of the buckets. -Metrics will calculate some kind of mathematical statistic based on the values -in the documents residing within a particular bucket. - -Let's go ahead and add an `average` metric to our car example: - -[source,js] --------------------------------------------------- -GET /cars/transactions/_search?search_type=count -{ - "aggs": { - "colors": { - "terms": { - "field": "color" - }, - "aggs": { <1> - "avg_price": { <2> - "avg": { - "field": "price" <3> - } - } - } - } - } -} --------------------------------------------------- -// SENSE: 300_Aggregations/20_basic_example.json -<1> We add a new `aggs` level to hold the metric -<2> We then give the metric a name: "avg_price" -<3> And finally define it as an `avg` metric over the "price" field - -As you can see, we took the previous example and tacked on a new `agga` level. -This new aggregation level allows us to nest the `avg` metric inside the -`terms` bucket. 
Effectively, this means we will generate an average for each -color. - -Just like the "colors" example, we need to name our metric ("avg_price") so we -can retrieve the values later. Finally, we specify the metric itself (`avg`) -and what field we want the average to be calculated on (`price`). - -// Delete this para -The response is, not surprisingly, nearly identical to the previous response...except -there is now a new "avg_price" element added to each color bucket: - -[source,js] --------------------------------------------------- -{ -... - "aggregations": { - "colors": { - "buckets": [ - { - "key": "red", - "doc_count": 4, - "avg_price": { <1> - "value": 32500 - } - }, - { - "key": "blue", - "doc_count": 2, - "avg_price": { - "value": 20000 - } - }, - { - "key": "green", - "doc_count": 2, - "avg_price": { - "value": 21000 - } - } - ] - } - } -... -} --------------------------------------------------- -<1> New "avg_price" element in response - -// Would love to have a graph under each example showing how the data can be displayed (later, i know) -Although the response has changed minimally, the data we get out of it has grown -substantially. Before, we knew there were four red cars. Now we know that the -average price of red cars is $32,500. This is something that you can plug directly -into reports or graphs. - -=== Buckets inside of buckets - -The true power of aggregations becomes apparent once you start playing with -different nesting schemes. In the previous examples, we saw how you could nest -a metric inside a bucket, which is already quite powerful. - -But the real exciting analytics come from nesting buckets inside _other buckets_. 
-This time, we want to find out the distribution of car manufacturers for each -color: - - -[source,js] --------------------------------------------------- -GET /cars/transactions/_search?search_type=count -{ - "aggs": { - "colors": { - "terms": { - "field": "color" - }, - "aggs": { - "avg_price": { <1> - "avg": { - "field": "price" - } - }, - "make": { <2> - "terms": { - "field": "make" <3> - } - } - } - } - } -} --------------------------------------------------- -// SENSE: 300_Aggregations/20_basic_example.json -<1> Notice that we can leave the previous "avg_price" metric in place -<2> Another aggregation named "make" is added to the "color" bucket -<3> This aggregation is a `terms` bucket and will generate unique buckets for -each car make - -A few interesting things happened here. First, you'll notice that the previous -"avg_price" metric is left entirely intact. Each "level" of an aggregation can -have many metrics or buckets. The "avg_price" metric tells us the average price -for each car color. This is independent of other buckets and metrics which -are also being built. - -This is very important for your application, since there are often many related, -but entirely distinct, metrics which you need to collect. Aggregations allow -you to collect all of them in a single pass over the data. - -The other important thing to note is that the aggregation we added, "make", is -a `terms` bucket (nested inside the "colors" `terms` bucket). This means we will -generate a (color, make) tuple for every unique combination in your dataset. - -Let's take a look at the response (truncated for brevity, since it is now -growing quite long): - - -[source,js] --------------------------------------------------- -{ -... 
- "aggregations": { - "colors": { - "buckets": [ - { - "key": "red", - "doc_count": 4, - "make": { <1> - "buckets": [ - { - "key": "honda", <2> - "doc_count": 3 - }, - { - "key": "bmw", - "doc_count": 1 - } - ] - }, - "avg_price": { - "value": 32500 <3> - } - }, - -... -} --------------------------------------------------- -<1> Our new aggregation is nested under each color bucket, as expected -<2> We now see a breakdown of car makes for each color -<3> Finally, you can see that our previous "avg_price" metric is still intact - -The response tells us: - -- There are four red cars -- The average price of a red car is $32,500 -- Three of the red cars are made by Honda, and one is a BMW -- Similar analytics are generated for other colors and makes - -==== One final modification - -Just to drive the point home, let's make one final modification to our example -before moving on to new topics. Let's add two metrics to calculate the min and -max price for each make: - - -[source,js] --------------------------------------------------- -GET /cars/transactions/_search?search_type=count -{ - "aggs": { - "colors": { - "terms": { - "field": "color" - }, - "aggs": { - "avg_price": { "avg": { "field": "price" } - }, - "make" : { - "terms" : { - "field" : "make" - }, - "aggs" : { <1> - "min_price" : { "min": { "field": "price"} }, <2> - "max_price" : { "max": { "field": "price"} } <3> - } - } - } - } - } -} --------------------------------------------------- -// SENSE: 300_Aggregations/20_basic_example.json - -// Careful with the "no surprise", it makes it sound like you're bored :) - -<1> No surprise...we need to add another "aggs" level for nesting -<2> Then we include a `min` metric -<3> And a `max` metric - -Which gives us the following output (again, truncated): - -[source,js] --------------------------------------------------- -{ -... 
- "aggregations": { - "colors": { - "buckets": [ - { - "key": "red", - "doc_count": 4, - "make": { - "buckets": [ - { - "key": "honda", - "doc_count": 3, - "min_price": { - "value": 10000 <1> - }, - "max_price": { - "value": 20000 <1> - } - }, - { - "key": "bmw", - "doc_count": 1, - "min_price": { - "value": 80000 - }, - "max_price": { - "value": 80000 - } - } - ] - }, - "avg_price": { - "value": 32500 - } - }, -... --------------------------------------------------- -<1> The `min` and `max` metrics that we added now appear under each "make" - -With those two buckets, we've expanded the information derived from this query -to include: - -// Nice, but "Similar analytics.." -> "etc."? -- There are four red cars -- The average price of a red car is $32,500 -- Three of the red cars are made by Honda, and one is a BMW -- The cheapest Honda is $10,000 -- The most expensive Honda is $20,000 -- Similar analytics are generated for all other colors and makes diff --git a/301_Aggregation_Overview.asciidoc b/301_Aggregation_Overview.asciidoc new file mode 100644 index 000000000..dbe91c89c --- /dev/null +++ b/301_Aggregation_Overview.asciidoc @@ -0,0 +1,7 @@ +[[aggregations]] + += Aggregations + +include::300_Aggregations/05_overview.asciidoc[] + +include::300_Aggregations/15_concepts_buckets.asciidoc[] \ No newline at end of file diff --git a/302_Example_Walkthrough.asciidoc b/302_Example_Walkthrough.asciidoc new file mode 100644 index 000000000..563cb5f96 --- /dev/null +++ b/302_Example_Walkthrough.asciidoc @@ -0,0 +1,10 @@ + +include::300_Aggregations/20_basic_example.asciidoc[] + +include::300_Aggregations/21_add_metric.asciidoc[] + +include::300_Aggregations/22_nested_bucket.asciidoc[] + +include::300_Aggregations/23_extra_metrics.asciidoc[] + +include::300_Aggregations/28_bucket_metric_list.asciidoc[] \ No newline at end of file diff --git a/303_Making_Graphs.asciidoc b/303_Making_Graphs.asciidoc new file mode 100644 index 000000000..8f355e77b --- /dev/null +++ 
b/303_Making_Graphs.asciidoc @@ -0,0 +1,9 @@ + + +include::300_Aggregations/30_histogram.asciidoc[] + +include::300_Aggregations/35_date_histogram.asciidoc[] + +include::300_Aggregations/40_scope.asciidoc[] + +include::300_Aggregations/45_filtering.asciidoc[] \ No newline at end of file diff --git a/304_Approximate_Aggregations.asciidoc b/304_Approximate_Aggregations.asciidoc new file mode 100644 index 000000000..30404ce4c --- /dev/null +++ b/304_Approximate_Aggregations.asciidoc @@ -0,0 +1 @@ +TODO \ No newline at end of file diff --git a/305_Significant_Terms.asciidoc b/305_Significant_Terms.asciidoc new file mode 100644 index 000000000..30404ce4c --- /dev/null +++ b/305_Significant_Terms.asciidoc @@ -0,0 +1 @@ +TODO \ No newline at end of file diff --git a/306_Practical_Considerations.asciidoc b/306_Practical_Considerations.asciidoc new file mode 100644 index 000000000..30404ce4c --- /dev/null +++ b/306_Practical_Considerations.asciidoc @@ -0,0 +1 @@ +TODO \ No newline at end of file diff --git a/book.asciidoc b/book.asciidoc index efc557e69..f4ba54d90 100644 --- a/book.asciidoc +++ b/book.asciidoc @@ -70,18 +70,29 @@ include::280_Suggesters.asciidoc[] // Part 4 +include::301_Aggregation_Overview.asciidoc[] + +include::302_Example_Walkthrough.asciidoc[] + +include::303_Making_Graphs.asciidoc[] + +include::304_Approximate_Aggregations.asciidoc[] + +include::305_Significant_Terms.asciidoc[] + +include::306_Practical_Considerations.asciidoc[] + +// Part 5 [[more-than-search]] = More than search (TODO) -include::300_Aggregations.asciidoc[] - include::310_Geolocation.asciidoc[] include::320_Percolation.asciidoc[] include::330_Scripting.asciidoc[] -// Part 5 +// Part 6 [[modelling-your-data]] @@ -95,7 +106,7 @@ include::420_Time_Based.asciidoc[] include::430_Index_Per_User.asciidoc[] -// Part 6 +// Part 7 [[administration]]