forked from dotnet/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStructuredNetworkWordCountWindowed.fs
66 lines (57 loc) · 2.58 KB
/
StructuredNetworkWordCountWindowed.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples.Sql.Streaming
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
type StructuredNetworkWordCountWindowed() =
    /// <summary>
    /// Reads lines from a socket source, splits them into words and prints the
    /// per-window word counts to the console until the streaming query terminates.
    /// </summary>
    /// <param name="args">
    /// <c>hostname port windowSeconds [slideSeconds]</c>. When the slide duration
    /// is omitted it defaults to the window duration.
    /// </param>
    /// <returns>
    /// 0 once the streaming query has terminated; 1 when the arguments do not match
    /// the expected shape or the slide duration exceeds the window duration.
    /// </returns>
    member this.Run(args : string[]) =
        match args with
        | ([| hostname; portStr; windowSizeStr |] | [| hostname; portStr; windowSizeStr; _ |]) ->
            // Numeric arguments are converted with `int64`; the port is passed to
            // the socket source as a 64-bit option value.
            let port = portStr |> int64
            let windowSize = windowSizeStr |> int64
            let slideSize = if (args.Length = 3) then windowSize else (args.[3] |> int64)

            if (slideSize > windowSize) then
                // The slide must not exceed the window, so stop here with a
                // failure code instead of starting a query with invalid durations.
                printfn "<slide duration> must be less than or equal to <window duration>"
                1
            else
                let windowDuration = sprintf "%d seconds" windowSize
                let slideDuration = sprintf "%d seconds" slideSize

                let spark =
                    SparkSession.Builder().AppName("StructuredNetworkWordCountWindowed").GetOrCreate()

                // Socket source; "includeTimestamp" adds the "timestamp" column
                // that the windowing below relies on.
                let lines =
                    spark.ReadStream()
                        .Format("socket")
                        .Option("host", hostname)
                        .Option("port", port)
                        .Option("includeTimestamp", true)
                        .Load()

                // One output row per space-separated word, keeping the line's timestamp.
                let word = Functions.Explode(Functions.Split(lines.["value"], " ")).Alias("word")
                let words = lines.Select(word, lines.["timestamp"])

                // Count the words per (window, word) pair and order the result by window.
                let window = Functions.Window(words.["timestamp"], windowDuration, slideDuration)
                let windowedCounts =
                    words.GroupBy(window, words.["word"])
                        .Count()
                        .OrderBy("window")

                let query =
                    windowedCounts.WriteStream()
                        .OutputMode("complete")
                        .Format("console")
                        .Option("truncate", false)
                        .Start()

                // Blocks until the streaming query terminates.
                query.AwaitTermination()
                0
        | _ ->
            printfn "Usage: StructuredNetworkWordCountWindowed \
                     <hostname> <port> <window duration in seconds> \
                     [<slide duration in seconds>]"
            1

    interface IExample with
        /// Forwards to the class-level Run member.
        member this.Run (args) = this.Run (args)