forked from infochimps-away/wukong
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_count.rb
executable file
·75 lines (69 loc) · 1.96 KB
/
word_count.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'
module WordCount
class Mapper < Wukong::Streamer::LineStreamer
  #
  # Break a string into a list of lowercase words.
  #
  # Deliberately naive:
  # * the text is lowercased first
  # * any run of characters other than ASCII letters, digits and
  #   apostrophes (so '_' and '-' as well) becomes a word break
  # * an apostrophe is kept only when it follows a word character and
  #   precedes a word-final s, t or d ("man's", "don't", "he'd");
  #   every other apostrophe becomes a word break
  #
  #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
  #   # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
  #
  # Returns an empty array when +str+ is blank.
  # NOTE(review): +blank?+ is not core Ruby; presumably it is supplied by
  # the wukong gem's extensions -- confirm.
  #
  def tokenize(str)
    return [] if str.blank?

    lowered = str.downcase
    # Everything that is not a letter, digit or apostrophe turns into a space.
    cleaned = lowered.gsub(/[^a-zA-Z0-9\']+/, ' ')
    # Hide the apostrophes worth keeping behind a '!' placeholder; the
    # previous step already removed every literal '!', so it is unambiguous.
    shielded = cleaned.gsub(/(\w)\'([std])\b/, '\1!\2')
    # Blank out the remaining apostrophes, then restore the hidden ones.
    restored = shielded.gsub(/\'/, ' ').gsub(/!/, "'")
    # Split on whitespace; leading whitespace leaves an empty first piece,
    # so blank pieces are filtered out.
    restored.split(/\s+/).reject { |piece| piece.blank? }
  end

  #
  # Yield a [word, 1] pair for every word found in the line.
  #
  def process(line)
    tokenize(line).each do |word|
      yield [word, 1]
    end
  end
end
#
# You can stack up all the values in a list then sum them at once.
#
# This isn't good style, as it means the whole list is held in memory
#
class Reducer1 < Wukong::Streamer::ListReducer
  # Yield [key, total], where total is the sum of the last field of every
  # collected record, each converted with to_i.
  #
  # +key+ and +values+ are not defined in this class; presumably they are
  # provided by ListReducer -- confirm against the gem.
  def finalize
    total = values.inject(0) { |sum, record| sum + record.last.to_i }
    yield [key, total]
  end
end
#
# A bit kinder to your memory manager: keep a running count, one record at a time:
#
class Reducer2 < Wukong::Streamer::AccumulatingReducer
  # Reset the running count to zero. The arguments are ignored.
  # NOTE(review): presumably invoked by AccumulatingReducer when a new key
  # begins -- confirm against the gem.
  def start!(*args)
    @key_count = 0
  end

  # Add one to the running count. The arguments are ignored, so every call
  # counts as exactly one occurrence regardless of the record's contents.
  def accumulate(*args)
    @key_count += 1
  end

  # Yield [key, count] using the count gathered so far.
  def finalize
    yield [key, @key_count]
  end
end
#
# ... easiest of all, though: this is common enough that it's already included
#
require 'wukong/streamer/count_keys'
# Adds nothing of its own: all behaviour is inherited from
# Wukong::Streamer::CountKeys.
class Reducer3 < Wukong::Streamer::CountKeys; end
end
# Execute the script: hand the mapper and a reducer class to Wukong.
#
# The WordCount module defines Reducer1, Reducer2 and Reducer3 but no plain
# `Reducer`, so referencing WordCount::Reducer raised NameError
# (uninitialized constant) before the job could start. Reducer3 is used
# here; either of the other two reducer classes can be substituted.
Wukong.run(
  WordCount::Mapper,
  WordCount::Reducer3
)