From 23a03ee15dbb96335f79292ea47a1a013e690bdc Mon Sep 17 00:00:00 2001 From: Travis Hooper <thooper41@gmail.com> Date: Wed, 3 Sep 2014 21:53:54 -0500 Subject: [PATCH 1/2] By golly 53% accuracy in 3.4 sec isn't too bad quit exit end git --- Gemfile | 5 ++++ gutenberg.rb | 3 +- lib/complex_predictor.rb | 40 ++++++++++++++++++++++++-- lib/complex_predictor2.rb | 60 +++++++++++++++++++++++++++++++++++++++ lib/complex_predictor3.rb | 22 ++++++++++++++ lib/predictor.rb | 1 + lib/simple_predictor.rb | 2 ++ spec/gutenberg_spec.rb | 4 +++ 8 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 Gemfile create mode 100644 lib/complex_predictor2.rb create mode 100644 lib/complex_predictor3.rb create mode 100644 spec/gutenberg_spec.rb diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..1f47892 --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' +ruby '2.0.0' + +gem 'rspec', '~> 2.14.1' +gem 'pry-byebug' diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..de4730d 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -24,5 +24,4 @@ def run!(predictor_klass, opts={}) end run!(SimplePredictor) -run!(ComplexPredictor, debug: true) - +run!(ComplexPredictor, debug: true) \ No newline at end of file diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..c10846c 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -7,6 +7,22 @@ class ComplexPredictor < Predictor # Returns nothing. def train! @data = {} + + @all_books.each do |category, books| + @data[category] = Hash.new(0) + books.each do |filename, tokens| + tokens.each {|token| @data[category][token] += 1 if token.length > 5} + end + @data[category] = @data[category].max_by {|k,v| v}[0] + end + # raise "You must implement Predictor#train!." + + # Public: Predicts category. + + # tokens - A list of tokens (words). + + # Returns a category. + @data end # Public: Predicts category. @@ -15,8 +31,28 @@ def train! # # Returns a category. def predict(tokens) - # Always predict astronomy, for now. - :astronomy + # # Find the category that has the most similar word-count. + # # + # # Example: Say the average Archeology book has 50 words and the average + # # Philosophy book has 100 words. Then say we must predict some book with 120 + # # words. In this case, we will predict Philosophy, since 120 is closer to 100 + # # than it is to 50. + counter2 = 0 + max_occurrences = 0 + the_category = nil + counter = Hash.new + + @data.each do |category, word| + counter[category] = 0 + tokens.each {|token| counter[category] += 1 if token == word} + if max_occurrences < counter[category] + max_occurrences = counter[category] + the_category = category + end + end + counter2 += 1 + the_category + # # raise "You must implement Predictor#predict." end end diff --git a/lib/complex_predictor2.rb b/lib/complex_predictor2.rb new file mode 100644 index 0000000..0598640 --- /dev/null +++ b/lib/complex_predictor2.rb @@ -0,0 +1,60 @@ +require_relative 'predictor' + +class ComplexPredictor < Predictor + # Public: Trains the predictor on books in our dataset. This method is called + # before the predict() method is called. + # + # Returns nothing. + def train! + @data = {} + + @all_books.each do |category, books| + @data[category] = Hash.new(0) + books.each do |filename, tokens| + tokens.each {|token| @data[category][token] += 1 if token.length > 5} + end + @data[category] = @data[category].max_by {|k,v| v}[0] + end + # raise "You must implement Predictor#train!." + + # Public: Predicts category. + + # tokens - A list of tokens (words). + + # Returns a category. + end + + # Public: Predicts category. + # + # tokens - A list of tokens (words). + # + # Returns a category. + def predict(tokens) + # # Find the category that has the most similar word-count. + # # + # # Example: Say the average Archeology book has 50 words and the average + # # Philosophy book has 100 words. Then say we must predict some book with 120 + # # words. In this case, we will predict Philosophy, since 120 is closer to 100 + # # than it is to 50. + max_occurrences = 0 + the_category = nil + counter = Hash.new + + @all_books.each do |cat, books| + @data.each do |category, word| + books.each do |filename, tokens| + counter[category] = 0 + tokens.each {|token| counter[category] += 1 if token == word} + if max_occurrences < counter[category] + max_occurrences = counter[category] + the_category = category + end + end + end + end + + the_category + # # raise "You must implement Predictor#predict." + end +end + diff --git a/lib/complex_predictor3.rb b/lib/complex_predictor3.rb new file mode 100644 index 0000000..b8921f3 --- /dev/null +++ b/lib/complex_predictor3.rb @@ -0,0 +1,22 @@ +require_relative 'predictor' + +class ComplexPredictor < Predictor + # Public: Trains the predictor on books in our dataset. This method is called + # before the predict() method is called. + # + # Returns nothing. + def train! + @data = {} + end + + # Public: Predicts category. + # + # tokens - A list of tokens (words). + # + # Returns a category. + def predict(tokens) + # Always predict astronomy, for now. + :astronomy + end +end + diff --git a/lib/predictor.rb b/lib/predictor.rb index d544f81..b4304c4 100644 --- a/lib/predictor.rb +++ b/lib/predictor.rb @@ -124,3 +124,4 @@ def load_books(dataset, opts={}) end end + diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb index 6b93003..765541c 100644 --- a/lib/simple_predictor.rb +++ b/lib/simple_predictor.rb @@ -68,3 +68,5 @@ def predict(tokens) end end + + diff --git a/spec/gutenberg_spec.rb b/spec/gutenberg_spec.rb new file mode 100644 index 0000000..a950747 --- /dev/null +++ b/spec/gutenberg_spec.rb @@ -0,0 +1,4 @@ +#require this file in your spec files to help DRY up your tests +require 'rspec' +require 'pry-byebug' +require_relative '../gutenberg.rb' From 45a9c016a42295f0f6153f35b0f97e8c064cce12 Mon Sep 17 00:00:00 2001 From: Travis Hooper <thooper41@gmail.com> Date: Thu, 4 Sep 2014 00:53:34 -0500 Subject: [PATCH 2/2] working file, correctly determines all books in ~6 sec. --- lib/complex_predictor.rb | 59 ++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index c10846c..47be45c 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -6,18 +6,33 @@ class ComplexPredictor < Predictor # # Returns nothing. def train! - @data = {} + + @length_data = {} + + @all_books.each do |category, books| + @length_data[category] = { + words: 0, + books: 0 + } + books.each do |filename, tokens| + @length_data[category][:words] += tokens.count + @length_data[category][:books] += 1 + end + end + + @data = {} # {:astronomy=>"footnote", :philosophy=>"things", :religion=>"christ", :archeology=>"gutenberg"} + @length = {} + bad_words = ['footnote', 'things', 'gutenberg', 'project', 'illustration'] + @all_books.each do |category, books| @data[category] = Hash.new(0) books.each do |filename, tokens| - tokens.each {|token| @data[category][token] += 1 if token.length > 5} + tokens.each {|token| @data[category][token] += 1 if token.length > 5 && bad_words.include?(token) != true } end @data[category] = @data[category].max_by {|k,v| v}[0] end # raise "You must implement Predictor#train!." - - # Public: Predicts category. # tokens - A list of tokens (words). @@ -37,22 +52,36 @@ def predict(tokens) # # Philosophy book has 100 words. Then say we must predict some book with 120 # # words. In this case, we will predict Philosophy, since 120 is closer to 100 # # than it is to 50. - counter2 = 0 max_occurrences = 0 the_category = nil counter = Hash.new - @data.each do |category, word| - counter[category] = 0 - tokens.each {|token| counter[category] += 1 if token == word} - if max_occurrences < counter[category] - max_occurrences = counter[category] - the_category = category - end + minimum_category = nil + minimum_distance = 999999999999 + + @length_data.each do |category, counts| + average_words_per_book = counts[:words].to_f / counts[:books] + difference = (tokens.count - average_words_per_book).abs + + if difference < minimum_distance + minimum_category = category + minimum_distance = difference + end + end + + + + @data.each do |category, word| + counter[category] = 0 + tokens.each {|token| counter[category] += 1 if word == token || category.to_s == token} + if minimum_category == category + counter[category] += 2 + end + if max_occurrences.to_f < counter[category] + max_occurrences = counter[category] + the_category = category end - counter2 += 1 + end the_category - # # raise "You must implement Predictor#predict." end end -