diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/gutenberg.rb b/gutenberg.rb
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -24,5 +24,4 @@ def run!(predictor_klass, opts={})
 end
 
 run!(SimplePredictor)
 run!(ComplexPredictor, debug: true)
-
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -6,7 +6,37 @@ class ComplexPredictor < Predictor
   #
   # Returns nothing.
   def train!
-    @data = {}
+    # Words excluded from keyword selection. Presumably these topped the raw
+    # counts (e.g. "footnote", "things", "gutenberg") without identifying a category.
+    bad_words = %w[footnote things gutenberg project illustration]
+
+    # category => { words: total token count, books: number of books }.
+    # predict uses this to compute the average book length per category.
+    @length_data = {}
+    # category => the most frequent token longer than 5 characters that is not
+    # in bad_words, or nil when the category has no such token.
+    @data = {}
+
+    @all_books.each do |category, books|
+      @length_data[category] = { words: 0, books: 0 }
+      frequencies = Hash.new(0)
+
+      books.each do |_filename, tokens|
+        @length_data[category][:words] += tokens.count
+        @length_data[category][:books] += 1
+        tokens.each do |token|
+          frequencies[token] += 1 if token.length > 5 && !bad_words.include?(token)
+        end
+      end
+
+      # max_by returns nil for an empty hash; destructuring nil leaves top_word
+      # nil instead of raising NoMethodError on nil[0].
+      top_word, _count = frequencies.max_by { |_word, count| count }
+      @data[category] = top_word
+    end
+
+    # The keyword table is also the method's return value.
+    @data
   end
 
   # Public: Predicts category.
@@ -15,8 +45,39 @@ def train!
   #
   # Returns a category.
   def predict(tokens)
-    # Always predict astronomy, for now.
-    :astronomy
+    # Scores every category and returns the best one. Two signals are combined:
+    #   * one point for every token equal to the category's keyword (picked by
+    #     train!) or to the category's own name, and
+    #   * two bonus points for the category whose average book length is
+    #     closest to this book's length.
+    #
+    # Example for the length signal: say the average Archeology book has 50
+    # words and the average Philosophy book has 100 words. A book with 120
+    # words gives the bonus to Philosophy, since 120 is closer to 100 than to 50.
+    book_length = tokens.count
+
+    # Skip categories without books so the average never divides by zero.
+    lengths = @length_data.reject { |_category, counts| counts[:books].zero? }
+    closest_category, _counts = lengths.min_by do |_category, counts|
+      (book_length - counts[:words].to_f / counts[:books]).abs
+    end
+
+    best_category = nil
+    best_score = 0
+
+    @data.each do |category, word|
+      name = category.to_s
+      score = tokens.count { |token| token == word || token == name }
+      score += 2 if category == closest_category
+
+      # Strict comparison: the earliest category wins ties, and a category that
+      # scores zero is never chosen.
+      if score > best_score
+        best_category = category
+        best_score = score
+      end
+    end
+
+    best_category
   end
 end
-
diff --git a/lib/complex_predictor2.rb b/lib/complex_predictor2.rb
new file mode 100644
--- /dev/null
+++ b/lib/complex_predictor2.rb
@@ -0,0 +1,47 @@
+require_relative 'predictor'
+
+class ComplexPredictor < Predictor
+  # Public: Trains the predictor on books in our dataset. This method is called
+  # before the predict() method is called.
+  #
+  # For every category, remembers the most frequent token that is longer than
+  # 5 characters (nil when the category has no such token).
+  #
+  # Returns nothing.
+  def train!
+    @data = {}
+
+    @all_books.each do |category, books|
+      frequencies = Hash.new(0)
+      books.each do |_filename, tokens|
+        tokens.each { |token| frequencies[token] += 1 if token.length > 5 }
+      end
+      # max_by returns nil for an empty hash; destructuring nil leaves top_word
+      # nil instead of raising NoMethodError on nil[0].
+      top_word, _count = frequencies.max_by { |_word, count| count }
+      @data[category] = top_word
+    end
+  end
+
+  # Public: Predicts category.
+  #
+  # tokens - A list of tokens (words).
+  #
+  # Returns the category whose keyword from train! occurs most often in
+  # tokens, or nil when no keyword occurs at all.
+  def predict(tokens)
+    best_category = nil
+    best_count = 0
+
+    @data.each do |category, word|
+      # Count occurrences in the book being predicted, not in the training set.
+      occurrences = tokens.count { |token| token == word }
+      if occurrences > best_count
+        best_category = category
+        best_count = occurrences
+      end
+    end
+
+    best_category
+  end
+end
diff --git a/lib/complex_predictor3.rb b/lib/complex_predictor3.rb
new file mode 100644
index 0000000..b8921f3
--- /dev/null
+++ b/lib/complex_predictor3.rb
@@ -0,0 +1,22 @@
+require_relative 'predictor'
+
+class ComplexPredictor < Predictor
+  # Public: Trains the predictor on books in our dataset. This method is called
+  # before the predict() method is called.
+  #
+  # Returns nothing.
+  def train!
+    @data = {}
+  end
+
+  # Public: Predicts category.
+  #
+  # tokens - A list of tokens (words).
+  #
+  # Returns a category.
+  def predict(tokens)
+    # Always predict astronomy, for now.
+    :astronomy
+  end
+end
+
diff --git a/lib/predictor.rb b/lib/predictor.rb
index d544f81..b4304c4 100644
--- a/lib/predictor.rb
+++ b/lib/predictor.rb
@@ -124,3 +124,4 @@ def load_books(dataset, opts={})
   end
 end
 
+
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
index 6b93003..765541c 100644
--- a/lib/simple_predictor.rb
+++ b/lib/simple_predictor.rb
@@ -68,3 +68,5 @@ def predict(tokens)
   end
 end
 
+
+
diff --git a/spec/gutenberg_spec.rb b/spec/gutenberg_spec.rb
new file mode 100644
index 0000000..a950747
--- /dev/null
+++ b/spec/gutenberg_spec.rb
@@ -0,0 +1,4 @@
+#require this file in your spec files to help DRY up your tests
+require 'rspec'
+require 'pry-byebug'
+require_relative '../gutenberg.rb'