Reviewer notes. Text ahead of the first "diff --git" header is not part of the
patch and is ignored by `git apply` and `patch`.

* Line breaks and indentation were rebuilt from a whitespace-collapsed copy, so
  blank context lines and leading spaces are a best guess. If a hunk is
  rejected, retry with `git apply --ignore-whitespace`.
* lib/complex_predictor.rb: the `index` header was dropped because the
  post-image changed and its blob hash is no longer known.
* train!: a book with no "by" token raised NoMethodError (nil + 1); it is now
  skipped. Scratch values are locals instead of instance variables, and the
  commented-out experiments are gone.
* predict: returns nil instead of raising when @data is empty.
* Still open: `require 'pry-byebug'` is loaded but nothing in this patch calls
  it, and `run!(SimplePredictor)` is only commented out. Confirm both are meant
  to ship.

diff --git a/gutenberg.rb b/gutenberg.rb
index 84d20f6..e955c85 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -23,6 +23,6 @@ def run!(predictor_klass, opts={})
   puts "Accuracy: #{accuracy}"
 end
 
-run!(SimplePredictor)
+#run!(SimplePredictor)
 run!(ComplexPredictor, debug: true)
 
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -1,4 +1,5 @@
 require_relative 'predictor'
+require 'pry-byebug'
 
 class ComplexPredictor < Predictor
   # Public: Trains the predictor on books in our dataset. This method is called
@@ -6,7 +7,27 @@ class ComplexPredictor < Predictor
   #
   # Returns nothing.
   def train!
+    @data = {}
 
+    @all_books.each do |category, books|
+      word_counts = Hash.new(0)
+      authors = []
+
+      books.each do |_filename, tokens|
+        tokens.each do |word|
+          word_counts[word] += 1 if good_token?(word) && word.length > 7
+        end
+
+        # The author is taken to be the two tokens after the first "by".
+        # A book with no "by" token is skipped instead of raising on nil.
+        author_start = tokens.find_index("by")
+        authors << tokens[(author_start + 1)..(author_start + 2)] if author_start
+      end
+
+      # Keep the 101 most frequent long words (TODO: confirm 100 was not meant).
+      ranked = word_counts.sort_by { |_word, count| count }.reverse
+      @data[category] = { uniq_words: ranked.first(101).map(&:first), authors: authors }
+    end
   end
 
   # Public: Predicts category.
@@ -17,4 +38,6 @@ def train!
   def predict(tokens)
-    # Always predict astronomy, for now.
-    :astronomy
+    # Pick the category whose frequent words overlap most with tokens.
+    # An empty @data (no categories trained) yields nil.
+    best = @data.max_by { |_category, stats| (stats[:uniq_words] & tokens).length }
+    best && best.first
   end