From 08bfbc8ee6a14133f202cdb4e4ae837bbe9087ef Mon Sep 17 00:00:00 2001 From: Matteo Wiley Date: Wed, 3 Sep 2014 21:08:48 -0500 Subject: [PATCH 1/3] This organizes the data into unique words and author --- gutenberg.rb | 2 +- lib/complex_predictor.rb | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..e955c85 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,6 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +#run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..431c63f 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,4 +1,5 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor # Public: Trains the predictor on books in our dataset. This method is called @@ -6,7 +7,34 @@ class ComplexPredictor < Predictor # # Returns nothing. def train! + @data = {} + + @all_books.each do |category, books| + @data[category] = { + uniq_words: Hash.new(0), + authors: [] + } + + books.each do |filename, token| + token.each do |word| + if good_token?(word) && word.length > 5 + @data[category][:uniq_words][word] += 1 + end + @author_start = token.find_index("by") + @data[category][:authors] << token[(@author_start + 1)..(@author_start+2)] + end + end + @data[category][:uniq_words] = @data[category][:uniq_words].sort_by {|key, value| value}.reverse[0..14] + binding.pry + end + + #@date = { + #unq_word: [] + #book: + # + # + #} end # Public: Predicts category. 
From 0c8225ef0bdf199c241691d6c448ec2c309c20d3 Mon Sep 17 00:00:00 2001 From: Matteo Wiley Date: Thu, 4 Sep 2014 09:56:33 -0500 Subject: [PATCH 2/3] This runs with a 96% accuracy but at 83 seconds --- lib/complex_predictor.rb | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 431c63f..00ff986 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -21,13 +21,16 @@ def train! if good_token?(word) && word.length > 5 @data[category][:uniq_words][word] += 1 end + end @author_start = token.find_index("by") @data[category][:authors] << token[(@author_start + 1)..(@author_start+2)] end - end - @data[category][:uniq_words] = @data[category][:uniq_words].sort_by {|key, value| value}.reverse[0..14] - binding.pry + @data[category][:uniq_words] = @data[category][:uniq_words].sort_by {|key, value| value}.reverse[0..14] + + # binding.pry + end + #@date = { #unq_word: [] @@ -43,8 +46,18 @@ def train! # # Returns a category. def predict(tokens) - # Always predict astronomy, for now. - :astronomy + @match = Hash.new(0) + + @data.each do |category, uniq| + tokens.each do |word| + uniq[:uniq_words].each do |uni_w| + if uni_w.include?(word) + @match[category] += 1 + end + end + end + end + @match.max_by {|category, count| count }.first end end From 98dfbcc49deee6eca8deb1359a9d3bbe17ebd017 Mon Sep 17 00:00:00 2001 From: Matteo Wiley Date: Thu, 4 Sep 2014 10:43:46 -0500 Subject: [PATCH 3/3] This takes it up to 100% and down to 5 secs --- lib/complex_predictor.rb | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 00ff986..2b55a08 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -18,17 +18,21 @@ def train! 
books.each do |filename, token| token.each do |word| - if good_token?(word) && word.length > 5 + if good_token?(word) && word.length > 7 @data[category][:uniq_words][word] += 1 end end @author_start = token.find_index("by") @data[category][:authors] << token[(@author_start + 1)..(@author_start+2)] end - @data[category][:uniq_words] = @data[category][:uniq_words].sort_by {|key, value| value}.reverse[0..14] + @new_hash = Hash.new(0) + @most_pop = @data[category][:uniq_words].sort_by {|key, value| value}.reverse[0..100] + @most_pop.map! {|word, count| word} + # @most_pop.each do |x| + # @new_hash[x.first] = x.last + # end + @data[category][:uniq_words] = @most_pop - # binding.pry - end @@ -47,17 +51,11 @@ def train! # Returns a category. def predict(tokens) @match = Hash.new(0) - @data.each do |category, uniq| - tokens.each do |word| - uniq[:uniq_words].each do |uni_w| - if uni_w.include?(word) - @match[category] += 1 - end - end - end + @match[category] = (@data[category][:uniq_words] & tokens).length end @match.max_by {|category, count| count }.first - end + end + end