From 652a28c4c3e42a33fbe9e97e30dfb8ed4d26fab5 Mon Sep 17 00:00:00 2001 From: Nick Damiano Date: Wed, 3 Sep 2014 23:58:57 -0500 Subject: [PATCH 1/4] added code to complex predictor to sort each book by top 100 frequently used words and compare that to the unknown books to see category. Right now I have two arrays of words and I'm going to subtract them and then see which count is the smallest. the smallest count is the one with the most matching words and the right category. --- lib/complex_predictor.rb | 49 +++++++++++++++++++++++++++++++++++++++- lib/simple_predictor.rb | 4 +--- lib/test.rb | 19 ++++++++++++++++ 3 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 lib/test.rb diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..56dd7fe 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,12 +1,34 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor # Public: Trains the predictor on books in our dataset. This method is called # before the predict() method is called. # # Returns nothing. + #This algorithm gets the top 100 most frequent words above 7 letters that occur across all books + #for the given subject. Next, it compares the words def train! - @data = {} + @data= {} + @words = Hash.new(0) + @all_books.each do |category, books| + books.each do |filename, words| + words.each do |word| + if word.length > 6 + @words[word] += 1 + end + end + words_freq = @words.sort_by{|key, value| value}[-100..-1] + words_only = words_freq.map{|pair| pair[0]} + @data[category] = words_only + end + end + binding.pry + puts "stop" +#create a hash with the word frequency for each one +#check to see how many top 20 words match. just like the min one, the closest match or the one with the most +#similar terms gets selected + end # Public: Predicts category. @@ -15,6 +37,31 @@ def train! # # Returns a category. def predict(tokens) + minimum_category = nil + minimum_difference = 999999999 + @token_words = Hash.new(0) + tokens.each do |token| + if token.length > 6 + @token_words[token] +=1 + end + end + sorted_token_words = @token_words.sort_by{|key, value| value}[-100..-1] + only_words = sorted_token_words.map{|pair| pair[0]} + @difference = nil + @data.each do |category, word_list| + + end + minimum_category = category + minimum_distance = difference + end + + binding.pry + #compare sorted_token_words to each @data[:category] hash to see how many words match + #calculate matching words or missing words number and store that as minimum difference + #store the subject as minimum subject + #do the next category and if the minimum difference is larger swap out values + #return the minimum subject + # # Always predict astronomy, for now. :astronomy end diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb index 6b93003..fab8c0f 100644 --- a/lib/simple_predictor.rb +++ b/lib/simple_predictor.rb @@ -25,14 +25,13 @@ def train! # } # } @data = {} - @all_books.each do |category, books| @data[category] = { words: 0, books: 0 } books.each do |filename, tokens| - @data[category][:words] += tokens.count + @data[category][:words] += tokens.length @data[category][:books] += 1 end end @@ -53,7 +52,6 @@ def predict(tokens) minimum_category = nil minimum_distance = 999999999999 - @data.each do |category, counts| average_words_per_book = counts[:words].to_f / counts[:books] difference = (tokens.count - average_words_per_book).abs diff --git a/lib/test.rb b/lib/test.rb new file mode 100644 index 0000000..ce8fe1f --- /dev/null +++ b/lib/test.rb @@ -0,0 +1,19 @@ +# win_loss = %W{win loss loss win win win FART FART FART FART FART FART BUTTZ} +# p win_loss +# hashbrowns = Hash.new(0) +# win_loss.each do |item| +# hashbrowns[item] +=1 +# end + +# new = hashbrowns.sort_by{|k,v| v } + +# p words + +a =[1,2,3,4,5,6,7,8] +b =[5,6,7,8,9,10,11] + +x = a-b +p x +#so when I subtract, the new array is unmatched items from the arrays. +#so the new array.length - smaller number the better. if it's smaller than the variable stow it because it +#means that there are more words in common! OH GOD IT'S HAPPENING!!! RUBY IN MY BRAIN. From 02ab14a6876cfecfb6009e56fb9f29d3be7eb223 Mon Sep 17 00:00:00 2001 From: Nick Damiano Date: Thu, 4 Sep 2014 00:32:45 -0500 Subject: [PATCH 2/4] finished my code to test comparisons between top 100 occurring words over 6 characters with a 73% accuracy, prediction time of 5.08 seconds, and a training time of 4.75 seconds --- lib/complex_predictor.rb | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 56dd7fe..7c1cf83 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -23,8 +23,6 @@ def train! @data[category] = words_only end end - binding.pry - puts "stop" #create a hash with the word frequency for each one #check to see how many top 20 words match. just like the min one, the closest match or the one with the most #similar terms gets selected @@ -37,8 +35,8 @@ def train! # # Returns a category. def predict(tokens) - minimum_category = nil - minimum_difference = 999999999 + @minimum_category = nil + @minimum_distance = 999999999 @token_words = Hash.new(0) tokens.each do |token| if token.length > 6 @@ -49,13 +47,18 @@ def predict(tokens) only_words = sorted_token_words.map{|pair| pair[0]} @difference = nil @data.each do |category, word_list| - - end - minimum_category = category - minimum_distance = difference + @difference = @data[category] - only_words + difference_count = @difference.count + if difference_count < @minimum_distance + @minimum_category = category + @minimum_distance = difference_count + #binding.pry end + end + @minimum_category + end +end - binding.pry #compare sorted_token_words to each @data[:category] hash to see how many words match #calculate matching words or missing words number and store that as minimum difference #store the subject as minimum subject @@ -63,7 +66,3 @@ def predict(tokens) #return the minimum subject # # Always predict astronomy, for now. - :astronomy - end -end - From d2703825a10133296c66226d035a3ea026fa3ce3 Mon Sep 17 00:00:00 2001 From: Nick Damiano Date: Thu, 4 Sep 2014 10:22:56 -0500 Subject: [PATCH 3/4] added code to check for subject name in first 5000 words and still at 73% --- gutenberg.rb | 2 +- lib/complex_predictor.rb | 25 ++++++++++++++++++++++--- lib/test.rb | 11 +++++++---- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..e955c85 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,6 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +#run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 7c1cf83..2d41f0a 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -9,6 +9,8 @@ class ComplexPredictor < Predictor #This algorithm gets the top 100 most frequent words above 7 letters that occur across all books #for the given subject. Next, it compares the words def train! + @subjects = [] + @points = Hash.new(0) @data= {} @words = Hash.new(0) @all_books.each do |category, books| @@ -16,8 +18,12 @@ def train! words.each do |word| if word.length > 6 @words[word] += 1 + #binding.pry end + end + @subjects.push([filename.split("/")[2]]) + @points[category]=0 words_freq = @words.sort_by{|key, value| value}[-100..-1] words_only = words_freq.map{|pair| pair[0]} @data[category] = words_only @@ -34,10 +40,18 @@ def train! # tokens - A list of tokens (words). # # Returns a category. + + #create a new hash with the subject and a corresponding points value field + #run the first test and assign a point to the hash for the proper one + #search the first 500 words for a word match to the subject, which is found inside the file name + #if there is a match, assign a point to the new hash + #additionally you can run the simple version to try to add another point def predict(tokens) @minimum_category = nil @minimum_distance = 999999999 @token_words = Hash.new(0) + @points = Hash.new(0) + #gets the top 100 words and compares the difference tokens.each do |token| if token.length > 6 @token_words[token] +=1 @@ -50,12 +64,17 @@ def predict(tokens) @difference = @data[category] - only_words difference_count = @difference.count if difference_count < @minimum_distance - @minimum_category = category + @points[category] += 1 @minimum_distance = difference_count - #binding.pry end + @subjects.each{|subject| if tokens[0..5000].include?(subject) then @points[category]+=1 end} + end - @minimum_category + # binding.pry + p "stop" + #iterate through points to get the highest value and pass the key ********** + cat = @points.sort_by{|k,v| v}[-1][0] + cat end end diff --git a/lib/test.rb b/lib/test.rb index ce8fe1f..82d51a5 100644 --- a/lib/test.rb +++ b/lib/test.rb @@ -9,11 +9,14 @@ # p words -a =[1,2,3,4,5,6,7,8] -b =[5,6,7,8,9,10,11] +# a =[1,2,3,4,5,6,7,8] +# b =[5,6,7,8,9,10,11] -x = a-b -p x +# x = a-b +# p x.count #so when I subtract, the new array is unmatched items from the arrays. #so the new array.length - smaller number the better. if it's smaller than the variable stow it because it #means that there are more words in common! OH GOD IT'S HAPPENING!!! RUBY IN MY BRAIN. + +stuff = {nick: 3, robert: 1, mike: 7} +p stuff.sort_by{|k,v| v}[-1][0] From d0b3d97f25d8749646df25ec2e24be29121ce157 Mon Sep 17 00:00:00 2001 From: Nick Damiano Date: Thu, 4 Sep 2014 11:03:32 -0500 Subject: [PATCH 4/4] removed the section that searches through the words for subject word. Changed the comparison arrays so that the known subject array top words is 2500 instead of 100. 96% --- lib/complex_predictor.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 2d41f0a..8738537 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -24,7 +24,7 @@ def train! end @subjects.push([filename.split("/")[2]]) @points[category]=0 - words_freq = @words.sort_by{|key, value| value}[-100..-1] + words_freq = @words.sort_by{|key, value| value}[-2000..-1] words_only = words_freq.map{|pair| pair[0]} @data[category] = words_only end @@ -67,11 +67,15 @@ def predict(tokens) @points[category] += 1 @minimum_distance = difference_count end - @subjects.each{|subject| if tokens[0..5000].include?(subject) then @points[category]+=1 end} + #binding.pry + #this seemed like a neat idea but does not impact anything + # @subjects.each do |subject| + # if only_words.include?(subject) + # @points[category]+=1 end + # end end # binding.pry - p "stop" #iterate through points to get the highest value and pass the key ********** cat = @points.sort_by{|k,v| v}[-1][0] cat