Gutenberg - Nick Damiano #68

Open · wants to merge 4 commits into base: gutenberg
2 changes: 1 addition & 1 deletion gutenberg.rb
@@ -23,6 +23,6 @@ def run!(predictor_klass, opts={})
puts "Accuracy: #{accuracy}"
end

#run!(SimplePredictor)
run!(ComplexPredictor, debug: true)

75 changes: 72 additions & 3 deletions lib/complex_predictor.rb
@@ -1,22 +1,91 @@
require_relative 'predictor'
require 'pry-byebug'

class ComplexPredictor < Predictor
# Public: Trains the predictor on books in our dataset. This method is called
# before the predict() method is called.
#
# Returns nothing.
# This algorithm builds a list of the most frequent long words (7+ letters)
# that occur across all books for each subject. Prediction then compares an
# unknown book's top words against each subject's list and picks the closest match.
def train!
@subjects = []
@points = Hash.new(0)
@data = {}
@words = Hash.new(0)
@all_books.each do |category, books|
  @words = Hash.new(0) # reset per category so counts don't leak between subjects
  books.each do |filename, words|
    words.each do |word|
      @words[word] += 1 if word.length > 6
    end
    @subjects.push(filename.split("/")[2])
  end
  @points[category] = 0
  # keep the (up to) 2000 most frequent long words for this category
  words_freq = @words.sort_by { |key, value| value }.last(2000)
  @data[category] = words_freq.map { |pair| pair[0] }
end
# Plan: build a word-frequency hash for each category, then check how many of
# the top words match. As with the minimum-distance predictor, the closest
# match (the category with the most similar terms) gets selected.

end

# Public: Predicts category.
#
# tokens - A list of tokens (words).
#
# Returns a category.

# Plan: create a hash mapping each subject to a points tally.
# Run the frequency test and assign a point to the best-matching subject.
# Then search the first 500 words for a word match to the subject, which is
# found inside the file name; on a match, assign another point.
# Additionally, the simple predictor could be run to try to add another point.
def predict(tokens)
@minimum_category = nil
@minimum_distance = 999999999
@token_words = Hash.new(0)
@points = Hash.new(0)
# get the top 100 long words from the unknown book and compare the difference
tokens.each do |token|
  @token_words[token] += 1 if token.length > 6
end
sorted_token_words = @token_words.sort_by { |key, value| value }.last(100)
only_words = sorted_token_words.map { |pair| pair[0] }
@data.each do |category, word_list|
  # fewer leftover words after subtraction means more overlap
  difference_count = (word_list - only_words).count
  if difference_count < @minimum_distance
    @minimum_category = category
    @minimum_distance = difference_count
  end
  # this seemed like a neat idea but does not impact anything:
  # @subjects.each do |subject|
  #   @points[category] += 1 if only_words.include?(subject)
  # end
end
# award a point to the closest category, then pick the highest scorer
@points[@minimum_category] += 1
cat = @points.sort_by { |k, v| v }[-1][0]
cat
end
end

# Approach: compare sorted_token_words against each @data[category] list to
# see how many words match. Store the missing-word count as the minimum
# difference and the subject as the minimum subject; for each subsequent
# category, swap in its values if its difference is smaller. Return the
# minimum subject.
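The frequency-overlap approach the diff above implements can be sketched standalone as follows. The category data, the word lists, and the `count_words`/`top_words`/`predict` helper names here are illustrative, not part of the PR:

```ruby
# Count only "long" words, mirroring the `word.length > 6` filter in the PR.
def count_words(tokens, min_length: 7)
  counts = Hash.new(0)
  tokens.each { |t| counts[t] += 1 if t.length >= min_length }
  counts
end

# Keep the n most frequent long words.
def top_words(tokens, n, min_length: 7)
  count_words(tokens, min_length: min_length)
    .sort_by { |_word, count| count }
    .last(n)
    .map { |word, _count| word }
end

# Toy training data: one flat token list per category.
training = {
  astronomy: %w[planets planets planets telescope telescope gravity],
  cooking:   %w[saucepan saucepan whisking whisking seasoning seasoning]
}

profiles = training.transform_values { |tokens| top_words(tokens, 3) }

# Pick the category whose profile shares the most words with the sample:
# fewer words left after Array#- means more overlap.
def predict(profiles, tokens)
  sample = top_words(tokens, 3)
  profiles.min_by { |_category, words| (words - sample).length }.first
end

puts predict(profiles, %w[telescope telescope gravity planets])  # => astronomy
```

The same subtraction trick drives the PR's `predict`: the category whose word list loses the fewest entries when the sample's words are subtracted is the closest match.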
4 changes: 1 addition & 3 deletions lib/simple_predictor.rb
@@ -25,14 +25,13 @@ def train!
# }
# }
@data = {}

@all_books.each do |category, books|
@data[category] = {
words: 0,
books: 0
}
books.each do |filename, tokens|
@data[category][:words] += tokens.length
@data[category][:books] += 1
end
end
@@ -53,7 +52,6 @@ def predict(tokens)

minimum_category = nil
minimum_distance = 999999999999

@data.each do |category, counts|
average_words_per_book = counts[:words].to_f / counts[:books]
difference = (tokens.count - average_words_per_book).abs
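SimplePredictor's distance idea can be sketched with made-up numbers (the averages and the `closest_category` helper are illustrative, not from the PR): each category stores an average words-per-book figure, and we predict the category whose average is closest to the new book's token count.

```ruby
# Hypothetical per-category averages, standing in for @data[category][:words] /
# @data[category][:books] from the PR.
averages = { astronomy: 5000.0, cooking: 1200.0 }

def closest_category(averages, token_count)
  # smallest absolute difference wins
  averages.min_by { |_category, avg| (token_count - avg).abs }.first
end

closest_category(averages, 4800)  # => :astronomy
```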
22 changes: 22 additions & 0 deletions lib/test.rb
@@ -0,0 +1,22 @@
# win_loss = %W{win loss loss win win win FART FART FART FART FART FART BUTTZ}
# p win_loss
# hashbrowns = Hash.new(0)
# win_loss.each do |item|
# hashbrowns[item] +=1
# end

# new = hashbrowns.sort_by{|k,v| v }

# p words

# a =[1,2,3,4,5,6,7,8]
# b =[5,6,7,8,9,10,11]

# x = a-b
# p x.count
#so when I subtract, the new array holds the unmatched items from the two arrays.
#the smaller the remainder, the more words in common, so if the count is smaller
#than the stored variable, stow it! OH GOD IT'S HAPPENING!!! RUBY IN MY BRAIN.

stuff = {nick: 3, robert: 1, mike: 7}
p stuff.sort_by{|k,v| v}[-1][0]
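The Array#- behavior the scratch notes above rely on can be verified directly. This check (my addition, not part of the PR) shows that subtraction keeps only the elements with no match in the other array, and that it is not symmetric:

```ruby
a = [1, 2, 3, 4, 5, 6, 7, 8]
b = [5, 6, 7, 8, 9, 10, 11]

p a - b          # => [1, 2, 3, 4]
p (a - b).count  # => 4
p (b - a).count  # => 3 (subtraction is not symmetric)
```

This is why a smaller `(word_list - only_words).count` in ComplexPredictor means the two word lists have more entries in common.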