From 652a28c4c3e42a33fbe9e97e30dfb8ed4d26fab5 Mon Sep 17 00:00:00 2001
From: Nick Damiano <damiano.nick@gmail.com>
Date: Wed, 3 Sep 2014 23:58:57 -0500
Subject: [PATCH 1/4] added code to complex predictor to sort each book by top
 100 frequently used words and compare that to the unknown books to determine
 the category. Right now I have two arrays of words and I'm going to subtract
 them and then see which count is the smallest. The smallest count is the one
 with the most matching words and the right category.

---
 lib/complex_predictor.rb | 49 +++++++++++++++++++++++++++++++++++++++-
 lib/simple_predictor.rb  |  4 +---
 lib/test.rb              | 19 ++++++++++++++++
 3 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 lib/test.rb

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index b8921f3..56dd7fe 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -1,12 +1,34 @@
 require_relative 'predictor'
+require 'pry-byebug'
 
 class ComplexPredictor < Predictor
   # Public: Trains the predictor on books in our dataset. This method is called
   # before the predict() method is called.
   #
   # Returns nothing.
+  #This algorithm gets the top 100 most frequent words above 7 letters that occur across all books
+  #for the given subject. Next, it compares the words
   def train!
-    @data = {}
+    @data= {}
+    @words = Hash.new(0)
+    @all_books.each do |category, books|
+      books.each do |filename, words|
+        words.each do |word|
+          if word.length > 6
+            @words[word] += 1
+          end
+        end
+        words_freq = @words.sort_by{|key, value| value}[-100..-1]
+        words_only = words_freq.map{|pair| pair[0]}
+        @data[category] = words_only
+      end
+    end
+    binding.pry 
+    puts "stop"
+#create a hash with the word frequency for each one
+#check to see how many top 20 words match. just like the min one, the closest match or the one with the most
+#similar terms gets selected
+
   end
 
   # Public: Predicts category.
@@ -15,6 +37,31 @@ def train!
   #
   # Returns a category.
   def predict(tokens)
+    minimum_category = nil
+    minimum_difference = 999999999
+    @token_words = Hash.new(0)
+    tokens.each do |token|
+      if token.length > 6
+        @token_words[token] +=1
+      end
+    end
+    sorted_token_words = @token_words.sort_by{|key, value| value}[-100..-1]
+    only_words = sorted_token_words.map{|pair| pair[0]}
+    @difference = nil
+    @data.each do |category, word_list| 
+      
+    end
+        minimum_category = category
+        minimum_distance = difference
+      end
+
+    binding.pry
+    #compare sorted_token_words to each @data[:category] hash to see how many words match
+    #calculate matching words or missing words number and store that as minimum difference
+    #store the subject as minimum subject
+    #do the next category and if the minimum difference is larger swap out values
+    #return the minimum subject
+    #
     # Always predict astronomy, for now.
     :astronomy
   end
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
index 6b93003..fab8c0f 100644
--- a/lib/simple_predictor.rb
+++ b/lib/simple_predictor.rb
@@ -25,14 +25,13 @@ def train!
     #   }
     # }
     @data = {}
-
     @all_books.each do |category, books|
       @data[category] = {
         words: 0,
         books: 0
       }
       books.each do |filename, tokens|
-        @data[category][:words] += tokens.count
+        @data[category][:words] += tokens.length
         @data[category][:books] += 1
       end
     end
@@ -53,7 +52,6 @@ def predict(tokens)
 
     minimum_category = nil
     minimum_distance = 999999999999
-
     @data.each do |category, counts|
       average_words_per_book = counts[:words].to_f / counts[:books]
       difference = (tokens.count - average_words_per_book).abs
diff --git a/lib/test.rb b/lib/test.rb
new file mode 100644
index 0000000..ce8fe1f
--- /dev/null
+++ b/lib/test.rb
@@ -0,0 +1,19 @@
+# win_loss = %W{win loss loss win win win FART FART FART FART FART FART BUTTZ}
+# p win_loss
+# hashbrowns = Hash.new(0)
+# win_loss.each do |item|
+#   hashbrowns[item] +=1
+# end
+
+# new = hashbrowns.sort_by{|k,v| v }
+
+# p words
+
+a =[1,2,3,4,5,6,7,8]
+b =[5,6,7,8,9,10,11]
+
+x = a-b
+p x
+#so when I subtract, the new array is unmatched items from the arrays. 
+#so the new array.length - smaller number the better. if it's smaller than the variable stow it because it
+#means that there are more words in common! OH GOD IT'S HAPPENING!!! RUBY IN MY BRAIN. 

From 02ab14a6876cfecfb6009e56fb9f29d3be7eb223 Mon Sep 17 00:00:00 2001
From: Nick Damiano <damiano.nick@gmail.com>
Date: Thu, 4 Sep 2014 00:32:45 -0500
Subject: [PATCH 2/4] finished my code to test comparisons between top 100
 occurring words over 6 characters with a 73% accuracy, prediction time of
 5.08 seconds, and a training time of 4.75 seconds

---
 lib/complex_predictor.rb | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 56dd7fe..7c1cf83 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -23,8 +23,6 @@ def train!
         @data[category] = words_only
       end
     end
-    binding.pry 
-    puts "stop"
 #create a hash with the word frequency for each one
 #check to see how many top 20 words match. just like the min one, the closest match or the one with the most
 #similar terms gets selected
@@ -37,8 +35,8 @@ def train!
   #
   # Returns a category.
   def predict(tokens)
-    minimum_category = nil
-    minimum_difference = 999999999
+    @minimum_category = nil
+    @minimum_distance = 999999999
     @token_words = Hash.new(0)
     tokens.each do |token|
       if token.length > 6
@@ -49,13 +47,18 @@ def predict(tokens)
     only_words = sorted_token_words.map{|pair| pair[0]}
     @difference = nil
     @data.each do |category, word_list| 
-      
-    end
-        minimum_category = category
-        minimum_distance = difference
+      @difference = @data[category] - only_words
+      difference_count = @difference.count 
+      if difference_count < @minimum_distance
+        @minimum_category = category
+        @minimum_distance = difference_count
+        #binding.pry
       end
+    end
+    @minimum_category
+  end
+end
 
-    binding.pry
     #compare sorted_token_words to each @data[:category] hash to see how many words match
     #calculate matching words or missing words number and store that as minimum difference
     #store the subject as minimum subject
@@ -63,7 +66,3 @@ def predict(tokens)
     #return the minimum subject
     #
     # Always predict astronomy, for now.
-    :astronomy
-  end
-end
-

From d2703825a10133296c66226d035a3ea026fa3ce3 Mon Sep 17 00:00:00 2001
From: Nick Damiano <damiano.nick@gmail.com>
Date: Thu, 4 Sep 2014 10:22:56 -0500
Subject: [PATCH 3/4] added code to check for subject name in first 5000 words
 and still at 73%

---
 gutenberg.rb             |  2 +-
 lib/complex_predictor.rb | 25 ++++++++++++++++++++++---
 lib/test.rb              | 11 +++++++----
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/gutenberg.rb b/gutenberg.rb
index 84d20f6..e955c85 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -23,6 +23,6 @@ def run!(predictor_klass, opts={})
   puts "Accuracy: #{accuracy}"
 end
 
-run!(SimplePredictor)
+#run!(SimplePredictor)
 run!(ComplexPredictor, debug: true)
 
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 7c1cf83..2d41f0a 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -9,6 +9,8 @@ class ComplexPredictor < Predictor
   #This algorithm gets the top 100 most frequent words above 7 letters that occur across all books
   #for the given subject. Next, it compares the words
   def train!
+    @subjects = []
+    @points = Hash.new(0)
     @data= {}
     @words = Hash.new(0)
     @all_books.each do |category, books|
@@ -16,8 +18,12 @@ def train!
         words.each do |word|
           if word.length > 6
             @words[word] += 1
+            #binding.pry
           end
+          
         end
+        @subjects.push([filename.split("/")[2]])
+        @points[category]=0
         words_freq = @words.sort_by{|key, value| value}[-100..-1]
         words_only = words_freq.map{|pair| pair[0]}
         @data[category] = words_only
@@ -34,10 +40,18 @@ def train!
   # tokens - A list of tokens (words).
   #
   # Returns a category.
+
+  #create a new hash with the subject and a corresponding points value field
+  #run the first test and assign a point to the hash for the proper one
+  #search the first 5000 words for a word match to the subject, which is found inside the file name
+  #if there is a match, assign a point to the new hash
+  #additionally you can run the simple version to try to add another point
   def predict(tokens)
     @minimum_category = nil
     @minimum_distance = 999999999
     @token_words = Hash.new(0)
+    @points = Hash.new(0)
+    #gets the top 100 words and compares the difference
     tokens.each do |token|
       if token.length > 6
         @token_words[token] +=1
@@ -50,12 +64,17 @@ def predict(tokens)
       @difference = @data[category] - only_words
       difference_count = @difference.count 
       if difference_count < @minimum_distance
-        @minimum_category = category
+        @points[category] += 1
         @minimum_distance = difference_count
-        #binding.pry
       end
+      @subjects.each{|subject| if tokens[0..5000].include?(subject) then @points[category]+=1 end}
+
     end
-    @minimum_category
+    # binding.pry
+    p "stop"
+    #iterate through points to get the highest value and pass the key **********
+    cat = @points.sort_by{|k,v| v}[-1][0]
+    cat
   end
 end
 
diff --git a/lib/test.rb b/lib/test.rb
index ce8fe1f..82d51a5 100644
--- a/lib/test.rb
+++ b/lib/test.rb
@@ -9,11 +9,14 @@
 
 # p words
 
-a =[1,2,3,4,5,6,7,8]
-b =[5,6,7,8,9,10,11]
+# a =[1,2,3,4,5,6,7,8]
+# b =[5,6,7,8,9,10,11]
 
-x = a-b
-p x
+# x = a-b
+# p x.count
 #so when I subtract, the new array is unmatched items from the arrays. 
 #so the new array.length - smaller number the better. if it's smaller than the variable stow it because it
 #means that there are more words in common! OH GOD IT'S HAPPENING!!! RUBY IN MY BRAIN. 
+
+stuff = {nick: 3, robert: 1, mike: 7}
+p stuff.sort_by{|k,v| v}[-1][0]

From d0b3d97f25d8749646df25ec2e24be29121ce157 Mon Sep 17 00:00:00 2001
From: Nick Damiano <damiano.nick@gmail.com>
Date: Thu, 4 Sep 2014 11:03:32 -0500
Subject: [PATCH 4/4] removed the section that searches through the words for
 the subject word. Changed the comparison arrays so that the known-subject
 array holds the top 2000 words instead of 100. Accuracy is now 96%.

---
 lib/complex_predictor.rb | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index 2d41f0a..8738537 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -24,7 +24,7 @@ def train!
         end
         @subjects.push([filename.split("/")[2]])
         @points[category]=0
-        words_freq = @words.sort_by{|key, value| value}[-100..-1]
+        words_freq = @words.sort_by{|key, value| value}[-2000..-1]
         words_only = words_freq.map{|pair| pair[0]}
         @data[category] = words_only
       end
@@ -67,11 +67,15 @@ def predict(tokens)
         @points[category] += 1
         @minimum_distance = difference_count
       end
-      @subjects.each{|subject| if tokens[0..5000].include?(subject) then @points[category]+=1 end}
+      #binding.pry 
+      #this seemed like a neat idea but does not impact anything
+      # @subjects.each do |subject| 
+      #    if only_words.include?(subject) 
+      #      @points[category]+=1 end
 
+      #    end
     end
     # binding.pry
-    p "stop"
     #iterate through points to get the highest value and pass the key **********
     cat = @points.sort_by{|k,v| v}[-1][0]
     cat