From 23a03ee15dbb96335f79292ea47a1a013e690bdc Mon Sep 17 00:00:00 2001
From: Travis Hooper <thooper41@gmail.com>
Date: Wed, 3 Sep 2014 21:53:54 -0500
Subject: [PATCH 1/2] By golly 53% accuracy in 3.4 sec isn't too bad

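Implement ComplexPredictor#train! and #predict: training keeps, for each
category, the most frequent token longer than five characters, and predict
picks the category whose token occurs most often in the book.

Also add a Gemfile (rspec, pry-byebug), a spec helper, and two scratch
copies of the predictor (complex_predictor2.rb, complex_predictor3.rb).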
---
 Gemfile                   |  5 ++++
 gutenberg.rb              |  3 +-
 lib/complex_predictor.rb  | 40 ++++++++++++++++++++++++--
 lib/complex_predictor2.rb | 60 +++++++++++++++++++++++++++++++++++++++
 lib/complex_predictor3.rb | 22 ++++++++++++++
 lib/predictor.rb          |  1 +
 lib/simple_predictor.rb   |  2 ++
 spec/gutenberg_spec.rb    |  4 +++
 8 files changed, 133 insertions(+), 4 deletions(-)
 create mode 100644 Gemfile
 create mode 100644 lib/complex_predictor2.rb
 create mode 100644 lib/complex_predictor3.rb
 create mode 100644 spec/gutenberg_spec.rb

diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/gutenberg.rb b/gutenberg.rb
index 84d20f6..de4730d 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -24,5 +24,4 @@ def run!(predictor_klass, opts={})
 end
 
 run!(SimplePredictor)
-run!(ComplexPredictor, debug: true)
-
+run!(ComplexPredictor, debug: true)
\ No newline at end of file
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index b8921f3..c10846c 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -7,6 +7,22 @@ class ComplexPredictor < Predictor
   # Returns nothing.
   def train!
     @data = {}
+
+    @all_books.each do |category, books|
+      @data[category] = Hash.new(0)
+      books.each do |filename, tokens|
+        tokens.each {|token| @data[category][token] += 1 if token.length > 5}
+      end
+      @data[category] = @data[category].max_by {|k,v| v}[0]
+    end
+    # raise "You must implement Predictor#train!."
+
+  # Public: Predicts category.
+  
+  # tokens - A list of tokens (words).
+  
+  # Returns a category.
+  @data
   end
 
   # Public: Predicts category.
@@ -15,8 +31,28 @@ def train!
   #
   # Returns a category.
   def predict(tokens)
-    # Always predict astronomy, for now.
-    :astronomy
+    # # Find the category that has the most similar word-count.
+    # #
+    # # Example: Say the average Archeology book has 50 words and the average
+    # # Philosophy book has 100 words. Then say we must predict some book with 120
+    # # words. In this case, we will predict Philosophy, since 120 is closer to 100
+    # # than it is to 50.
+    counter2 = 0
+    max_occurrences = 0
+    the_category = nil
+    counter = Hash.new
+
+      @data.each do |category, word|
+          counter[category] = 0
+          tokens.each {|token| counter[category] += 1 if token == word}
+          if max_occurrences < counter[category]
+            max_occurrences = counter[category]
+            the_category = category
+          end
+      end
+    counter2 += 1
+    the_category
+    # # raise "You must implement Predictor#predict."
   end
 end
 
diff --git a/lib/complex_predictor2.rb b/lib/complex_predictor2.rb
new file mode 100644
index 0000000..0598640
--- /dev/null
+++ b/lib/complex_predictor2.rb
@@ -0,0 +1,60 @@
+require_relative 'predictor'
+
+class ComplexPredictor < Predictor
+  # Public: Trains the predictor on the books in @all_books. For each category,
+  # tallies every token longer than 5 characters and keeps only the most frequent.
+  #
+  # Returns nothing.
+  def train!
+    @data = {}
+
+    @all_books.each do |category, books|
+      @data[category] = Hash.new(0)
+      books.each do |filename, tokens|
+        tokens.each {|token| @data[category][token] += 1 if token.length > 5}
+      end
+      @data[category] = @data[category].max_by {|k,v| v}[0]
+    end
+    # NOTE(review): a category with no token over 5 chars makes max_by nil, so [0] raises.
+
+  # At this point @data maps each category to a single signature token.
+  
+  # Tokens of 5 characters or fewer were never counted.
+  
+  # predict compares book tokens against these signature tokens.
+  end
+
+  # Public: Predicts category.
+  #
+  # tokens - A list of tokens (words). NOTE(review): never read; see below.
+  #
+  # Returns the category chosen below, or nil if no signature token ever matches.
+  def predict(tokens)
+    # For every training book in @all_books and every category's signature
+    # token in @data, counts how often that token occurs in the book, and
+    # remembers the category with the highest single-book count seen so far.
+    # NOTE(review): the inner block parameter `tokens` shadows the method
+    # argument, so the book passed to predict is ignored and the result depends
+    # only on the training data -- confirm whether this was intended.
+    max_occurrences = 0
+    the_category = nil
+    counter = Hash.new
+
+    @all_books.each do |cat, books|
+      @data.each do |category, word|
+        books.each do |filename, tokens|
+          counter[category] = 0
+          tokens.each {|token| counter[category] += 1 if token == word}
+          if max_occurrences < counter[category]
+            max_occurrences = counter[category]
+            the_category = category
+          end
+        end
+      end
+    end
+
+    the_category
+    # (the_category is still nil here if no signature token matched any book.)
+  end
+end
+
diff --git a/lib/complex_predictor3.rb b/lib/complex_predictor3.rb
new file mode 100644
index 0000000..b8921f3
--- /dev/null
+++ b/lib/complex_predictor3.rb
@@ -0,0 +1,22 @@
+require_relative 'predictor'
+
+class ComplexPredictor < Predictor
+  # Public: Trains the predictor on books in our dataset. This placeholder only
+  # resets @data to an empty Hash; nothing is learned from the books.
+  #
+  # Returns nothing.
+  def train!
+    @data = {}
+  end
+
+  # Public: Predicts category.
+  #
+  # tokens - A list of tokens (words). Unused by this placeholder.
+  #
+  # Returns the Symbol :astronomy for every input.
+  def predict(tokens)
+    # Placeholder: every book is classified as astronomy, for now.
+    :astronomy
+  end
+end
+
diff --git a/lib/predictor.rb b/lib/predictor.rb
index d544f81..b4304c4 100644
--- a/lib/predictor.rb
+++ b/lib/predictor.rb
@@ -124,3 +124,4 @@ def load_books(dataset, opts={})
   end
 end
 
+
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
index 6b93003..765541c 100644
--- a/lib/simple_predictor.rb
+++ b/lib/simple_predictor.rb
@@ -68,3 +68,5 @@ def predict(tokens)
   end
 end
 
+
+
diff --git a/spec/gutenberg_spec.rb b/spec/gutenberg_spec.rb
new file mode 100644
index 0000000..a950747
--- /dev/null
+++ b/spec/gutenberg_spec.rb
@@ -0,0 +1,4 @@
+# Require this file in your spec files to DRY up test setup. Note: loading gutenberg.rb runs its top-level run! calls.
+require 'rspec'
+require 'pry-byebug'
+require_relative '../gutenberg.rb'

From 45a9c016a42295f0f6153f35b0f97e8c064cce12 Mon Sep 17 00:00:00 2001
From: Travis Hooper <thooper41@gmail.com>
Date: Thu, 4 Sep 2014 00:53:34 -0500
Subject: [PATCH 2/2] working file, correctly determines all books in ~6 sec.

---
 lib/complex_predictor.rb | 59 ++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index c10846c..47be45c 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -6,18 +6,33 @@ class ComplexPredictor < Predictor
   #
   # Returns nothing.
   def train!
-    @data = {}
+
+    @length_data = {}
+
+    @all_books.each do |category, books|
+      @length_data[category] = {
+        words: 0,
+        books: 0
+      }
+      books.each do |filename, tokens|
+        @length_data[category][:words] += tokens.count
+        @length_data[category][:books] += 1
+      end
+    end
+
+    @data = {} # {:astronomy=>"footnote", :philosophy=>"things", :religion=>"christ", :archeology=>"gutenberg"}
+    @length = {}
+    bad_words = ['footnote', 'things', 'gutenberg', 'project', 'illustration']
+
 
     @all_books.each do |category, books|
       @data[category] = Hash.new(0)
       books.each do |filename, tokens|
-        tokens.each {|token| @data[category][token] += 1 if token.length > 5}
+        tokens.each {|token| @data[category][token] += 1 if token.length > 5 && bad_words.include?(token) != true }
       end
       @data[category] = @data[category].max_by {|k,v| v}[0]
     end
     # raise "You must implement Predictor#train!."
-
-  # Public: Predicts category.
   
   # tokens - A list of tokens (words).
   
@@ -37,22 +52,36 @@ def predict(tokens)
     # # Philosophy book has 100 words. Then say we must predict some book with 120
     # # words. In this case, we will predict Philosophy, since 120 is closer to 100
     # # than it is to 50.
-    counter2 = 0
     max_occurrences = 0
     the_category = nil
     counter = Hash.new
 
-      @data.each do |category, word|
-          counter[category] = 0
-          tokens.each {|token| counter[category] += 1 if token == word}
-          if max_occurrences < counter[category]
-            max_occurrences = counter[category]
-            the_category = category
-          end
+    minimum_category = nil
+    minimum_distance = 999999999999
+
+    @length_data.each do |category, counts|
+      average_words_per_book = counts[:words].to_f / counts[:books]
+      difference = (tokens.count - average_words_per_book).abs
+
+      if difference < minimum_distance
+        minimum_category = category
+        minimum_distance = difference
+      end
+    end
+
+
+
+    @data.each do |category, word|
+      counter[category] = 0
+      tokens.each {|token| counter[category] += 1 if word == token || category.to_s == token}
+      if minimum_category == category
+        counter[category] += 2
+      end
+      if max_occurrences.to_f < counter[category]
+        max_occurrences = counter[category]
+        the_category = category
       end
-    counter2 += 1
+    end
     the_category
-    # # raise "You must implement Predictor#predict."
   end
 end
-