From cbf579a829a09d97c2923a5ea16fa1c2b896cba3 Mon Sep 17 00:00:00 2001 From: Lili Serfaty Date: Thu, 4 Sep 2014 11:18:41 -0500 Subject: [PATCH] create system to sort through books and place books within category based on content. --- Gemfile | 3 ++ data/stopwords.txt | 2 +- gutenberg.rb | 3 +- lib/Gemfile | 3 ++ lib/complex_predictor.rb | 62 +++++++++++++++++++++++++++++++++------- 5 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 Gemfile create mode 100644 lib/Gemfile diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..e688475 --- /dev/null +++ b/Gemfile @@ -0,0 +1,3 @@ +source 'https://rubygems.org' + +gem 'pry-byebug' \ No newline at end of file diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..bcaccb4 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,project, diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..68fb1ef 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -16,13 +16,12 @@ def run!(predictor_klass, opts={}) predictor.train! puts "Training took #{Time.now - start_time} seconds." - puts "Predicting..." start_time = Time.now accuracy = predictor.predict_test_set(opts) puts "Predictions took #{Time.now - start_time} seconds." puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/Gemfile b/lib/Gemfile new file mode 100644 index 0000000..e688475 --- /dev/null +++ b/lib/Gemfile @@ -0,0 +1,3 @@ +source 'https://rubygems.org' + +gem 'pry-byebug' \ No newline at end of file diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..de205fa 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,22 +1,62 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor - # Public: Trains the predictor on books in our dataset. This method is called - # before the predict() method is called. - # - # Returns nothing. + def train! @data = {} + @top_token_words = {} + + @all_books.each do |category, books| + @data[category] = { + token: {} + } + books.each do |filename, tokens| + tokens.each do |token| + # i thought nested if statements were no-nos so patrick helped me with this part. + if good_token?(token) + if @data[category][:token][token] + @data[category][:token][token] += 1 + else + @data[category][:token][token] = 1 + end + end + end + end + end + top_tokens = nil + + @data.each do |category, token| + # binding.pry + top_tokens = @data[category][:token].sort_by { |token, count| count }.reverse[0..15] + @top_token_words[category] = [] + # yusef helped me through a tricky time with arrays and string and stuff in my hash value + top_tokens.each do |x| + if @top_token_words[category] + @top_token_words[category] << x[0] + end + end + end end - # Public: Predicts category. - # - # tokens - A list of tokens (words). - # - # Returns a category. def predict(tokens) - # Always predict astronomy, for now. - :astronomy + + predicted_category = nil + + max_match = 0 + @top_token_words.each do |category, top_words| + count_match = 0 + tokens.each do |token| + if top_words.include? token + count_match += 1 + end + end + if count_match > max_match + predicted_category = category + max_match = count_match + end + end + predicted_category.to_sym end end