diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..e4ab9a4 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet,about,house,ancient,works,under,three,after,thing,theory,times,against,death,spirit,christ,words,faith,father,ditto,temple, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..2df40e3 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -24,5 +24,6 @@ def run!(predictor_klass, opts={}) end run!(SimplePredictor) + run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..3796a3a 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,4 +1,5 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor # Public: Trains the predictor on books in our dataset. This method is called @@ -7,6 +8,18 @@ class ComplexPredictor < Predictor # Returns nothing. def train! @data = {} + @all_books.each do |category, books| + @data[category] = Hash.new(0) + tokenHash = Hash.new(0) + books.each do |filename, tokens| + tokens[2000..-tokens.count/2].each do |word| + if word.length > 4 && good_token?(word) + tokenHash[word]+= 1 + end + end + end + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-3..-1].map!{|x| x[0]} + end end # Public: Predicts category. @@ -15,8 +28,18 @@ def train! # # Returns a category. def predict(tokens) - # Always predict astronomy, for now. - :astronomy + newarray = tokens[0..18000] + @decider = Hash.new(0) + tokens[2500..10000].each do |word| + if word.length > 4 && good_token?(word) + @data.each do |key,val| + if val.values[0].include?(word) + @decider[key]+=1 + end + end + end + end + return @decider.max_by{|x,y| y}[0] end end diff --git a/lib/predictor.rb b/lib/predictor.rb index d544f81..034f6a1 100644 --- a/lib/predictor.rb +++ b/lib/predictor.rb @@ -42,7 +42,7 @@ def predict(tokens) # Returns true if you should use this token. In our project, "token" is # synonymous with "word". def good_token?(token) - !STOP_WORDS.include?(token) && token.size > 2 + token.size > 4 && !STOP_WORDS.include?(token) end ############################################################################# diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb index 6b93003..c9ee5cb 100644 --- a/lib/simple_predictor.rb +++ b/lib/simple_predictor.rb @@ -25,7 +25,6 @@ def train! # } # } @data = {} - @all_books.each do |category, books| @data[category] = { words: 0,