From cdea42fe17083c9bac1f734be198ff9b8fe2fa98 Mon Sep 17 00:00:00 2001 From: Penkar Date: Wed, 3 Sep 2014 14:23:05 -0500 Subject: [PATCH 1/7] Unfortunate late commit 92% 4 sec --- data/stopwords.txt | 2 +- gutenberg.rb | 3 ++- lib/complex_predictor.rb | 45 ++++++++++++++++++++++++++++++++++++++-- lib/simple_predictor.rb | 1 - 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index 7336c7c..16c1727 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 84d20f6..86a18a0 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,6 +23,7 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# run!(SimplePredictor) + run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b8921f3..555a217 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -1,12 +1,36 @@ require_relative 'predictor' +require 'pry-byebug' class ComplexPredictor < Predictor +# @tokenHash = Hash.new(0) + # Public: Trains the predictor on books in our dataset. This method is called # before the predict() method is called. # # Returns nothing. def train! @data = {} + + + # binding.pry + @all_books.each do |category, books| + @data[category] = { + keys:[] + } + + tokenHash = Hash.new(0) + books.each do |filename, tokens| + tokens.each do |word| + if good_token?(word) + tokenHash[word]+= 1 + end + end + end + a= tokenHash.sort_by{|x,y| y}[-10..-1] + a.map!{|x| x[0]} + @data[category][:keys] = a + end + end # Public: Predicts category. @@ -15,8 +39,25 @@ def train! # # Returns a category. def predict(tokens) - # Always predict astronomy, for now. - :astronomy + @decider = { + religion:0, + philosophy:0, + astronomy:0, + archeology:0 + } + tokens.each do |word| + @data.each do |key,val| + # val.values[0] + # p key + # p val + if val.values[0].include?(word) + @decider[key]+=1 + end + end + end + # puts @data + # puts @decider + return @decider.max_by{|x,y| y}[0] end end diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb index 6b93003..c9ee5cb 100644 --- a/lib/simple_predictor.rb +++ b/lib/simple_predictor.rb @@ -25,7 +25,6 @@ def train! # } # } @data = {} - @all_books.each do |category, books| @data[category] = { words: 0, From 68223a6392b82f932b81f09af193802473725d94 Mon Sep 17 00:00:00 2001 From: Penkar Date: Wed, 3 Sep 2014 15:27:49 -0500 Subject: [PATCH 2/7] 100% in 7.8 seconds. --- data/stopwords.txt | 2 +- lib/complex_predictor.rb | 24 ++++++------------------ 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index 16c1727..cce3d64 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation, \ No newline at end of file +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote, \ No newline at end of file diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 555a217..31cfd3c 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -21,14 +21,12 @@ def train! tokenHash = Hash.new(0) books.each do |filename, tokens| tokens.each do |word| - if good_token?(word) + if word.length > 6 && good_token?(word) tokenHash[word]+= 1 end end end - a= tokenHash.sort_by{|x,y| y}[-10..-1] - a.map!{|x| x[0]} - @data[category][:keys] = a + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-10..-1].map!{|x| x[0]} end end @@ -39,24 +37,14 @@ def train! # # Returns a category. def predict(tokens) - @decider = { - religion:0, - philosophy:0, - astronomy:0, - archeology:0 - } + @decider = Hash.new(0) tokens.each do |word| @data.each do |key,val| - # val.values[0] - # p key - # p val - if val.values[0].include?(word) - @decider[key]+=1 - end + if val.values[0].include?(word) + @decider[key]+=1 + end end end - # puts @data - # puts @decider return @decider.max_by{|x,y| y}[0] end end From d0b9da18ad4cfdf58372d430b5f14093e553bbd3 Mon Sep 17 00:00:00 2001 From: Penkar Date: Wed, 3 Sep 2014 17:40:19 -0500 Subject: [PATCH 3/7] Small changes. --- data/stopwords.txt | 2 +- lib/complex_predictor.rb | 22 +++++++--------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index cce3d64..bcee0d6 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote, \ No newline at end of file +because,however,neither,rather,therefore,through,another,gutenberg,project,should,without,character,further,together,through,therefore,certain,between,saying,themselves,without,another,brought,neither,whether,nothing,number,houses,species,inasmuch,different,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,wherefore,remember,individual,wilderness,contrary,greater,necessary,suppose,relation,distance,observations,footnote, \ No newline at end of file diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 31cfd3c..99c6674 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -2,22 +2,15 @@ require 'pry-byebug' class ComplexPredictor < Predictor -# @tokenHash = Hash.new(0) - # Public: Trains the predictor on books in our dataset. This method is called # before the predict() method is called. # # Returns nothing. def train! @data = {} - - # binding.pry @all_books.each do |category, books| - @data[category] = { - keys:[] - } - + @data[category] = Hash.new(0) tokenHash = Hash.new(0) books.each do |filename, tokens| tokens.each do |word| @@ -26,9 +19,8 @@ def train! end end end - @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-10..-1].map!{|x| x[0]} + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-8..-1].map!{|x| x[0]} end - end # Public: Predicts category. @@ -38,13 +30,13 @@ def train! # Returns a category. def predict(tokens) @decider = Hash.new(0) - tokens.each do |word| - @data.each do |key,val| - if val.values[0].include?(word) - @decider[key]+=1 - end + tokens.each do |word| + @data.each do |key,val| + if val.values[0].include?(word) + @decider[key]+=1 end end + end return @decider.max_by{|x,y| y}[0] end end From 4b7b0b8df7f0f0615622d8281689d00d363d8ba0 Mon Sep 17 00:00:00 2001 From: Penkar Date: Wed, 3 Sep 2014 22:22:21 -0500 Subject: [PATCH 4/7] Issue arrising with words Light and About that are killing the process. 96% in 5 sec. --- data/stopwords.txt | 2 +- gutenberg.rb | 2 +- lib/complex_predictor.rb | 6 +++--- lib/predictor.rb | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index bcee0d6..8c7163c 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -because,however,neither,rather,therefore,through,another,gutenberg,project,should,without,character,further,together,through,therefore,certain,between,saying,themselves,without,another,brought,neither,whether,nothing,number,houses,species,inasmuch,different,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,wherefore,remember,individual,wilderness,contrary,greater,necessary,suppose,relation,distance,observations,footnote, \ No newline at end of file +light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 86a18a0..2df40e3 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,7 +23,7 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -# run!(SimplePredictor) +run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 99c6674..b3ba538 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -8,18 +8,17 @@ class ComplexPredictor < Predictor # Returns nothing. def train! @data = {} - # binding.pry @all_books.each do |category, books| @data[category] = Hash.new(0) tokenHash = Hash.new(0) books.each do |filename, tokens| tokens.each do |word| - if word.length > 6 && good_token?(word) + if word.length > 4 && good_token?(word) tokenHash[word]+= 1 end end end - @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-8..-1].map!{|x| x[0]} + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-3..-1].map!{|x| x[0]} end end @@ -37,6 +36,7 @@ def predict(tokens) end end end + p @data return @decider.max_by{|x,y| y}[0] end end diff --git a/lib/predictor.rb b/lib/predictor.rb index d544f81..dfe4f42 100644 --- a/lib/predictor.rb +++ b/lib/predictor.rb @@ -42,7 +42,7 @@ def predict(tokens) # Returns true if you should use this token. In our project, "token" is # synonymous with "word". def good_token?(token) - !STOP_WORDS.include?(token) && token.size > 2 + !STOP_WORDS.include?(token) && token.size > 2 && token != 'about' end ############################################################################# From 369b76e41d1ef9d67c025045d0d0b69f7fa2074d Mon Sep 17 00:00:00 2001 From: Penkar Date: Thu, 4 Sep 2014 09:34:21 -0500 Subject: [PATCH 5/7] 100% 2.1 seconds. --- data/stopwords.txt | 2 +- gutenberg.rb | 2 +- lib/complex_predictor.rb | 14 +++++++++----- lib/predictor.rb | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index 8c7163c..f4a7dff 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet, \ No newline at end of file +light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet,about,house,ancient,works,under,three,after,thing,theory,times,against,death,spirit,christ, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 2df40e3..86a18a0 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,7 +23,7 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -run!(SimplePredictor) +# run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index b3ba538..d2a8654 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -18,7 +18,7 @@ def train! end end end - @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-3..-1].map!{|x| x[0]} + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-7..-1].map!{|x| x[0]} end end @@ -28,15 +28,19 @@ def train! # # Returns a category. def predict(tokens) + newarray = tokens[0..18000] @decider = Hash.new(0) - tokens.each do |word| - @data.each do |key,val| - if val.values[0].include?(word) - @decider[key]+=1 + newarray.each do |word| + if good_token?(word) + @data.each do |key,val| + if val.values[0].include?(word) + @decider[key]+=1 + end end end end p @data + # binding.pry unless @decider.max_by{|x,y| y} return @decider.max_by{|x,y| y}[0] end end diff --git a/lib/predictor.rb b/lib/predictor.rb index dfe4f42..d544f81 100644 --- a/lib/predictor.rb +++ b/lib/predictor.rb @@ -42,7 +42,7 @@ def predict(tokens) # Returns true if you should use this token. In our project, "token" is # synonymous with "word". def good_token?(token) - !STOP_WORDS.include?(token) && token.size > 2 && token != 'about' + !STOP_WORDS.include?(token) && token.size > 2 end ############################################################################# From db4e8f5b5e8f233356649e7ffd5387bd69ce8908 Mon Sep 17 00:00:00 2001 From: Penkar Date: Thu, 4 Sep 2014 09:44:24 -0500 Subject: [PATCH 6/7] uncommented simple predict --- gutenberg.rb | 2 +- lib/complex_predictor.rb | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/gutenberg.rb b/gutenberg.rb index 86a18a0..2df40e3 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,7 +23,7 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -# run!(SimplePredictor) +run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index d2a8654..c728f37 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -39,8 +39,6 @@ def predict(tokens) end end end - p @data - # binding.pry unless @decider.max_by{|x,y| y} return @decider.max_by{|x,y| y}[0] end end From b728d5b143770fe27ff097c2a5a1cb12fc12930f Mon Sep 17 00:00:00 2001 From: Penkar Date: Mon, 8 Sep 2014 14:13:39 -0500 Subject: [PATCH 7/7] 1.5 sucka --- data/stopwords.txt | 2 +- gutenberg.rb | 2 +- lib/complex_predictor.rb | 17 +++++++++-------- lib/predictor.rb | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/data/stopwords.txt b/data/stopwords.txt index bcee0d6..e4ab9a4 100644 --- a/data/stopwords.txt +++ b/data/stopwords.txt @@ -1 +1 @@ -because,however,neither,rather,therefore,through,another,gutenberg,project,should,without,character,further,together,through,therefore,certain,between,saying,themselves,without,another,brought,neither,whether,nothing,number,houses,species,inasmuch,different,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,wherefore,remember,individual,wilderness,contrary,greater,necessary,suppose,relation,distance,observations,footnote, \ No newline at end of file +light,because,been,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,in,into,just,least,like,likely,might,most,must,my,neither,often,on,only,or,other,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,being,through,here,true,see,time,those,place,much,without,body,whole,another,thus,set,given,both,above,well,part,between,end,order,each,form,gutenberg,project,should,cannot,things,without,character,little,before,called,further,together,through,therefore,certain,between,saying,themselves,things,without,answer,person,another,though,brought,neither,whether,nothing,people,number,within,houses,species,inasmuch,number,perhaps,others,indeed,having,second,course,during,different,nearly,twenty,already,possible,always,actually,others,become,really,whatsoever,question,toward,coming,purpose,present,personal,friends,friend,behind,places,picture,precious,example,action,destroy,filled,stated,surely,office,wherefore,namely,spoken,speaks,mearly,remember,individual,simply,wilderness,contrary,greater,change,necessary,broken,suppose,relation,men,man,many,feet,small,distance,observations,footnote,found,illustration,large,stone,years,world,nature,miles,itself,earth,sense,hello,planet,about,house,ancient,works,under,three,after,thing,theory,times,against,death,spirit,christ,words,faith,father,ditto,temple, \ No newline at end of file diff --git a/gutenberg.rb b/gutenberg.rb index 86a18a0..2df40e3 100644 --- a/gutenberg.rb +++ b/gutenberg.rb @@ -23,7 +23,7 @@ def run!(predictor_klass, opts={}) puts "Accuracy: #{accuracy}" end -# run!(SimplePredictor) +run!(SimplePredictor) run!(ComplexPredictor, debug: true) diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb index 99c6674..27aab4a 100644 --- a/lib/complex_predictor.rb +++ b/lib/complex_predictor.rb @@ -8,18 +8,17 @@ class ComplexPredictor < Predictor # Returns nothing. def train! @data = {} - # binding.pry @all_books.each do |category, books| @data[category] = Hash.new(0) tokenHash = Hash.new(0) books.each do |filename, tokens| - tokens.each do |word| - if word.length > 6 && good_token?(word) + tokens[2000..-tokens.count/2].each do |word| + if word.length > 4 && good_token?(word) tokenHash[word]+= 1 end end end - @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-8..-1].map!{|x| x[0]} + @data[category][:keys] = tokenHash.sort_by{|x,y| y}[-3..-1].map!{|x| x[0]} end end @@ -30,10 +29,12 @@ def train! # Returns a category. def predict(tokens) @decider = Hash.new(0) - tokens.each do |word| - @data.each do |key,val| - if val.values[0].include?(word) - @decider[key]+=1 + tokens[2500..10000].each do |word| + if word.length > 4 && good_token?(word) + @data.each do |key,val| + if val.values[0].include?(word) + @decider[key]+=1 + end end end end diff --git a/lib/predictor.rb b/lib/predictor.rb index d544f81..034f6a1 100644 --- a/lib/predictor.rb +++ b/lib/predictor.rb @@ -42,7 +42,7 @@ def predict(tokens) # Returns true if you should use this token. In our project, "token" is # synonymous with "word". def good_token?(token) - !STOP_WORDS.include?(token) && token.size > 2 + token.size > 4 && !STOP_WORDS.include?(token) end #############################################################################