Skip to content

Commit

Permalink
percentage progress output during parse. fixed sentence separator. added gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
codebox committed Apr 1, 2013
1 parent e2eda8f commit 39be764
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
*.db
*.txt
.DS_Store
9 changes: 6 additions & 3 deletions parse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import division
import sqlite3
import codecs
import sys
Expand All @@ -6,17 +7,19 @@ class Parser:
SENTENCE_START_SYMBOL = '^'
SENTENCE_END_SYMBOL = '$'

- def __init__(self, name, db):
+ def __init__(self, name, db, split_char = '.'):
self.name = name
self.db = db
self.split_char = split_char

def save_word_pair(self, word1, word2):
self.db.add_word(word1, word2)

def parse(self, file_name):
txt = codecs.open(file_name, 'r', 'utf-8').read()
- sentences = txt.split('\n')
+ sentences = txt.split(self.split_char)
i = 0
l = len(sentences)

for sentence in sentences:
words = sentence.split()
Expand All @@ -30,7 +33,7 @@ def parse(self, file_name):
self.db.commit()
i += 1
if i % 1000 == 0:
- print i
+ print '%d%% complete' % (100 * i / l,)
sys.stdout.flush()


0 comments on commit 39be764

Please sign in to comment.