Skip to content

Commit

Permalink
Files
Browse files Browse the repository at this point in the history
  • Loading branch information
reimers committed Jun 9, 2016
1 parent b9d07be commit 539f5a5
Show file tree
Hide file tree
Showing 5 changed files with 57,754 additions and 0 deletions.
103 changes: 103 additions & 0 deletions EvaluateTruecaser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from Truecaser import *
import cPickle
import nltk
import string


def evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Measures the token-level accuracy of the truecaser and prints it.

    Every test sentence is tokenized, lower-cased and passed to getTrueCase.
    The returned tokens are compared position by position with the original
    tokens. For each sentence with at least one mismatch the expected tokens,
    the predicted tokens and a separator line are printed. Finally the
    overall accuracy (correct tokens / all tokens) is printed.

    testSentences: Iterable of correctly cased sentences (strings).
    wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist:
        The trained model, handed to getTrueCase unchanged.

    Returns nothing; all results are written to stdout.
    """
    correctTokens = 0
    totalTokens = 0

    for sentence in testSentences:
        tokensCorrect = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokensCorrect]
        tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

        # zip() stops at the shorter list, so tokens the truecaser failed to
        # return simply count as wrong instead of raising an IndexError.
        matches = sum(1 for expected, predicted in zip(tokensCorrect, tokensTrueCase) if expected == predicted)

        totalTokens += len(tokensCorrect)
        correctTokens += matches

        if matches != len(tokensCorrect):
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2.
            print(tokensCorrect)
            print(tokensTrueCase)
            print("-------------------")

    if totalTokens == 0:
        # Nothing was evaluated; avoid dividing by zero below.
        print("Accuracy: n/a (no tokens to evaluate)")
        return

    print("Accuracy: %.2f%%" % (correctTokens / float(totalTokens)*100))


def defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Evaluates the given model on a fixed, built-in list of English sentences.

    The sentences are passed, together with the model, to evaluateTrueCaser,
    which prints the mismatches and the accuracy.

    wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist:
        The trained model, forwarded unchanged to evaluateTrueCaser.
    """
    evaluationSentences = [
        "Its website was launched on February 4, 2004 by Mark Zuckerberg with his Harvard College roommates and fellow students Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes.",
        "Facebook is a for-profit corporation and online social networking service based in Menlo Park, California, United States. ",
        "The founders had initially limited the website's membership to Harvard students, but later expanded it to colleges in the Boston area, the Ivy League, and Stanford University. ",
        "It gradually added support for students at various other universities and later to high school students. ",
        "Since 2006, anyone in general aged 13 and older has been allowed to become a registered user of the website, though variations exist in the minimum age requirement, depending on applicable local laws.",
        "Its name comes from the face book directories often given to American university students.",
        "Because of the large volume of data that users submit to the service, Facebook has come under scrutiny for their privacy policies. Facebook, Inc. held its initial public offering in February 2012 and began selling stock to the public three months later, reaching an original peak market capitalization of $104 billion.",
        "Zuckerberg wrote a program called Facemash on October 28, 2003 while attending Harvard University as a sophomore (second year student).",
        "Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services.",
        "Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, and the Apple Watch smartwatch.",
        "Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites.",
        "Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud.",
        "Microsoft Corporation (commonly referred to as Microsoft) is an American multinational technology company headquartered in Redmond, Washington, that develops, manufactures, licenses, supports and sells computer software, consumer electronics and personal computers and services.",
        "Its best known software products are the Microsoft Windows line of operating systems, Microsoft Office office suite, and Internet Explorer and Edge web browsers.",
        "Its flagship hardware products are the Xbox game consoles and the Microsoft Surface tablet lineup.",
        "It is the world's largest software maker by revenue, and one of the world's most valuable companies.",
        "Google is an American multinational technology company specializing in Internet-related services and products.",
        "These include online advertising technologies, search, cloud computing, and software.",
        "Most of its profits are derived from AdWords, an online advertising service that places advertising near the list of search results.",
        "Rapid growth since incorporation has triggered a chain of products, acquisitions and partnerships beyond Google's core search engine (Google Search).",
        "It offers online productivity software (Google Docs) including email (Gmail), a cloud storage service (Google Drive) and a social networking service (Google+).",
        "Desktop products include applications for web browsing (Google Chrome), organizing and editing photos (Google Photos), and instant messaging and video chat (Hangouts).",
        "The company leads the development of the Android mobile operating system and the browser-only Chrome OS for a class of netbooks known as Chromebooks and desktop PCs known as Chromeboxes.",
        "Google has moved increasingly into communications hardware, partnering with major electronics manufacturers[20] in the production of its \"high-quality low-cost\" Nexus devices.",
        "In 2012, a fiber-optic infrastructure was installed in Kansas City to facilitate a Google Fiber broadband service.",
        "WhatsApp Messenger is a proprietary cross-platform, encrypted, instant messaging client for smartphones.",
        "It uses the Internet to send text messages, documents, images, video, user location and audio messages to other users using standard cellular mobile numbers.",
        "As of February 2016, WhatsApp had a user base of one billion, making it the most popular messaging application.",
        "WhatsApp Inc., based in Mountain View, California, United States, was acquired by Facebook Inc. on February 19, 2014, for approximately US$19.3 billion",
        "Barack Hussein Obama II (born August 4, 1961) is an American politician serving as the 44th President of the United States.",
        "He is the first African American to hold the office, as well as the first president born outside of the continental United States.",
        "Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review.",
        "He was a community organizer in Chicago before earning his law degree.",
        "He worked as a civil rights attorney and taught constitutional law at University of Chicago Law School between 1992 and 2004.",
        "He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, and ran unsuccessfully in the Democratic primary for the United States House of Representatives in 2000 against incumbent Bobby Rush.",
        "In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November.",
        "He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination.",
        "He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009.",
        "Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate.",
        "Albert Einstein was a German-born theoretical physicist. He developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics).",
        "Einstein's work is also known for its influence on the philosophy of science.",
        "Einstein is best known in popular culture for his mass-energy equivalence formula E = mc2 (which has been dubbed \"the world's most famous equation\").",
        "He received the 1921 Nobel Prize in Physics for his \"services to theoretical physics\", in particular his discovery of the law of the photoelectric effect, a pivotal step in the evolution of quantum theory.",
        "Near the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field.",
        "This led to the development of his special theory of relativity.",
        "He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity.",
        "He continued to deal with problems of statistical mechanics and quantum theory, which led to his explanations of particle theory and the motion of molecules. He also investigated the thermal properties of light which laid the foundation of the photon theory of light.",
        "In 1917, Einstein applied the general theory of relativity to model the large-scale structure of the universe.",
        "Ulm is a city in the federal German state of Baden-Wuerttemberg, situated on the River Danube.",
        "The city, whose population is estimated at almost 120,000 (2015), forms an urban district of its own and is the administrative seat of the Alb-Donau district.",
        "Ulm, founded around 850, is rich in history and traditions as a former Free Imperial City.",
        "Today, it is an economic centre due to its varied industries, and it is the seat of the University of Ulm.",
        "Internationally, Ulm is primarily known for having the church with the tallest steeple in the world (161.53 m or 529.95 ft), the Gothic minster (Ulm Minster) and as the birthplace of Albert Einstein.",
    ]

    evaluateTrueCaser(
        evaluationSentences,
        wordCasingLookup,
        uniDist,
        backwardBiDist,
        forwardBiDist,
        trigramDist,
    )

if __name__ == "__main__":
    # Load a previously trained model and run the default evaluation on it.
    #
    # The objects must be read in exactly the order in which TrainTruecaser.py
    # dumps them: uniDist, backwardBiDist, forwardBiDist, trigramDist,
    # wordCasingLookup.
    #
    # NOTE: unpickling can execute arbitrary code. Only load a
    # distributions.obj file that you created yourself or otherwise trust.
    with open('distributions.obj', 'rb') as f:  # closed even if a load fails
        uniDist = cPickle.load(f)
        backwardBiDist = cPickle.load(f)
        forwardBiDist = cPickle.load(f)
        trigramDist = cPickle.load(f)
        wordCasingLookup = cPickle.load(f)

    defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
127 changes: 127 additions & 0 deletions TrainFunctions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import nltk

def getCasing(word):
    """
    Classifies the casing of a single token.

    Returns one of the labels:
        'numeric'      - the token consists of digits only
        'allLower'     - all cased characters are lower case
        'allUpper'     - all cased characters are upper case
        'initialUpper' - the first character is upper case (the remaining
                         characters are not inspected)
        'other'        - empty token or none of the above
    """
    if len(word) == 0:
        return 'other'

    # The checks are tried from top to bottom and the first hit wins, so the
    # order is significant (e.g. "A" is 'allUpper', not 'initialUpper').
    casingChecks = (
        (word.isdigit, 'numeric'),
        (word.islower, 'allLower'),
        (word.isupper, 'allUpper'),
        (word[0].isupper, 'initialUpper'),
    )

    for matches, label in casingChecks:
        if matches():
            return label

    return 'other'


def checkSentenceSanity(sentence):
    """
    Checks the sanity of the sentence. If the sentence is for example all
    uppercase, it is rejected.

    sentence: Sequence of tokens (strings).

    Returns True only if 'allLower' is the most frequent casing among the
    tokens (as classified by getCasing). A sentence without any tokens is
    rejected (False).
    """
    # Cheap exit for empty sequences. An iterable that turns out to yield
    # nothing is caught by the emptiness check further down.
    if not sentence:
        return False

    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    # most_common(1) is an empty list when nothing was counted; indexing it
    # unconditionally would raise an IndexError.
    mostCommon = caseDist.most_common(1)
    if not mostCommon:
        return False

    return mostCommon[0][0] == 'allLower'

def updateDistributionsFromSentences(text, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the NLTK Frequency Distributions based on a list of sentences.

    text: Array of sentences.
          Each sentence must be an array of Tokens.
          It is traversed twice, so it must be re-iterable.
    wordCasingLookup: dict, lower-cased word -> set of casings seen for it.
    uniDist: counts per token, key: word
    backwardBiDist: counts, key: prevWord + "_" + word
    forwardBiDist: counts, key: word + "_" + lower-cased next word
    trigramDist: counts, key: prevWord + "_" + word + "_" + lower-cased next word

    All five model objects are updated in place; nothing is returned.
    Sentences rejected by checkSentenceSanity are ignored.
    """
    # :: Create unigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        # Index 0 (the first token of the sentence) is not counted.
        for tokenIdx in range(1, len(sentence)):
            word = sentence[tokenIdx]
            uniDist[word] += 1
            wordCasingLookup.setdefault(word.lower(), set()).add(word)

    # :: Create backward + forward bigram lookup and trigram lookup ::
    # This needs a second pass: whether a word has several casings is only
    # known once wordCasingLookup has been filled from the whole text.
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        lastIdx = len(sentence) - 1

        for tokenIdx in range(2, len(sentence)): #Start at 2 to skip first word in sentence
            word = sentence[tokenIdx]

            # Only if there are multiple options for the casing of this word
            if len(wordCasingLookup.get(word.lower(), ())) < 2:
                continue

            prevWord = sentence[tokenIdx-1]
            backwardBiDist[prevWord+"_"+word] += 1

            if tokenIdx < lastIdx:
                nextWordLower = sentence[tokenIdx+1].lower()
                forwardBiDist[word+"_"+nextWordLower] += 1

                # The ambiguity test for the trigram has to look at the
                # current word. Previously it used a stale variable left over
                # from an earlier loop, so trigrams were kept or dropped
                # based on an unrelated word.
                trigramDist[prevWord+"_"+word+"_"+nextWordLower] += 1




def updateDistributionsFromNgrams(bigramFile, trigramFile, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the Frequency Distributions based on ngram files,
    e.g. the ngram files of http://www.ngrams.info/download_coca.asp

    bigramFile: Path to a file with one bigram per line: count<TAB>word1<TAB>word2
    trigramFile: Path to a file with one trigram per line: count<TAB>word1<TAB>word2<TAB>word3
    wordCasingLookup: dict, lower-cased word -> set of casings seen for it.
    uniDist, backwardBiDist, forwardBiDist, trigramDist: count mappings.

    All five model objects are updated in place; nothing is returned.
    Blank lines are skipped. A line with a wrong number of columns or a
    non-integer count raises a ValueError.
    """
    # Bigrams (they also feed the casing lookup and the unigram counts)
    with open(bigramFile) as bigramLines:
        for line in bigramLines:
            line = line.strip()
            if not line:
                continue

            cnt, word1, word2 = line.split('\t')
            cnt = int(cnt)

            # Unigram: both words of the bigram get the bigram count
            for word in (word1, word2):
                wordCasingLookup.setdefault(word.lower(), set()).add(word)
                uniDist[word] += cnt

            # Bigrams
            backwardBiDist[word1+"_"+word2] += cnt
            forwardBiDist[word1+"_"+word2.lower()] += cnt

    # Trigrams
    with open(trigramFile) as trigramLines:
        for line in trigramLines:
            line = line.strip()
            if not line:
                continue

            cnt, word1, word2, word3 = line.split('\t')
            cnt = int(cnt)

            trigramDist[word1+"_"+word2+"_"+word3.lower()] += cnt




80 changes: 80 additions & 0 deletions TrainTruecaser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
This script trains the TrueCase System
"""
import nltk
import nltk.corpus
from nltk.corpus import brown
from nltk.corpus import reuters
import cPickle
import string
import math
import MySQLdb
import MySQLdb.cursors
import nltk.data

from TrainFunctions import *
from EvaluateTruecaser import defaultTruecaserEvaluation


# Model state. The training options below update these objects in place.
uniDist = nltk.FreqDist()
backwardBiDist = nltk.FreqDist()
forwardBiDist = nltk.FreqDist()
trigramDist = nltk.FreqDist()
wordCasingLookup = {}


# There are three options to train the true caser:
# 1) Use the sentences in NLTK
# 2) Use the train.txt file. Each line must contain a single sentence. Use a large corpus, for example Wikipedia
# 3) Use Bigrams + Trigrams count from the website http://www.ngrams.info/download_coca.asp
# The more training data, the better the results


# :: Option 1: Train it based on NLTK corpus ::
print("Update from NLTK Corpus")
NLTKCorpus = brown.sents()+reuters.sents()+nltk.corpus.semcor.sents()+nltk.corpus.conll2000.sents()+nltk.corpus.state_union.sents()
updateDistributionsFromSentences(NLTKCorpus, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Option 2: Train it based on the train.txt file ::
# Uncomment the following lines if you want to train from train.txt
# print("Update from train.txt file")
# sentences = []
# for line in open('train.txt'):
#     sentences.append(line.strip())
# tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
# updateDistributionsFromSentences(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Option 3: Train it based on ngram tables from http://www.ngrams.info/download_coca.asp ::
# Uncomment the following lines if you want to train from the ngram tables
# print("Update Bigrams / Trigrams")
# updateDistributionsFromNgrams('ngrams/w2.txt', 'ngrams/w3.txt', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Store the model ::
# The objects are written one after the other into a single file. Whoever
# reads the file has to load them in this same order.
with open('distributions.obj', 'wb') as f:  # closed even if a dump fails
    cPickle.dump(uniDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(backwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(forwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(trigramDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(wordCasingLookup, f, protocol=cPickle.HIGHEST_PROTOCOL)


# :: Correct sentences ::

defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)







Loading

0 comments on commit 539f5a5

Please sign in to comment.