Skip to content

Commit

Permalink
First .py commit
Browse files Browse the repository at this point in the history
This is the first commit where I have brought my actual Python scripts
into the repository.
  • Loading branch information
sonofmun committed Nov 28, 2013
1 parent cb90688 commit e0f612d
Show file tree
Hide file tree
Showing 33 changed files with 15,036 additions and 0 deletions.
23 changes: 23 additions & 0 deletions BWConversionCleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'''
Created on 22.08.2012
@author: matthew.munson
'''
import re
# Put the file to be used as the source here.  Raw strings keep the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
SOURCE_PATH = r'C:\CompLing\GBible\BWConversion_Dirty.txt'
# Put the path to the destination file here.
DESTINATION_PATH = r'C:\CompLing\GBible\BWConversion_Upper.txt'

retab = re.compile(r'\t')
newlinedel = re.compile(r'\n')


def clean_line(line):
    """Return one source line rewritten as an upper-cased quoted fragment.

    Every single quote is backslash-escaped, every tab becomes ``': '``,
    every newline becomes ``', '`` and the result is upper-cased.
    """
    line = line.replace('\'', '\\\'')
    line = retab.sub('\': \'', line)
    line = newlinedel.sub('\', \'', line)
    return line.upper()


def main():
    """Rewrite every line of SOURCE_PATH into DESTINATION_PATH."""
    # ``with`` closes both files even if reading or writing fails part-way.
    with open(SOURCE_PATH, 'r', encoding='utf-8') as source, \
            open(DESTINATION_PATH, 'w', encoding='utf-8') as destination:
        for line in source:
            destination.write(clean_line(line))


if __name__ == '__main__':
    main()
42 changes: 42 additions & 0 deletions BWGtoUnicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
'''
Created on 22.08.2012
@author: matthew.munson
'''
def BWGtoUnicode(sourcepath, destpath):
    """Convert a BibleWorks Greek beta-code text file to Unicode Greek.

    Each line of ``sourcepath`` is split on runs of spaces.  Words that
    contain ``@`` (part-of-speech analysis) and words equal to the three
    character book abbreviation at the start of the line are copied
    unchanged; every other word has the beta-code sequences in ``mapping``
    replaced by their Unicode characters.  The words are re-joined with
    single spaces and written to ``destpath`` as UTF-8.

    Lines that do not start with a three character book abbreviation
    (for example blank lines) have all of their non-POS words converted.

    sourcepath -- path of the beta-code file; it is read with the platform
                  default encoding.
                  NOTE(review): the mapping contains keys such as 'Å' and
                  'È', which suggests a single-byte Windows encoding --
                  confirm before forcing an explicit encoding here.
    destpath   -- path of the UTF-8 file to create (overwritten if present).
    """
    import re
    #The following line assigns the appropriate unicode characters to the existing BibleWorks beta code characters.
    #It will need to be changed if the source is something other than BibleWorks Greek.
    #NOTE(review): several keys occur more than once in this literal ('P', 'a', 'b', 'R', 'H', 'i\'', 'u\'');
    #Python keeps the LAST value for a repeated key, so 'i\'' maps to 'ῗ' and 'u\'' to 'ῧ'. Verify against the BibleWorks key table.
    mapping = dict({'P': 'Π', 'a': 'α', 'b': 'β', 'R': 'Ρ', 'a`': 'ἁ', 'a[': 'ἅ', 'a[|': 'ᾅ', 'a|[': 'ᾅ', 'a]': 'ἃ', 'a]|': 'ᾃ', 'a-': 'ἇ', 'a-|': 'ᾇ', 'a`|': 'ᾁ', 'av': 'ἀ', 'a;': 'ἄ', 'a;|': 'ᾄ', 'a\'': 'ἂ', 'a\'|': 'ᾂ', 'a=': 'ἆ', 'a=|': 'ᾆ', 'av|': 'ᾀ', 'a,': 'ά', 'a,|': 'ᾴ', 'a/': 'ᾶ', 'a/|': 'ᾷ', 'a|': 'ᾳ', 'a.': '\u1F70', 'e`': 'ἑ', 'e[': 'ἕ', 'e]': 'ἓ', 'ev': 'ἐ', 'e;': 'ἔ', 'e\'': 'ἒ', 'e,': 'έ', 'e.': '\u1F72', 'h`': 'ἡ', 'h[': 'ἥ', 'h[|': 'ᾕ', 'h]': 'ἣ', 'h]|': 'ᾓ', 'h-': 'ἧ', 'h-|': 'ᾗ', 'h`|': 'ᾑ', 'hv': 'ἠ', 'h;': 'ἤ', 'h;|': 'ᾔ', 'h\'': 'ἢ', 'h\'|': 'ᾒ', 'h=': 'ἦ', 'h=|': 'ᾖ', 'hv|': 'ᾐ', 'h,': 'ή', 'h,|': 'ῄ', 'h/': 'ῆ', 'h/|': 'ῇ', 'h|': 'ῃ', 'h.': '\u1F74', 'i`': 'ἱ', 'i[': 'ἵ', 'i]': 'ἳ', 'i-': 'ἷ', 'iv': 'ἰ', 'i;': 'ἴ', 'i\'': 'ἲ', 'i=': 'ἶ', 'i,': 'ί', 'i<': 'ΐ', 'i>': 'ῒ', 'i?': 'ϊ', 'i/': 'ῖ', 'i\'': 'ῗ', 'i.': '\u1F76', 'o`': 'ὁ', 'o[': 'ὅ', 'o]': 'ὃ', 'ov': 'ὀ', 'o;': 'ὄ', 'o\'': 'ὂ', 'o,': 'ό', 'o.': '\u1F78', 'r`': 'ῥ', 'rv': 'ῤ', 'u`': 'ὑ', 'u[': 'ὕ', 'u]': 'ὓ', 'u-': 'ὗ', 'uv': 'ὐ', 'u;': 'ὔ', 'u\'': 'ὒ', 'u=': 'ὖ', 'u,': 'ύ', 'u<': 'ΰ', 'u>': 'ῢ', 'u?': 'ϋ', 'u/': 'ῦ', 'u\'': 'ῧ', 'u.': '\u1F7A', 'w`': 'ὡ', 'w[': 'ὥ', 'w[|': 'ᾥ', 'w]': 'ὣ', 'w]|': 'ᾣ', 'w-': 'ὧ', 'w-|': 'ᾧ', 'w`|': 'ᾡ', 'wv': 'ὠ', 'w;': 'ὤ', 'w;|': 'ᾤ', 'w\'': 'ὢ', 'w\'|': 'ᾢ', 'w=': 'ὦ', 'w=|': 'ᾦ', 'wv|': 'ᾠ', 'w,': 'ώ', 'w/': 'ῶ', 'w/|': 'ῷ', 'w|': 'ῳ', 'w,|': 'ῴ', 'w.': '\u1F7C', 'A': 'Α', 'a': 'α', 'B': 'Β', 'b': 'β', 'G': 'Γ', 'g': 'γ', 'D': 'Δ', 'd': 'δ', 'E': 'Ε', 'e': 'ε', 'Z': 'Ζ', 'z': 'ζ', 'H': 'Η', 'h': 'η', 'Q': 'Θ', 'q': 'θ', 'I': 'Ι', 'i': 'ι', 'K': 'Κ', 'k': 'κ', 'L': 'Λ', 'l': 'λ', 'M': 'Μ', 'm': 'μ', 'N': 'Ν', 'n': 'ν', 'X': 'Ξ', 'x': 'ξ', 'O': 'Ο', 'o': 'ο', 'P': 'Π', 'p': 'π', 'R': 'Ρ', 'r': 'ρ', 'S': 'Σ', 's': 'σ', 'j': 'ς', 'T': 'Τ', 't': 'τ', 'U': 'Υ', 'u': 'υ', 'F': 'Φ', 'f': 'φ', 'C': 'Χ', 'c': 'χ', 'Y': 'Ψ', 'y': 'ψ', 'W': 'Ω', 'w': 'ω', '~A': 'Ἁ', '{A': 'Ἅ', '{A|': 'ᾍ', '}A': 'Ἃ', '}A|': 'ᾋ', '_A': 'Ἇ', '_A|': 'ᾏ', '~A|': 'ᾉ', 'VA': 'Ἀ', ':A': 
        'Ἄ', ':A|': 'ᾌ', '"A': 'Ἂ', '"A|': 'ᾊ', '+A': 'Ἆ', '+A|': 'ᾎ', 'VA|': 'ᾈ', '<A': 'Ά', '~E': 'Ἑ', '{E': 'Ἕ', '}E': 'Ἓ', 'VE': 'Ἐ', ':E': 'Ἔ', '"E': 'Ἒ', '<E': 'Έ', '~H': 'Ἡ', '{H': 'Ἥ', '{H|': 'ᾝ', '}H': 'Ἣ', '}H|': 'ᾛ', '_H': 'Ἧ', '_H|': 'ᾟ', '~H|': 'ᾙ', 'VH': 'Ἠ', ':H': 'Ἤ', ':H|': 'ᾜ', 'H': 'Η', '"H|': 'ᾚ', '+H': 'Ἦ', '+H|': 'ᾞ', 'VH|': 'ᾘ', '<H': 'Ή', 'H|': 'ῌ', '~I': 'Ἱ', '{I': 'Ἵ', '}I': 'Ἳ', '_I': 'Ἷ', 'VI': 'Ἰ', ':I': 'Ἴ', '"I': 'Ἲ', '+I': 'Ἶ', 'I?': 'Ϊ', '~O': 'Ὁ', '{O': 'Ὅ', '}O': 'Ὃ', 'VO': 'Ὀ', ':O': 'Ὄ', '"O': 'Ὂ', '~R': '\u1FEC', '~U': 'Ὑ', '{U': 'Ὕ', '}U': 'Ὓ', '_U': 'Ὗ', 'VU': 'Ύ', '?U': 'Ϋ', '~W': 'Ὡ', '{W': 'Ὥ', '{W|': 'ᾭ', '}W': 'Ὣ', '}W|': 'ᾫ', '_W': 'Ὧ', '_W|': 'ᾯ', '~W|': 'ᾩ', 'VW': 'Ὠ', ':W': 'Ὤ', ':W|': 'ᾬ', '"W': 'Ὢ', '"W|': 'ᾪ', '+W': 'Ὦ', '+W|': 'ᾮ', 'VW|': 'ᾨ', 'W|': 'ῼ', 'V': '\u1FBD', 'Å': '.', '\\': '\u0387', '(': ',', ')': '.', 'È': ';', '&': '-', '^': '*', '%': ')', '$': '(', '#': ']', '@': '[', '!': '+', 'Î': '[', 'Ð': ']', '¹': '\"'})
    #Longest keys first, so that multi-character codes such as 'a[|' are replaced before the single letters they contain.
    #The sort is stable: keys of equal length keep the order in which they appear in the mapping.
    keys = sorted(mapping, key = len, reverse = True)
    #The following expression extracts the book name.
    #It will only work for something set up like BibleWorks output that has a 3 letter book name, followed by a space, then the chapter number, a colon, then the verse number.
    #If the book, chapter, and verse information are in a different format, the regular expression will need to be changed.
    bookname_pattern = re.compile(r'^[A-Za-z0-9]{3}')
    space_run = re.compile(r' +')
    at_sign = '@'
    #``with`` closes both files even when the conversion fails part-way through.
    with open(sourcepath, 'r') as book, open(destpath, 'w', encoding='utf-8') as unibook:
        for line in book:
            found = bookname_pattern.search(line)
            #None when the line carries no book abbreviation; no word can equal None, so nothing is skipped as a book name.
            bookname = found.group(0) if found else None
            newline = ''
            for word in space_run.split(line):
                newword = word
                #Words containing '@' are POS analysis information and the book name is a label; neither is converted to Greek characters.
                if at_sign not in word and word != bookname:
                    for key in keys:
                        newword = newword.replace(key, mapping[key])
                if newline == '':
                    newline = newword
                else:
                    newline = newline + ' ' + newword
            unibook.write(newline)
23 changes: 23 additions & 0 deletions BWtoXML.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
'''
Created on 19.12.2012
This script takes a .txt file input in BibleWorks Greek Beta Code with lemma information and companion text
and converts it to XML.
It requires that the BWGtoUnicode.py and the UnicodeToXMLLemmas.py are in the same directory from which
this file is executed.
@author: matthew.munson
'''
import re
import BWGtoUnicode
import UnicodeToXMLLemmas
def derive_paths(sourcepath, destpath):
    """Return ``(unicodedest, xmldest)`` for a beta-code source file.

    sourcepath -- full path of the source file; a trailing ``.txt`` (any
                  letter case) is dropped before the new names are built.
    destpath   -- directory in which the XML output file is to be created.

    ``unicodedest`` sits next to the source file and ends in
    ``Unicode.txt``; ``xmldest`` is ``destpath`` + ``/`` + the source file
    name without extension + ``XML.txt``.
    """
    # Escaped dot and end anchor: only a real ".txt" suffix is removed.
    rawpath = re.sub(r"\.txt$", "", sourcepath, flags=re.IGNORECASE)
    # Everything after the last path separator (either slash style) is the
    # file name, whatever characters it contains.
    rawfilename = re.split(r"[\\/]", rawpath)[-1]
    unicodedest = rawpath + "Unicode.txt"
    xmldest = destpath + "/" + rawfilename + "XML" + ".txt"
    return unicodedest, xmldest


def main():
    """Ask for the source file and destination directory, then convert."""
    print("Please use forward slashes / instead of backslashes \\ in your paths.")
    sourcepath = input("What is the full path of the BW Beta Code source file (should be a .txt file)?")
    destpath = input("What is the full path of the directory where you wish your XML Unicode destination file to be created?")
    unicodedest, xmldest = derive_paths(sourcepath, destpath)
    BWGtoUnicode.BWGtoUnicode(sourcepath, unicodedest)
    UnicodeToXMLLemmas.UnicodeToXML(unicodedest, xmldest)


if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions BookChapterVerseDelete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
'''
Created on 22.08.2012
@author: matthew.munson
'''
import re
import os
# Put the path to the directory holding the files here.  A raw string keeps
# the Windows backslashes literal.
CORPUS_DIR = r'C:\CompLing\Jerome\Experiments\Zeta\EpistleVSHistorical\Corpus'

# Matches a three character book abbreviation, a space and a chapter:verse
# reference at the start of any line (re.M lets ^ match after every newline).
BOOK_CHAPTER_VERSE = re.compile(r'^([A-Za-z0-9]{3}) ([0-9]{1,4}:[0-9]{1,4})', re.M)


def strip_book_chapter_verse(text):
    """Return *text* with the leading book/chapter:verse reference of every line removed.

    Only the reference itself is removed; the rest of the line, including
    the space that followed the reference, is kept.
    """
    return BOOK_CHAPTER_VERSE.sub('', text)


def new_file_name(name):
    """Return the output name for *name*: a ``.txt``/``.TXT`` suffix is replaced by ``_new.txt``."""
    # Slice the suffix off; str.rstrip would strip a *set of characters* and
    # eat trailing letters of the base name (e.g. the "tt" of "Matt").
    if name.endswith(('.txt', '.TXT')):
        name = name[:-4]
    return name + '_new.txt'


def main():
    """Write a reference-free ``*_new.txt`` copy of every text file in CORPUS_DIR."""
    os.chdir(CORPUS_DIR)
    for files in os.listdir("."):
        # Only text files are processed; nothing is created for other entries.
        if not files.endswith(('.txt', '.TXT')):
            continue
        # The source is read with the platform default encoding; add
        # encoding='utf-8' to this call if the source files are UTF-8.
        with open(files, 'r') as source:
            sourcetext = source.read()
        with open(new_file_name(files), 'w', encoding='utf-8') as destination:
            destination.write(strip_book_chapter_verse(sourcetext))


if __name__ == '__main__':
    main()
#unibook = open('C:/CompLing/GBible/GreekBibleUnicode.txt', 'r', encoding='utf-8')
#xmlbook = open('C:/CompLing/GBible/GreekBibleXML.txt', 'w', encoding='utf-8')
#oldbookname = ''
#oldchapter = ''
#for line in unibook:
# wordnumber = 0
# bookchapterverse = re.search(r'^([A-Za-z0-9]{3}) ([0-9]{1,4}:[0-9]{1,4})', line) #this line extracts the name of the biblical book from the beginning of the line that is being extracted
# bookname = bookchapterverse.group(1)
# chapterverse = bookchapterverse.group(2)
# newline = ''
# colonsplit = re.compile(r':')
# tokenizedline = line.split()
# ampersand = '@'
# endline = re.search(' $', line)
# if endline != None:
# line = line.strip(endline.group(0))
# for word in tokenizedline:
# newword = word
# if word == bookname:
# if bookname != oldbookname:
# if oldbookname != '':
# newword = '</chapter>\n</book>\n<book>' + newword + '\n'#+ '</book>'
# newline = newline + newword
# elif oldbookname == '':
# newword = '<book>' + newword #+ '</book>'
# newline = newline + newword
# elif word == chapterverse:
# words = colonsplit.split(word)
# chapter = words[0]
# verse = words[1]
# if chapter != oldchapter:
# if oldchapter != '':
# newword = '<chapter>' + chapter + '\n<verse>' + verse #+ '</verse>'
# newline = newline + newword
# elif oldchapter =='':
# newword = '<chapter>' + chapter + '\n' + '<verse>' + verse
# newline = newline + newword
# else:
# newword = '<verse>' + verse #+ '</verse>'
# newline = newline + newword
# elif ampersand not in word:
# wordnumber = wordnumber + 1
# wordnumberstring = bookname + '.' + chapterverse.replace(':', '.') + '.' + str(wordnumber)
# newword = '<w id=\'' + wordnumberstring + '\'>' + word + '</w>'
# newline = newline + newword
# else:
# newword = '<POS id=\'' + wordnumberstring + '\'>' + word.replace(ampersand, '') + '</POS>'
# newline = newline + newword
## newline = newline + newword
## if oldchapter != chapter:
## newline = newline + '</verse></chapter>\n'
## else:
# newline = newline + '</verse>\n'
# xmlbook.write(newline)
# oldchapter = chapter
# oldbookname = bookname
#xmlbook.close()
#unibook.close()
27 changes: 27 additions & 0 deletions BookExtract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
'''
Created on 21.08.2012
@author: matthew.munson
'''
import re #imports the regular expressions module
# The text from which we want to extract the information.  A raw string
# keeps the Windows backslashes literal.
SOURCE_PATH = r'C:\CompLing\Jerome\From Hebrew\Corpus\He_gen - est.txt'
# Directory (with trailing slash) in which one file per biblical book is written.
TARGET_DIR = 'C:/CompLing/Jerome/From Hebrew/Corpus/'

# The three character book abbreviation at the beginning of a line.
BOOK_ABBREVIATION = re.compile(r'^[A-Za-z0-9]{3}')


def split_by_book(source_path, target_dir):
    """Copy the lines of *source_path* into one file per biblical book.

    Every line is expected to begin with a three character book
    abbreviation.  Whenever the abbreviation differs from that of the
    previous line, the current target file is closed and
    ``target_dir + abbreviation + '.txt'`` is opened for writing, so an
    existing file of that name is replaced.

    A line without an abbreviation (for example a blank line) is written
    to the book currently open; such lines are dropped when they occur
    before the first book.

    source_path -- path of the file to split (platform default encoding).
    target_dir  -- directory prefix for the book files; it is concatenated
                   with the file name, so it must end with a path separator.
    """
    oldbook = ''
    target = None
    with open(source_path, 'r') as source:
        try:
            for line in source:
                book = BOOK_ABBREVIATION.search(line)
                # A new abbreviation (or the very first one) starts a new file.
                if book is not None and book.group(0) != oldbook:
                    if target is not None:
                        target.close()
                    oldbook = book.group(0)
                    target = open(target_dir + oldbook + '.txt', 'w')
                if target is not None:
                    target.write(line)
        finally:
            # Also reached for an empty source, where no target was ever opened.
            if target is not None:
                target.close()


def main():
    """Split SOURCE_PATH into per-book files and wait for the user."""
    split_by_book(SOURCE_PATH, TARGET_DIR)
    input('Finished! Press return to continue.')  # this is here to show the user when the program is finished.


if __name__ == '__main__':
    main()
76 changes: 76 additions & 0 deletions CollCalcLogLike.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
'''
Created on 14.02.2013
@author: matthew.munson
'''
import re
import pickle
import os
import math
import os.path
import decimal
LEMLIST_PATH = "C:/CompLing/GBible/XML/LemLists/GBibleLemmsNT.txt"
COLL_DIR = "C:/CompLing/GBible/XML/CollLists/NT"

# Context with a very small minimum exponent, used for the powers, products
# and logarithms of probabilities below.
LL_CONTEXT = decimal.Context(Emin = -425000000)

# Replaces a whole line by the count stored in its Lemma cell.
LEMMA_COUNT = re.compile(r'.+?Lemma W=[^\']+\', ([0-9]+).*\n')
# One cell describing the lemma itself: ('P=Lemma W=<word>', <count>)
LEMMA_ENTRY = re.compile(r'\"{,1}\(\'(P=Lemma) (W=[^\']+)\', ([0-9]+)\)\"{,1}')
# One cell describing a collocate: ('P=<L|R><n> W=<word>', <count>)
COLLOCATE_ENTRY = re.compile(r'\"{,1}\(\'(P=[LR][0-9]{1,2}) W=([^\']+)\', ([0-9]+)\)\"{,1}')


def binomial_log10(p, k, n):
    """Return log10(p**k * (1 - p)**(n - k)) evaluated in LL_CONTEXT."""
    return LL_CONTEXT.log10(
        LL_CONTEXT.multiply(LL_CONTEXT.power(p, k), LL_CONTEXT.power(1 - p, n - k)))


def log_likelihood(c1, c2, c12, num):
    """Return the log-likelihood ratio score -2 * log10(lambda) of a collocation.

    c1  -- occurrences of the lemma
    c2  -- occurrences of the collocate
    c12 -- co-occurrences of lemma and collocate
    num -- total number of tokens

    lambda compares "the collocate is equally likely everywhere"
    (probability p) with "the collocate has probability p1 next to the
    lemma and p2 elsewhere".  Base-10 logarithms are used.
    NOTE(review): the usual definition of the statistic uses the natural
    logarithm; base 10 rescales every score by the same constant factor.
    """
    C1 = decimal.Decimal(c1)
    C2 = decimal.Decimal(c2)
    C12 = decimal.Decimal(c12)
    Num = decimal.Decimal(num)
    P = C2 / Num
    P1 = C12 / C1
    P2 = (C2 - C12) / (Num - C1)
    LL1 = binomial_log10(P, C12, C1)
    # Same trial counts as LL4: C2-C12 successes out of Num-C1 trials.
    LL2 = binomial_log10(P, C2 - C12, Num - C1)
    # When P1 is 1 (or P2 is 0) the likelihood term equals 1, so its log is 0;
    # it is not evaluated because it would involve 0**0.
    LL3 = 0 if P1 == 1 else binomial_log10(P1, C12, C1)
    LL4 = 0 if P2 == 0 else binomial_log10(P2, C2 - C12, Num - C1)
    return -2 * (LL1 + LL2 - LL3 - LL4)


def score_line(line, lem_count, lemma_counts, num):
    """Return the output entries for one line of a collocation file.

    line         -- a line of quoted, comma separated cells
    lem_count    -- occurrences of the lemma the file belongs to
    lemma_counts -- mapping from lemma to its number of occurrences
    num          -- total number of tokens

    The lemma cell becomes ``(P=Lemma W=... N=... MI=NA)``; every other
    cell becomes ``(P=... W=... N=... LL=<score>)``.
    """
    entries = []
    line = line.strip(',"\n')
    for collocate in re.split(r'\",+\"', line):
        if 'Lemma' in collocate:
            entries.append(LEMMA_ENTRY.sub(r'(\1 \2 N=\3 MI=NA', collocate) + ')')
            continue
        CollCount = int(COLLOCATE_ENTRY.sub(r'\3', collocate))
        CollLemCount = lemma_counts.get(COLLOCATE_ENTRY.sub(r'\2', collocate), 0)
        LL = log_likelihood(lem_count, CollLemCount, CollCount, num)
        entries.append(COLLOCATE_ENTRY.sub(r'(\1 W=\2 N=\3 LL=', collocate) + str(LL) + ')')
    return entries


def main():
    """Write an ``*_LL.txt`` score file for every collocation file in COLL_DIR."""
    from collections import Counter
    with open(LEMLIST_PATH, mode = 'rb') as lemdoc:
        # NOTE: pickle.load can execute arbitrary code; only load a lemma
        # list that this project produced itself.
        lemlist = pickle.load(lemdoc)
    print(len(lemlist))
    Num = len(lemlist)
    # Count every lemma once instead of scanning the whole list per collocate.
    # Assumes the entries are hashable (they are compared with strings).
    lemma_counts = Counter(lemlist)
    LemCount = 0
    for file in os.listdir(COLL_DIR):
        CollFilename = COLL_DIR + "/" + file
        # Skips sub-directories such as the LL output directory itself.
        if not os.path.isfile(CollFilename):
            continue
        LLFileName = COLL_DIR + "/LL/" + re.sub(r'\.txt$', r'_LL.txt', file)
        if os.path.isfile(LLFileName):  # a score file already exists for this lemma
            continue
        LLCollList = []
        with open(CollFilename, mode = 'r', encoding = 'utf-8') as CollFile:
            for line in CollFile:
                if 'P=Lemma' in line:
                    LemCount = int(LEMMA_COUNT.sub(r'\1', line))
                if '(' in line:
                    LLCollList.extend(score_line(line, LemCount, lemma_counts, Num))
        # Written only after the whole file was processed, so a failure does
        # not leave an empty score file that later runs would skip.
        with open(LLFileName, mode = 'w', encoding = 'utf-8') as LLListFile:
            LLListFile.write(str(LLCollList))


if __name__ == '__main__':
    main()
Loading

0 comments on commit e0f612d

Please sign in to comment.