First .py commit

This is the first commit where I have brought my actual Python scripts into the repository.
sonofmun · Nov 28, 2013 · e0f612d · e0f612d
1 parent cb90688
commit e0f612d
Show file tree

Hide file tree

Showing 33 changed files with 15,036 additions and 0 deletions.
diff --git a/BWConversionCleanup.py b/BWConversionCleanup.py
@@ -0,0 +1,23 @@
+'''
+Created on 22.08.2012
+
+@author: matthew.munson
+'''
+import re
+#Raw strings keep the Windows backslashes literal; in a normal string "\C", "\G" and "\B" are invalid escape sequences.
+sourcepath = r'C:\CompLing\GBible\BWConversion_Dirty.txt' #put the file to be used as the source here
+destpath = r'C:\CompLing\GBible\BWConversion_Upper.txt' #put the path to the destination file here
+#ignoreline = re.compile(r'(^Origenes sec\.|^Cl\. 0198|^[0-9]{1,4})')
+#The with statement closes both files even if an error interrupts the conversion.
+with open(sourcepath, 'r', encoding='utf-8') as source, open(destpath, 'w', encoding='utf-8') as destination:
+    for line in source:
+        #Every apostrophe already in the text is escaped with a backslash.
+        line = line.replace('\'', '\\\'')
+        #Each tab becomes quote-colon-space-quote, i.e. the separator between a quoted key and a quoted value.
+        line = line.replace('\t', '\': \'')
+        #The line end becomes quote-comma-space-quote, so the whole output ends up on a single line.
+        line = line.replace('\n', '\', \'')
+        destination.write(line.upper())
diff --git a/BWGtoUnicode.py b/BWGtoUnicode.py
@@ -0,0 +1,42 @@
+'''
+Created on 22.08.2012
+
+@author: matthew.munson
+'''
+def BWGtoUnicode(sourcepath, destpath):
+    '''
+    Converts a BibleWorks Greek Beta Code text file to Unicode Greek.
+
+    Every line of the source file is split on runs of spaces.  Words that contain '@' (POS analysis
+    information) and words equal to the three-character code at the start of the line (the book name)
+    are copied unchanged.  In every other word the Beta Code sequences listed in "mapping" are replaced
+    by their Unicode equivalents.  The words are then joined with single spaces and written out.
+
+    sourcepath -- path of the Beta Code file to read (opened with the platform default encoding)
+    destpath -- path of the Unicode file to write (UTF-8; created or overwritten)
+    '''
+    import re
+    #The following mapping assigns the appropriate unicode characters to the existing BibleWorks beta code characters.
+    #It will need to be changed if the source is something other than BibleWorks Greek.
+    #NOTE(review): the keys 'i\'' and 'u\'' each appear twice below.  The later entry wins, so 'ἲ' and 'ὒ'
+    #are never produced; one key of each pair is probably a typo.  Confirm against the BibleWorks key table.
+    mapping = dict({'P': 'Π', 'a': 'α', 'b': 'β', 'R': 'Ρ',
+        'a`': 'ἁ', 'a[': 'ἅ', 'a[|': 'ᾅ', 'a|[': 'ᾅ', 'a]': 'ἃ', 'a]|': 'ᾃ', 'a-': 'ἇ', 'a-|': 'ᾇ', 'a`|': 'ᾁ', 'av': 'ἀ', 'a;': 'ἄ', 'a;|': 'ᾄ', 'a\'': 'ἂ', 'a\'|': 'ᾂ', 'a=': 'ἆ', 'a=|': 'ᾆ', 'av|': 'ᾀ', 'a,': 'ά', 'a,|': 'ᾴ', 'a/': 'ᾶ', 'a/|': 'ᾷ', 'a|': 'ᾳ', 'a.': '\u1F70',
+        'e`': 'ἑ', 'e[': 'ἕ', 'e]': 'ἓ', 'ev': 'ἐ', 'e;': 'ἔ', 'e\'': 'ἒ', 'e,': 'έ', 'e.': '\u1F72',
+        'h`': 'ἡ', 'h[': 'ἥ', 'h[|': 'ᾕ', 'h]': 'ἣ', 'h]|': 'ᾓ', 'h-': 'ἧ', 'h-|': 'ᾗ', 'h`|': 'ᾑ', 'hv': 'ἠ', 'h;': 'ἤ', 'h;|': 'ᾔ', 'h\'': 'ἢ', 'h\'|': 'ᾒ', 'h=': 'ἦ', 'h=|': 'ᾖ', 'hv|': 'ᾐ', 'h,': 'ή', 'h,|': 'ῄ', 'h/': 'ῆ', 'h/|': 'ῇ', 'h|': 'ῃ', 'h.': '\u1F74',
+        'i`': 'ἱ', 'i[': 'ἵ', 'i]': 'ἳ', 'i-': 'ἷ', 'iv': 'ἰ', 'i;': 'ἴ', 'i\'': 'ἲ', 'i=': 'ἶ', 'i,': 'ί', 'i<': 'ΐ', 'i>': 'ῒ', 'i?': 'ϊ', 'i/': 'ῖ', 'i\'': 'ῗ', 'i.': '\u1F76',
+        'o`': 'ὁ', 'o[': 'ὅ', 'o]': 'ὃ', 'ov': 'ὀ', 'o;': 'ὄ', 'o\'': 'ὂ', 'o,': 'ό', 'o.': '\u1F78',
+        'r`': 'ῥ', 'rv': 'ῤ',
+        'u`': 'ὑ', 'u[': 'ὕ', 'u]': 'ὓ', 'u-': 'ὗ', 'uv': 'ὐ', 'u;': 'ὔ', 'u\'': 'ὒ', 'u=': 'ὖ', 'u,': 'ύ', 'u<': 'ΰ', 'u>': 'ῢ', 'u?': 'ϋ', 'u/': 'ῦ', 'u\'': 'ῧ', 'u.': '\u1F7A',
+        'w`': 'ὡ', 'w[': 'ὥ', 'w[|': 'ᾥ', 'w]': 'ὣ', 'w]|': 'ᾣ', 'w-': 'ὧ', 'w-|': 'ᾧ', 'w`|': 'ᾡ', 'wv': 'ὠ', 'w;': 'ὤ', 'w;|': 'ᾤ', 'w\'': 'ὢ', 'w\'|': 'ᾢ', 'w=': 'ὦ', 'w=|': 'ᾦ', 'wv|': 'ᾠ', 'w,': 'ώ', 'w/': 'ῶ', 'w/|': 'ῷ', 'w|': 'ῳ', 'w,|': 'ῴ', 'w.': '\u1F7C',
+        'A': 'Α', 'a': 'α', 'B': 'Β', 'b': 'β', 'G': 'Γ', 'g': 'γ', 'D': 'Δ', 'd': 'δ', 'E': 'Ε', 'e': 'ε', 'Z': 'Ζ', 'z': 'ζ', 'H': 'Η', 'h': 'η', 'Q': 'Θ', 'q': 'θ', 'I': 'Ι', 'i': 'ι', 'K': 'Κ', 'k': 'κ', 'L': 'Λ', 'l': 'λ', 'M': 'Μ', 'm': 'μ', 'N': 'Ν', 'n': 'ν', 'X': 'Ξ', 'x': 'ξ', 'O': 'Ο', 'o': 'ο', 'P': 'Π', 'p': 'π', 'R': 'Ρ', 'r': 'ρ', 'S': 'Σ', 's': 'σ', 'j': 'ς', 'T': 'Τ', 't': 'τ', 'U': 'Υ', 'u': 'υ', 'F': 'Φ', 'f': 'φ', 'C': 'Χ', 'c': 'χ', 'Y': 'Ψ', 'y': 'ψ', 'W': 'Ω', 'w': 'ω',
+        '~A': 'Ἁ', '{A': 'Ἅ', '{A|': 'ᾍ', '}A': 'Ἃ', '}A|': 'ᾋ', '_A': 'Ἇ', '_A|': 'ᾏ', '~A|': 'ᾉ', 'VA': 'Ἀ', ':A': 'Ἄ', ':A|': 'ᾌ', '"A': 'Ἂ', '"A|': 'ᾊ', '+A': 'Ἆ', '+A|': 'ᾎ', 'VA|': 'ᾈ', '<A': 'Ά',
+        '~E': 'Ἑ', '{E': 'Ἕ', '}E': 'Ἓ', 'VE': 'Ἐ', ':E': 'Ἔ', '"E': 'Ἒ', '<E': 'Έ',
+        '~H': 'Ἡ', '{H': 'Ἥ', '{H|': 'ᾝ', '}H': 'Ἣ', '}H|': 'ᾛ', '_H': 'Ἧ', '_H|': 'ᾟ', '~H|': 'ᾙ', 'VH': 'Ἠ', ':H': 'Ἤ', ':H|': 'ᾜ', 'H': 'Η', '"H|': 'ᾚ', '+H': 'Ἦ', '+H|': 'ᾞ', 'VH|': 'ᾘ', '<H': 'Ή', 'H|': 'ῌ',
+        '~I': 'Ἱ', '{I': 'Ἵ', '}I': 'Ἳ', '_I': 'Ἷ', 'VI': 'Ἰ', ':I': 'Ἴ', '"I': 'Ἲ', '+I': 'Ἶ', 'I?': 'Ϊ',
+        '~O': 'Ὁ', '{O': 'Ὅ', '}O': 'Ὃ', 'VO': 'Ὀ', ':O': 'Ὄ', '"O': 'Ὂ',
+        '~R': '\u1FEC',
+        '~U': 'Ὑ', '{U': 'Ὕ', '}U': 'Ὓ', '_U': 'Ὗ', 'VU': 'Ύ', '?U': 'Ϋ',
+        '~W': 'Ὡ', '{W': 'Ὥ', '{W|': 'ᾭ', '}W': 'Ὣ', '}W|': 'ᾫ', '_W': 'Ὧ', '_W|': 'ᾯ', '~W|': 'ᾩ', 'VW': 'Ὠ', ':W': 'Ὤ', ':W|': 'ᾬ', '"W': 'Ὢ', '"W|': 'ᾪ', '+W': 'Ὦ', '+W|': 'ᾮ', 'VW|': 'ᾨ', 'W|': 'ῼ',
+        'V': '\u1FBD', 'Å': '.', '\\': '\u0387', '(': ',', ')': '.', 'È': ';', '&': '-', '^': '*', '%': ')', '$': '(', '#': ']', '@': '[', '!': '+', 'Î': '[', 'Ð': ']', '¹': '\"'})
+    #The longest keys are tried first so that a sequence such as 'a[|' is replaced before its prefixes 'a[' and 'a'.
+    #sorted() is stable, so keys of equal length keep the order in which they appear in the mapping.
+    keys = sorted(mapping, key = len, reverse = True)
+    #The following pattern extracts the book information from the start of a line.
+    #It will only work for something set up like BibleWorks output that has a 3 letter book name, followed by a space, then the chapter number, a colon, then the verse number.
+    #If the book, chapter, and verse information are in a different format, the regular expression will need to be changed.
+    bookpattern = re.compile(r'^[A-Za-z0-9]{3}')
+    spaces = re.compile(r' +')
+    atsign = '@'
+    #The with statement closes both files even if the conversion fails part way through.
+    with open(sourcepath, 'r') as book, open(destpath, 'w', encoding='utf-8') as unibook:
+        for line in book:
+            bookname = bookpattern.search(line)
+            #Blank or malformed lines have no book code.  Such lines used to raise AttributeError on bookname.group(0).
+            bookcode = bookname.group(0) if bookname else None
+            newline = ''
+            for word in spaces.split(line):
+                newword = word
+                #POS analysis information (marked with '@') and the name of the biblical book are not converted to Greek characters.
+                if atsign not in word and word != bookcode:
+                    for key in keys:
+                        newword = newword.replace(key, mapping[key])
+                if newline == '':
+                    newline = newword
+                else:
+                    newline = newline + ' ' + newword
+            #The last word still carries the line's own line ending, so none is added here.
+            unibook.write(newline)
diff --git a/BWtoXML.py b/BWtoXML.py
@@ -0,0 +1,23 @@
+'''
+Created on 19.12.2012
+
+This script takes a .txt file input in BibleWorks Greek Beta Code with lemma information and companion text
+and converts it to XML.
+It requires that the BWGtoUnicode.py and the UnicodeToXMLLemmas.py are in the same directory from which 
+this file is executed.
+
+@author: matthew.munson
+'''
+import re
+import BWGtoUnicode
+import UnicodeToXMLLemmas
+import os
+#The doubled backslash prints a single backslash; a lone "\ " is an invalid escape sequence.
+print("Please use forward slashes / instead of backslashes \\ in your paths.")
+sourcepath = input("What is the full path of the BW Beta Code source file (should be a .txt file)?")
+destpath = input("What is the full path of the directory where you wish your XML Unicode destination file to be created?")
+#os.path.splitext removes only the final extension.  The old pattern ".[txTX]{3}" was not anchored to the end
+#of the path, so it could also delete text such as "/txt" from the middle of it.
+rawpath = os.path.splitext(sourcepath)[0]
+#os.path.basename keeps the complete file name.  The old "\w+" search cut names containing hyphens or spaces
+#down to their last word and crashed when nothing matched.
+rawfilename = os.path.basename(rawpath)
+#The intermediate Unicode file is written next to the source file; the XML file goes into the chosen directory.
+unicodedest = rawpath + "Unicode.txt"
+xmldest = destpath + "/" +  rawfilename + "XML" + ".txt"
+BWGtoUnicode.BWGtoUnicode(sourcepath, unicodedest)
+UnicodeToXMLLemmas.UnicodeToXML(unicodedest,xmldest)
diff --git a/BookChapterVerseDelete.py b/BookChapterVerseDelete.py
@@ -0,0 +1,92 @@
+'''
+Created on 22.08.2012
+
+@author: matthew.munson
+'''
+import re
+import os
+#The raw string keeps the Windows backslashes literal ("\C", "\J", "\E" and "\Z" are invalid escape sequences in a normal string).
+os.chdir(r'C:\CompLing\Jerome\Experiments\Zeta\EpistleVSHistorical\Corpus') #put the path to the destination files here
+#This pattern matches the three-character book name and the chapter:verse reference at the beginning of a line.
+#It is compiled once here instead of once per line.
+bookchapterverse = re.compile(r'^([A-Za-z0-9]{3}) ([0-9]{1,4}:[0-9]{1,4})')
+#Disabled filters, kept for reference.  The footnote filter was written specifically for Jerome's Epistula ad Damasum
+#and may not work for everything.
+#footnotedel = re.compile(r'^[0-9]{1,3}.+?\n', re.S)
+#notedel = re.compile(r'[\[\(\|].+?[\]\)\|]', re.S)
+#titledel = re.compile(r'^EPISTOLA.+?\n', re.M)
+#epistleintrodel = re.compile(r'^MONITUM.+?^EPISTOLA.+?\n', flags = re.S | re.M)
+for files in os.listdir("."):
+    #Only text files are processed.  Previously an empty "_new.txt" file was created for every other directory entry as well.
+    if not files.endswith(('.txt', '.TXT')):
+        continue
+    #os.path.splitext removes just the extension.  rstrip('.txtTXT') strips any trailing run of those
+    #characters, so it turned "Act.txt" into "Ac".
+    filebase = os.path.splitext(files)[0]
+    destfiles = filebase + '_new.txt'
+    #The source is opened with the platform default encoding; add encoding='utf-8' to the first open() if the source is utf-8.
+    with open(files, 'r') as source, open(destfiles, 'w', encoding='utf-8') as destination:
+        #Iterating over the file yields whole lines.  The old code iterated over the result of source.read(),
+        #which yields single characters, so the pattern never matched and nothing was deleted.
+        for line in source:
+            destination.write(bookchapterverse.sub('', line))
+#unibook = open('C:/CompLing/GBible/GreekBibleUnicode.txt', 'r', encoding='utf-8')
+#xmlbook = open('C:/CompLing/GBible/GreekBibleXML.txt', 'w', encoding='utf-8')
+#oldbookname = ''
+#oldchapter = ''
+#for line in unibook:
+#    wordnumber = 0
+#    bookchapterverse = re.search(r'^([A-Za-z0-9]{3}) ([0-9]{1,4}:[0-9]{1,4})', line) #this line extracts the name of the biblical book from the beginning of the line that is being extracted
+#    bookname = bookchapterverse.group(1)
+#    chapterverse = bookchapterverse.group(2)
+#    newline = ''
+#    colonsplit = re.compile(r':')
+#    tokenizedline = line.split()
+#    ampersand = '@'
+#    endline = re.search(' $', line)
+#    if endline != None:
+#        line = line.strip(endline.group(0))
+#    for word in tokenizedline:
+#        newword = word
+#        if word == bookname:
+#            if bookname != oldbookname:
+#                if oldbookname != '':
+#                    newword = '</chapter>\n</book>\n<book>' + newword + '\n'#+ '</book>'
+#                    newline = newline + newword
+#                elif oldbookname == '':
+#                    newword = '<book>' + newword #+ '</book>'
+#                    newline = newline + newword
+#        elif word == chapterverse:
+#            words = colonsplit.split(word)
+#            chapter = words[0]
+#            verse = words[1]
+#            if chapter != oldchapter:
+#                if oldchapter != '':
+#                    newword = '<chapter>' + chapter + '\n<verse>' + verse #+ '</verse>'
+#                    newline = newline + newword
+#                elif oldchapter =='':
+#                    newword = '<chapter>' + chapter + '\n' + '<verse>' + verse
+#                    newline = newline + newword
+#            else:
+#                newword = '<verse>' + verse #+ '</verse>'
+#                newline = newline + newword
+#        elif ampersand not in word:
+#            wordnumber = wordnumber + 1
+#            wordnumberstring = bookname + '.' + chapterverse.replace(':', '.') + '.' + str(wordnumber)
+#            newword = '<w id=\'' + wordnumberstring + '\'>' + word + '</w>'
+#            newline = newline + newword
+#        else:
+#            newword = '<POS id=\'' + wordnumberstring + '\'>' + word.replace(ampersand, '') + '</POS>'
+#            newline = newline + newword
+##        newline = newline + newword
+##    if oldchapter != chapter:
+##        newline = newline + '</verse></chapter>\n'
+##    else:
+#    newline = newline + '</verse>\n'
+#    xmlbook.write(newline)
+#    oldchapter = chapter
+#    oldbookname = bookname
+#xmlbook.close()
+#unibook.close()
diff --git a/BookExtract.py b/BookExtract.py
@@ -0,0 +1,27 @@
+'''
+Created on 21.08.2012
+
+@author: matthew.munson
+'''
+import re #imports the regular expressions module
+#The raw string keeps the Windows backslashes literal ("\C", "\J", "\F" and "\H" are invalid escape sequences in a normal string).
+sourcepath = r'C:\CompLing\Jerome\From Hebrew\Corpus\He_gen - est.txt' #the text from which we want to extract the information
+bookpattern = re.compile(r'^[A-Za-z0-9]{3}') #extracts the name of the biblical book from the beginning of a line
+oldbook = '' #the abbreviation of the book that is currently being written
+target = None #the file for the current book; None until the first book abbreviation has been seen
+#The source is only read, so it is opened with 'r' instead of 'r+'.
+with open(sourcepath, 'r') as source:
+    try:
+        for line in source: #this for loop loops over every line in the document and writes it to the file of its book
+            book = bookpattern.search(line)
+            #A new target file is opened for the first line and whenever the book abbreviation changes.
+            #Lines without an abbreviation (e.g. blank lines) used to raise AttributeError on book.group(0).
+            if book is not None and book.group(0) != oldbook:
+                if target is not None:
+                    target.close() #this closes the previous target file so that the lines from the new biblical book can be written to a new file
+                oldbook = book.group(0)
+                file = 'C:/CompLing/Jerome/From Hebrew/Corpus/' + oldbook + '.txt' #the target file is named after the book abbreviation
+                target = open(file, 'w')
+            #Lines without an abbreviation stay with the current book; any that come before the first book are dropped.
+            if target is not None:
+                target.write(line)
+    finally:
+        #An empty source never opens a target; the old code then called close() on the empty string and crashed.
+        if target is not None:
+            target.close() #closes the last target file used
+input('Finished! Press return to continue.') #this is here to show the user when the program is finished.
diff --git a/CollCalcLogLike.py b/CollCalcLogLike.py
@@ -0,0 +1,76 @@
+'''
+Created on 14.02.2013
+
+@author: matthew.munson
+'''
+import re
+import pickle
+import os
+import math
+import os.path
+import decimal
+def _log_likelihood(context, k, n, x):
+    '''Returns log10 of the binomial likelihood x**k * (1-x)**(n-k), computed in the given decimal context.'''
+    return context.log10(context.power(x, k) * context.power(1 - x, n - k))
+
+#One context with a very small Emin is shared by all power and log10 calls instead of building a new one for each call.
+LLContext = decimal.Context(Emin = -425000000)
+CollDir = "C:/CompLing/GBible/XML/CollLists/NT/"
+#These patterns pick the counts and words out of entries that look like ('P=L1 W=word', 12).
+LemCountPattern = re.compile(r'.+?Lemma W=[^\']+\', ([0-9]+).*\n')
+SeparatorPattern = re.compile(r'\",+\"')
+LemmaPattern = re.compile(r'\"{,1}\(\'(P=Lemma) (W=[^\']+)\', ([0-9]+)\)\"{,1}')
+CollCountPattern = re.compile(r'\"{,1}\(\'P=[LR][0-9]{1,2} W=[^\']+\', ([0-9]+)\)\"{,1}')
+CollWordPattern = re.compile(r'\"{,1}\(\'P=[LR][0-9]{1,2} W=([^\']+)\', [0-9]+\)\"{,1}')
+CollPattern = re.compile(r'\"{,1}\(\'(P=[LR][0-9]{1,2}) (W=[^\']+)\', ([0-9]+)\)\"{,1}')
+#NOTE: pickle.load can execute arbitrary code contained in the file; only load lemma lists that you created yourself.
+with open("C:/CompLing/GBible/XML/LemLists/GBibleLemmsNT.txt", mode = 'rb') as lemdoc:
+    lemlist = pickle.load(lemdoc)
+Num = len(lemlist)
+print(Num)
+LemCount = 0
+CollLemCounts = {} #remembers lemlist.count() results so that every word is counted only once
+#CollPos = ['L10', 'L9', 'L8', 'L7', 'L6', 'L5', 'L4', 'L3', 'L2', 'L1', 'Lemma', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10']
+for file in os.listdir(CollDir):
+    CollFilename = CollDir + file
+    #The LL output directory is itself an entry of CollDir; directories cannot be opened as collocation files.
+    if not os.path.isfile(CollFilename):
+        continue
+    #The dot is escaped and the pattern anchored so that only the file extension is replaced.
+    LLFileName = CollDir + "LL/" + re.sub(r'\.txt$', r'_LL.txt', file)
+    if os.path.isfile(LLFileName): #checks to see if a log-likelihood file already exists for this lemma
+        continue #if it exists, the next file in the directory is selected
+    LLCollList = []
+    with open(CollFilename, mode = 'r', encoding = 'utf-8') as CollFile:
+        for line in CollFile:
+            if 'P=Lemma' in line:
+                LemCount = int(LemCountPattern.sub(r'\1', line))
+            if '(' not in line:
+                continue
+            for collocate in SeparatorPattern.split(line.strip(',"\n')):
+                if SeparatorPattern.match(collocate):
+                    continue
+                if 'Lemma' in collocate:
+                    #The lemma itself gets no score; it is copied with MI=NA.
+                    LLCollList.append(LemmaPattern.sub(r'(\1 \2 N=\3 MI=NA', collocate) + ')')
+                    continue
+                CollCount = int(CollCountPattern.sub(r'\1', collocate))
+                CollWord = CollWordPattern.sub(r'\1', collocate)
+                if CollWord not in CollLemCounts:
+                    CollLemCounts[CollWord] = lemlist.count(CollWord)
+                C1 = decimal.Decimal(LemCount) #occurrences of the lemma
+                C2 = decimal.Decimal(CollLemCounts[CollWord]) #occurrences of the collocate in the whole lemma list
+                C12 = decimal.Decimal(CollCount) #occurrences of the collocate at this position next to the lemma
+                P = C2/Num
+                P1 = C12/C1
+                P2 = (C2 - C12)/(Num-C1)
+                LL1 = _log_likelihood(LLContext, C12, C1, P)
+                #The second exponent is (Num-C1)-(C2-C12), as in LL4.  It used to be Num-C1-C2-C12, which
+                #subtracted C12 instead of adding it back.
+                LL2 = _log_likelihood(LLContext, C2-C12, Num-C1, P)
+                #P1 == 1 and P2 == 0 would need 0**0, which decimal rejects; the likelihood is 1 there, so its logarithm is 0.
+                LL3 = 0 if P1 == 1 else _log_likelihood(LLContext, C12, C1, P1)
+                LL4 = 0 if P2 == 0 else _log_likelihood(LLContext, C2-C12, Num-C1, P2)
+                #NOTE(review): the logarithms are base 10, so LL is the natural-log statistic divided by ln(10).
+                #Confirm that this scale is intended before comparing LL with chi-square critical values.
+                LL = -2*(LL1+LL2-LL3-LL4)
+                LLCollList.append(CollPattern.sub(r'(\1 \2 N=\3 LL=', collocate) + str(LL) + ')')
+    #The output file is created only after the whole input has been processed.  A failure part way through
+    #therefore leaves no empty file behind that would make the next run skip this lemma.
+    with open(LLFileName, mode = 'w', encoding = 'utf-8') as LLListFile:
+        LLListFile.write(str(LLCollList))