Skip to content

Commit 78d2b2c

Browse files
committed
Added option: collapse mid-front nasals.
1 parent 74ac51e commit 78d2b2c

File tree

3 files changed

+4451
-4417
lines changed

3 files changed

+4451
-4417
lines changed

clean_corpus.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
######################################################################################
1111

1212
# NOTE: Please verify that all corpora to be processed are located in corpora/corpus_name/raw/
13-
# (one folder per corpus, containing .cha files) and that you have a participants.txt file in
14-
# corpora/corpus_name/ before running this script.
13+
# (one folder per corpus, containing .cha files) before running this script.
1514

1615
import re
1716
import os

compile.py

+53-18
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616

1717
# SET PARAMETERS
1818

19-
# Age limits (XyXm):
19+
### Age limits (XyXm) ###
2020
age_min_input = '0y0m'
2121
age_max_input = '2y0m'
2222

23-
# Define which transcription to load (orthographic or phonological):
23+
### Define which transcription to load (orthographic or phonological) ###
2424
# Note: use option 4 for the phonologized corpus described in Carbajal, Bouchon, Dupoux & Peperkamp (2018)
2525
option = 4
2626

@@ -30,44 +30,46 @@
3030
folder = 'corpora'
3131
filename = '/clean/extract.txt'
3232
outname = 'ortho'
33+
fullname = '*Transcription type: orthographic'
3334
elif option in [2,3,4,5]:
3435
phono_transcript = True
3536
folder = 'output'
3637
if option == 2:
3738
# Phonological transcription with Liaison (no other rules)
3839
filename = '/phonologized_L.txt'
3940
outname = 'phono_L'
41+
fullname = '*Transcription type: phonological\n*Phonological rules: liaison'
4042
elif option == 3:
4143
# Phonological transcription with Liaison and Liquid Deletion
4244
filename = '/phonologized_L_D.txt'
4345
outname = 'phono_L_D'
46+
fullname = '*Transcription type: phonological\n*Phonological rules: liaison and liquid deletion'
4447
elif option == 4:
4548
# Phonological transcription with Liaison, Liquid Deletion and Enchainement plus Je-devoicing (resyllabification)
4649
filename = '/phonologized_L_D_E.txt'
4750
outname = 'phono_L_D_E'
51+
fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion,\n enchainement and je-devoicing'
4852
elif option == 5:
4953
# Phonological transcription with Liaison, Liquid Deletion, Schwa Insertion and Enchainement plus Je-devoicing (resyllabification)
5054
filename = '/phonologized_L_D_S_E.txt'
5155
outname = 'phono_L_D_S_E'
56+
fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion, \n schwa insertion, enchainement and je-devoicing'
57+
58+
59+
### Other options (set to True or False) ###
5260

5361
# Print fileID and age?
5462
printInfo = True
55-
if printInfo:
56-
printInfoTag = ''
57-
else:
58-
printInfoTag = '_noFileInfo'
5963

6064
# Print in lower-case? (IMPORTANT: lower-case cannot be used in phonological transcriptions, because capital letters may denote different phonemes than their lower-case counterparts)
61-
lowerCase = False # Select True or False
62-
if phono_transcript:
63-
lowerCase = False
65+
lowerCase = False
6466

65-
# Remove parentheses? (They mark unpronounced parts of words)
67+
# Remove parentheses? (They mark unpronounced parts of words - only useful in orthographic transcriptions)
6668
removeParentheses = False
67-
if removeParentheses:
68-
removeParenthesesTag = '_noParnths'
69-
else:
70-
removeParenthesesTag = ''
69+
70+
# Collapse nasal oe (symbol "1") and nasal ɛ (symbol "5") into a single category (symbol "5")?
71+
# Note: in many dialects of French, this contrast doesn't exist. E.g., "brun" and "brin" are both pronounced with nasal ɛ.
72+
collapseNasals = True
7173

7274
###########################################################################################
7375
# FUNCTIONS
@@ -100,25 +102,58 @@ def check_age(current_age):
100102

101103
if not os.path.exists('compiled_corpus'):
102104
os.makedirs('compiled_corpus')
103-
105+
106+
# Tags and options info:
107+
if printInfo:
108+
printInfoTag = ''
109+
infofullname = '*Print file ID and age: True'
110+
else:
111+
printInfoTag = '_noFileInfo'
112+
infofullname = '*Print file ID and age: False'
113+
if (not phono_transcript) and removeParentheses:
114+
removeParenthesesTag = '_noParnths'
115+
parenthesischoice = '*Print parentheses: False'
116+
else:
117+
removeParenthesesTag = ''
118+
parenthesischoice = '*Print parentheses: True'
119+
if collapseNasals:
120+
nasalsfullname = '*Collapse mid-front nasals: True'
121+
else:
122+
nasalsfullname = '*Collapse mid-front nasals: False'
123+
104124
# Open output file for writing
105125
f = open('compiled_corpus/corpus_' + outname + '_' + age_min_input + '_' + age_max_input + printInfoTag + removeParenthesesTag + '.txt', 'w')
126+
106127
# List of files
107128
dirlist = [ item for item in os.listdir(folder) if os.path.isdir(os.path.join(folder, item)) ]
108129

130+
# Print options info:
131+
print '\nCompiling corpora with the following options:\n'
132+
print fullname
133+
print '*Age range: ' + age_min_input + ' - ' + age_max_input
134+
print infofullname
135+
if (not phono_transcript):
136+
print parenthesischoice
137+
else:
138+
print nasalsfullname
139+
print '*Included corpora:'
140+
109141
for corpusdir in dirlist:
142+
print ' -' + corpusdir
110143
location = folder + '/' + corpusdir
111144
with open(location + filename) as recoded_file:
112145
for line_ID, line_text in enumerate(recoded_file):
113-
if lowerCase:
146+
if (not phono_transcript) and lowerCase:
114147
line_text = line_text.lower()
115-
if removeParentheses:
148+
if (not phono_transcript) and removeParentheses:
116149
line_text = line_text.replace('(','').replace(')','')
117150
line = line_text.split()
151+
if phono_transcript and collapseNasals:
152+
line[5:] = [text.replace('1','5') for text in line[5:]] # NOT WORKING (to do!)
118153
age = [int(x) for x in line[1:4]]
119154
if check_age(age):
120155
if printInfo:
121-
print >> f, corpusdir + ' ' + line_text.strip()
156+
print >> f, corpusdir + ' ' + ' '.join(line)
122157
else:
123158
print >> f, ' '.join(line[4:])
124159

0 commit comments

Comments
 (0)