|
16 | 16 |
|
17 | 17 | # SET PARAMETERS
|
18 | 18 |
|
19 |
| -# Age limits (XyXm): |
| 19 | +### Age limits (XyXm) ### |
20 | 20 | age_min_input = '0y0m'
|
21 | 21 | age_max_input = '2y0m'
|
22 | 22 |
|
23 |
| -# Define which transcription to load (orthographic or phonological): |
| 23 | +### Define which transcription to load (orthographic or phonological) ### |
24 | 24 | # Note: use option 4 for the phonologized corpus described in Carbajal, Bouchon, Dupoux & Peperkamp (2018)
|
25 | 25 | option = 4
|
26 | 26 |
|
|
30 | 30 | folder = 'corpora'
|
31 | 31 | filename = '/clean/extract.txt'
|
32 | 32 | outname = 'ortho'
|
| 33 | + fullname = '*Transcription type: orthographic' |
33 | 34 | elif option in [2,3,4,5]:
|
34 | 35 | phono_transcript = True
|
35 | 36 | folder = 'output'
|
36 | 37 | if option == 2:
|
37 | 38 | # Phonological transcription with Liaison (no other rules)
|
38 | 39 | filename = '/phonologized_L.txt'
|
39 | 40 | outname = 'phono_L'
|
| 41 | + fullname = '*Transcription type: phonological\n*Phonological rules: liaison' |
40 | 42 | elif option == 3:
|
41 | 43 | # Phonological transcription with Liaison and Liquid Deletion
|
42 | 44 | filename = '/phonologized_L_D.txt'
|
43 | 45 | outname = 'phono_L_D'
|
| 46 | + fullname = '*Transcription type: phonological\n*Phonological rules: liaison and liquid deletion' |
44 | 47 | elif option == 4:
|
45 | 48 | # Phonological transcription with Liaison, Liquid Deletion and Enchainement plus Je-devoicing (resyllabification)
|
46 | 49 | filename = '/phonologized_L_D_E.txt'
|
47 | 50 | outname = 'phono_L_D_E'
|
| 51 | + fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion,\n enchainement and je-devoicing' |
48 | 52 | elif option == 5:
|
49 | 53 | # Phonological transcription with Liaison, Liquid Deletion, Schwa Insertion and Enchainement plus Je-devoicing (resyllabification)
|
50 | 54 | filename = '/phonologized_L_D_S_E.txt'
|
51 | 55 | outname = 'phono_L_D_S_E'
|
| 56 | + fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion, \n schwa insertion, enchainement and je-devoicing' |
| 57 | + |
| 58 | + |
| 59 | +### Other options (set to True or False) ### |
52 | 60 |
|
53 | 61 | # Print fileID and age?
|
54 | 62 | printInfo = True
|
55 |
| -if printInfo: |
56 |
| - printInfoTag = '' |
57 |
| -else: |
58 |
| - printInfoTag = '_noFileInfo' |
59 | 63 |
|
60 | 64 | # Print in lower-case? (IMPORTANT: lower-casing is not applied to phonological transcriptions, because upper- and lower-case letters can denote different phonemes; the setting is ignored for them)
|
61 |
| -lowerCase = False # Select True or False |
62 |
| -if phono_transcript: |
63 |
| - lowerCase = False |
| 65 | +lowerCase = False |
64 | 66 |
|
65 |
| -# Remove parentheses? (They mark unpronounced parts of words) |
| 67 | +# Remove parentheses? (They mark unpronounced parts of words - only useful in orthographic transcriptions) |
66 | 68 | removeParentheses = False
|
67 |
| -if removeParentheses: |
68 |
| - removeParenthesesTag = '_noParnths' |
69 |
| -else: |
70 |
| - removeParenthesesTag = '' |
| 69 | + |
| 70 | +# Collapse nasal oe (symbol "1") and nasal ɛ (symbol "5") into a single category (symbol "5")? |
| 71 | +# Note: in many dialects of French, this contrast doesn't exist. E.g., "brun" and "brin" are both pronounced with nasal ɛ. |
| 72 | +collapseNasals = True |
71 | 73 |
|
72 | 74 | ###########################################################################################
|
73 | 75 | # FUNCTIONS
|
@@ -100,25 +102,58 @@ def check_age(current_age):
|
100 | 102 |
|
101 | 103 | if not os.path.exists('compiled_corpus'):
|
102 | 104 | os.makedirs('compiled_corpus')
|
103 |
| - |
| 105 | + |
| 106 | +# Tags and options info: |
| 107 | +if printInfo: |
| 108 | + printInfoTag = '' |
| 109 | + infofullname = '*Print file ID and age: True' |
| 110 | +else: |
| 111 | + printInfoTag = '_noFileInfo' |
| 112 | + infofullname = '*Print file ID and age: False' |
| 113 | +if (not phono_transcript) and removeParentheses: |
| 114 | + removeParenthesesTag = '_noParnths' |
| 115 | + parenthesischoice = '*Print parentheses: False' |
| 116 | +else: |
| 117 | + removeParenthesesTag = '' |
| 118 | + parenthesischoice = '*Print parentheses: True' |
| 119 | +if collapseNasals: |
| 120 | + nasalsfullname = '*Collapse mid-front nasals: True' |
| 121 | +else: |
| 122 | + nasalsfullname = '*Collapse mid-front nasals: False' |
| 123 | + |
104 | 124 | # Open output file for writing
|
105 | 125 | f = open('compiled_corpus/corpus_' + outname + '_' + age_min_input + '_' + age_max_input + printInfoTag + removeParenthesesTag + '.txt', 'w')
|
| 126 | + |
106 | 127 | # List of files
|
107 | 128 | dirlist = [ item for item in os.listdir(folder) if os.path.isdir(os.path.join(folder, item)) ]
|
108 | 129 |
|
| 130 | +# Print options info: |
| 131 | +print '\nCompiling corpora with the following options:\n' |
| 132 | +print fullname |
| 133 | +print '*Age range: ' + age_min_input + ' - ' + age_max_input |
| 134 | +print infofullname |
| 135 | +if (not phono_transcript): |
| 136 | + print parenthesischoice |
| 137 | +else: |
| 138 | + print nasalsfullname |
| 139 | +print '*Included corpora:' |
| 140 | + |
109 | 141 | for corpusdir in dirlist:
|
| 142 | + print ' -' + corpusdir |
110 | 143 | location = folder + '/' + corpusdir
|
111 | 144 | with open(location + filename) as recoded_file:
|
112 | 145 | for line_ID, line_text in enumerate(recoded_file):
|
113 |
| - if lowerCase: |
| 146 | + if (not phono_transcript) and lowerCase: |
114 | 147 | line_text = line_text.lower()
|
115 |
| - if removeParentheses: |
| 148 | + if (not phono_transcript) and removeParentheses: |
116 | 149 | line_text = line_text.replace('(','').replace(')','')
|
117 | 150 | line = line_text.split()
|
| 151 | + if phono_transcript and collapseNasals: |
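| 152 | + line[5:] = [text.replace('1','5') for text in line[5:]] # TODO: not working yet. NOTE(review): the slice starts at index 5, but the utterance is printed from line[4:] below, so the first token may be skipped - confirm the offset |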
118 | 153 | age = [int(x) for x in line[1:4]]
|
119 | 154 | if check_age(age):
|
120 | 155 | if printInfo:
|
121 |
| - print >> f, corpusdir + ' ' + line_text.strip() |
| 156 | + print >> f, corpusdir + ' ' + ' '.join(line) |
122 | 157 | else:
|
123 | 158 | print >> f, ' '.join(line[4:])
|
124 | 159 |
|
|
0 commit comments