Added option: collapse mid-front nasals.

juliacarbajal · juliacarbajal · commit 78d2b2c1383a · 2018-11-21T14:45:54.000+01:00
diff --git a/clean_corpus.py b/clean_corpus.py
@@ -10,8 +10,7 @@
 ######################################################################################
 
 # NOTE: Please verify that all corpora to be processed are located in corpora/corpus_name/raw/
-# (one folder per corpus, containing .cha files) and that you have a participants.txt file in
-# corpora/corpus_name/ before running this script.
+# (one folder per corpus, containing .cha files) before running this script.
 
 import re
 import os
diff --git a/compile.py b/compile.py
@@ -16,11 +16,11 @@
 
 # SET PARAMETERS
 
-# Age limits (XyXm):
+### Age limits (XyXm) ###
 age_min_input = '0y0m'
 age_max_input = '2y0m'
 
-# Define which transcription to load (orthographic or phonological):
+### Define which transcription to load (orthographic or phonological) ###
 # Note: use option 4 for the phonologized corpus described in Carbajal, Bouchon, Dupoux & Peperkamp (2018)
 option = 4
 
@@ -30,44 +30,46 @@
 	folder = 'corpora'
 	filename = '/clean/extract.txt'
 	outname = 'ortho'
+	fullname = '*Transcription type: orthographic'
 elif option in [2,3,4,5]:
 	phono_transcript = True
 	folder = 'output'
 	if option == 2:
 		# Phonological transcription with Liaison (no other rules)
 		filename = '/phonologized_L.txt'
 		outname = 'phono_L'
+		fullname = '*Transcription type: phonological\n*Phonological rules: liaison'
 	elif option == 3:
 		# Phonological transcription with Liaison and Liquid Deletion
 		filename = '/phonologized_L_D.txt'
 		outname = 'phono_L_D'
+		fullname = '*Transcription type: phonological\n*Phonological rules: liaison and liquid deletion'
 	elif option == 4:
 		# Phonological transcription with Liaison, Liquid Deletion and Enchainement plus Je-devoicing (resyllabification)
 		filename = '/phonologized_L_D_E.txt'
 		outname = 'phono_L_D_E'
+		fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion,\n enchainement and je-devoicing'
 	elif option == 5:
 		# Phonological transcription with Liaison, Liquid Deletion, Schwa Insertion and Enchainement plus Je-devoicing (resyllabification)
 		filename = '/phonologized_L_D_S_E.txt'
 		outname = 'phono_L_D_S_E'
+		fullname = '*Transcription type: phonological\n*Phonological rules: liaison, liquid deletion, \n schwa insertion, enchainement and je-devoicing'
+
+		
+### Other options (set to True or False) ###
 
 # Print fileID and age?
 printInfo = True
-if printInfo:
-	printInfoTag = ''
-else:
-	printInfoTag = '_noFileInfo'
 
 # Print in lower-case? (IMPORTANT: lower-case cannot be used in phonological transcriptions as capital letters may mean different phonemes)
-lowerCase = False # Select True or False
-if phono_transcript:
-	lowerCase = False
+lowerCase = False
 
-# Remove parentheses? (They mark unpronounced parts of words)
+# Remove parentheses? (They mark unpronounced parts of words - only useful in orthographic transcriptions)
 removeParentheses = False
-if removeParentheses:
-	removeParenthesesTag = '_noParnths'
-else:
-	removeParenthesesTag = ''
+
+# Collapse nasal oe (symbol "1") and nasal ɛ (symbol "5") into a single category (symbol "5")?
+# Note: in many dialects of French, this contrast doesn't exist. E.g., "brun" and "brin" are both pronounced with nasal ɛ.
+collapseNasals = True
 
 ###########################################################################################
 # FUNCTIONS
@@ -100,25 +102,58 @@ def check_age(current_age):
 
 if not os.path.exists('compiled_corpus'):
 	os.makedirs('compiled_corpus')
-	
+
+# Tags and options info:
+if printInfo:
+	printInfoTag = ''
+	infofullname = '*Print file ID and age: True'
+else:
+	printInfoTag = '_noFileInfo'
+	infofullname = '*Print file ID and age: False'
+if (not phono_transcript) and removeParentheses:
+	removeParenthesesTag = '_noParnths'
+	parenthesischoice = '*Print parentheses: False'
+else:
+	removeParenthesesTag = ''
+	parenthesischoice = '*Print parentheses: True'
+if collapseNasals:
+	nasalsfullname = '*Collapse mid-front nasals: True'
+else:
+	nasalsfullname = '*Collapse mid-front nasals: False'
+
 # Open output file for writing
 f = open('compiled_corpus/corpus_' + outname + '_' + age_min_input + '_' + age_max_input + printInfoTag + removeParenthesesTag + '.txt', 'w')
+
 # List of files
 dirlist = [ item for item in os.listdir(folder) if os.path.isdir(os.path.join(folder, item)) ]
 
+# Print options info:
+print '\nCompiling corpora with the following options:\n'
+print fullname
+print '*Age range: ' + age_min_input + ' - ' + age_max_input
+print infofullname
+if (not phono_transcript):
+	print parenthesischoice
+else:
+	print nasalsfullname
+print '*Included corpora:'
+	
 for corpusdir in dirlist:
+	print ' -' + corpusdir
 	location = folder + '/' + corpusdir
 	with open(location + filename) as recoded_file:
 		for line_ID, line_text in enumerate(recoded_file):
-			if lowerCase:
+			if (not phono_transcript) and lowerCase:
 				line_text = line_text.lower()
-			if removeParentheses:
+			if (not phono_transcript) and removeParentheses:
 				line_text = line_text.replace('(','').replace(')','')
 			line = line_text.split()
+			if phono_transcript and collapseNasals:
+				line[5:] = [text.replace('1','5') for text in line[5:]] # NOT WORKING (to do!)
 			age = [int(x) for x in line[1:4]]
 			if check_age(age):
 				if printInfo:
-					print >> f, corpusdir + ' ' + line_text.strip()
+					print >> f, corpusdir + ' ' + ' '.join(line)
 				else:
 					print >> f, ' '.join(line[4:])
 
diff --git a/compiled_corpus/corpus_phono_L_D_E_0y0m_2y0m.txt b/compiled_corpus/corpus_phono_L_D_E_0y0m_2y0m.txt