-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrome_genderization.py
330 lines (262 loc) · 13.3 KB
/
rome_genderization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# encoding: utf-8
"""Module to genderize job titles in ROME.
This module allows one to normalize French job titles across genders in
ROME. In the standard ROME file provided by Pole Emploi job titles are often
inconsistently specified, which leads to confusion and poorer user experience.
In addition it makes the job title longer than it should be, which makes them
harder to identify at a glance.
For readability English-language examples are provided in the comments in this
module.
Example:
A job such as "Senior Fireman" might sometimes be genderized as
"Senior Fireman / Senior Firewoman" and sometimes as "Senior Fire(wo)man".
We want to turn any variant of these possible genderizations into a
normalized pair of two strings: "Senior Fireman" and "Senior Firewoman".
If a job title is not gendered, for example "Artist", we want to simply return
"Artist" and "Artist".
Nomenclature in this module:
- bracket notation: when the job title is gendered using the form
"Senior Fire(wo)man"
- slash notation: when the job title is gendered using the form
"Senior Fireman / Firewoman"
"""
import re
# Feminine postfix -> masculine word endings it can replace, i.e. a
# {postfix: [word_endings]} dictionary of lists.
#
# When one of the listed endings is found at the end of the word being
# transformed, that ending is swapped for the postfix. The empty string
# means the postfix is simply appended to the word.
#
# Order inside each list is significant: only the first ending that matches
# the end of the word is applied, so endings must go from most specific to
# most general (ie. in decreasing order of length).
#
# Example: {'euse': ['eur', 'er']} turns 'Vendeur' into 'Vendeuse' and
# 'Manager' into 'Manageuse'.
#
# Status: comprehensive with regards to the postfixes found in ROME v238,
# except for the 's' postfix, which denotes pluralization rather than gender.
_POSTFIX_MAP = {
    'e': [''],
    'ère': ['er'],
    'se': ['r'],
    'sse': [''],
    'euse': ['eur', 'er'],
    've': ['f'],
    're': ['r'],
    'rice': ['eur'],
    'ne': [''],
    'trice': ['teur'],
    'le': [''],
    'ive': ['if'],
    'ière': ['ier'],
}

# Lower-case (masculine, feminine) word pairs that are genderizations of each
# other even though they do not share a common root.
_KNOWN_GENDERIZATION = frozenset((
    ('homme', 'femme'),
    ('steward', 'hôtesse'),
))

# Placeholder standing in for slashes that are not genderization slashes, so
# that they survive parsing untouched. Must never occur in the input text.
_SLASH_PLACEHOLDER_CHARS = '##'
def _substitute_postfix(word, postfix):
    """Build the feminine form of a masculine word from its postfix.

    Some postfixes are appended to the word as-is, others replace the end
    of the word. The endings a given postfix may replace are looked up in
    _POSTFIX_MAP and tried in their listed order; the first one found at
    the end of the word is the one that gets replaced. Both nouns and
    adjectives may be genderized this way.

    Examples:
        - Abbateur(se) -> Abbateur, Abbateuse
        - Accompagnateur(trice) -> Accompagnateur, Accompagnatrice
        - social(e) -> social, sociale
        - Technicien(ne) -> Technicien, Technicienne
        - administratif(ive) -> administratif, administrative

    Args:
        word: A string representing the word the postfix pertains to.
        postfix: A string representing the postfix.

    Return:
        A string containing the word once the postfix has been applied.

    Raises:
        ValueError: if the postfix is not a key of _POSTFIX_MAP, or if the
            word ends with none of the endings listed for that postfix.
    """
    candidate_endings = _POSTFIX_MAP.get(postfix)
    if candidate_endings is None:
        raise ValueError('Unknown postfix: ' + postfix)

    for ending in candidate_endings:
        if not word.endswith(ending):
            continue
        # An empty ending means "append": there is nothing to strip.
        stem = word[:-len(ending)] if ending else word
        return stem + postfix

    raise ValueError(
        '%s: Unmapped ending for postfix "%s"' % (word, postfix))
def _extract_bracket_notation(raw_job_name):
    """Parse a genderized string in the bracket notation.

    Every occurrence of word(postfix) in the job title is rewritten in
    place: the masculine version keeps the bare word, the feminine version
    gets the word transformed by _substitute_postfix(). Only the matched
    occurrence is rewritten, so an identical word appearing elsewhere in
    the title without brackets is left alone.

    The plural marker "(s)" is not a gender postfix and is kept verbatim in
    both versions.

    For example, turn:
    "The Fire(wo)man puts his(her) hat on" into "The Fireman puts his hat on"
    and "The Firewoman puts her hat on".

    Args:
        raw_job_name: A string representing the raw job name, possibly but
            not necessarily in bracket notation.

    Return:
        If treating the string as one in bracket notation worked: a pair
        of strings representing the masculine and the feminine version,
        respectively.
        Return None if the item doesn't appear to be in bracket notation.

    Raises:
        ValueError: propagated from _substitute_postfix() when a postfix
            or a word ending is not supported.
    """
    # We only want brackets directly following a character:
    bracket_regex = re.compile(r'(\S+?)\((\S+?)\)')
    if not bracket_regex.search(raw_job_name):
        return None  # we don't seem to be in the bracket notation

    def _to_masculine(match):
        # Masculine form is the word without its bracketed postfix.
        if match.group(2) == 's':  # ignore the plural postfix case
            return match.group(0)
        return match.group(1)

    def _to_feminine(match):
        word, postfix = match.groups()
        if postfix == 's':  # ignore the plural postfix case
            return match.group(0)
        return _substitute_postfix(word, postfix)

    # Substituting per match (rather than with str.replace on the whole
    # title) guarantees that only the bracketed occurrence is transformed.
    masculine_name = bracket_regex.sub(_to_masculine, raw_job_name)
    feminine_name = bracket_regex.sub(_to_feminine, raw_job_name)
    return masculine_name, feminine_name
def _expand_qualifiers(left_string, right_string, n_comparison_chars=3):
    """Distribute qualifiers across both sides of a genderization slash.

    When a proposition is gendered in the slash format (e.g. "Senior
    Fireman / Firewoman"), the qualifiers on either side of the slash
    may or may not be repeated on the other side. Given both sides of such
    a proposition, return both sides with all qualifiers present.

    For example, the masculine name "Junior Fireman in training" could be
    genderized in the slash notation as:
    - "Junior Fireman in training / Junior Firewoman in training"
        (all qualifiers repeated, no distribution needed)
    - "Junior Fireman / Firewoman in training"
        (no qualifiers repeated, qualifiers on both sides must be distributed)
    - "Junior Fireman / Junior Firewoman in training"
        (only qualifiers on the left side are repeated)
    - "Junior Fireman in training / Firewoman in training"
        (only qualifiers on the right side are repeated)
    We want to turn any of these into "Junior Fireman in training" and
    "Junior Firewoman in training".

    Args:
        left_string: A string representing the left side of the proposition
            (ie. the part before the slash) for a phrase in slash notation.
        right_string: A string representing the right side of the
            proposition (ie. the part after the slash) for a phrase in
            slash notation.
        n_comparison_chars: An int indicating how many first characters must
            be shared between a word on the right and on the left before we
            consider it is part of the same root.

    Return:
        A pair of strings, representing the left_string and right_string
        passed as input but with all qualifiers on either side expanded.
        If no word on the left side pairs up with the first word of the
        right side, the slash is deemed not to be a genderization slash:
        both returned strings are then equal to the two sides joined with
        _SLASH_PLACEHOLDER_CHARS in place of the slash, to prevent further
        parsing.
    """
    left_tokens = left_string.split(' ')
    right_tokens = right_string.split(' ')

    # The first word of the right side is the gendered word; look for its
    # counterpart on the left. Genders differ, so this is a fuzzy match on
    # the first few characters only: in French the genderization usually
    # only changes the word ending.
    anchor = right_tokens[0]
    anchor_prefix = anchor[:n_comparison_chars]
    anchor_lower = anchor.lower()

    # Count the left-side words that come before the counterpart: those are
    # the qualifiers missing from the right side.
    n_leading = 0
    for token in left_tokens:
        same_root = token[:n_comparison_chars] == anchor_prefix
        if same_root or (token.lower(), anchor_lower) in _KNOWN_GENDERIZATION:
            break
        n_leading += 1

    if n_leading == len(left_tokens):
        # No counterpart at all: looks like a non-genderization slash.
        unsplit = left_string + _SLASH_PLACEHOLDER_CHARS + right_string
        return unsplit, unsplit

    right_tokens = left_tokens[:n_leading] + right_tokens
    # With the right side now aligned on the left one, whatever extends
    # past the end of the left side is a trailing qualifier it is missing.
    left_tokens = left_tokens + right_tokens[len(left_tokens):]
    return ' '.join(left_tokens), ' '.join(right_tokens)
def _extract_slash_notation(raw_job_name):
    """Parse a genderized string in the slash notation.

    Extract the genderized strings from a job title with the slash notation,
    such as "Senior Fireman / Firewoman". Note that a job title may feature
    multiple slashes, as in: "The Fireman / Firewoman puts his / her hat on".

    Return two strings respectively containing the masculine version and the
    feminine version, while making sure all qualifiers on either side are
    properly distributed, as qualifiers (such as "Senior" in this example)
    on a given side of the slash are sometimes repeated on both sides
    in the original name and sometimes not. See _expand_qualifiers() for
    more details.

    For example, turn:
    - "Senior Fireman / Firewoman" (single slash case)
        into "Senior Fireman" and "Senior Firewoman"
    - "The Fireman / Firewoman puts his / her hat on" (multiple slashes case)
        into "The Fireman puts his hat on" and "The Firewoman puts her hat on".

    Args:
        raw_job_name: A string representing the raw job name, possibly but
            not necessarily in slash notation.

    Return:
        If treating the string as one in slash notation worked: a pair
        of strings representing the masculine and the feminine version,
        respectively.
        Return None if the item doesn't appear to be in slash notation,
        i.e. it has no slash or both versions come out identical.

    Raises:
        ValueError: if the job name contains _SLASH_PLACEHOLDER_CHARS.
    """
    if '/' not in raw_job_name:
        return None
    if _SLASH_PLACEHOLDER_CHARS in raw_job_name:
        error_string = 'Reserved substring "%s" in job name: %s' % (
            _SLASH_PLACEHOLDER_CHARS, raw_job_name)
        raise ValueError(error_string)

    # Normalize spacing first. Whitespace is dropped on both sides of every
    # slash, however unevenly it was typed ("a / b", "a /b", "a/ b"):
    # a leftover space would otherwise become an empty word next to the
    # slash and defeat the word matching done in _expand_qualifiers().
    raw_job_name = re.sub(r'\s*/\s*', '/', raw_job_name)

    substrings = raw_job_name.split('/', maxsplit=1)
    masculine_name, feminine_name = _expand_qualifiers(*substrings)

    # When multiple slashes are present (ie. the job name contains multiple
    # subsequences to be genderized), expanding each slash one at a time then
    # repeatedly expanding each such subsequence will yield the correct
    # output: the masculine name keeps the left side of every slash and the
    # feminine name keeps the right side.
    while '/' in masculine_name:
        masculine_substrings = masculine_name.split('/', maxsplit=1)
        masculine_name = _expand_qualifiers(*masculine_substrings)[0]
        feminine_substrings = feminine_name.split('/', maxsplit=1)
        feminine_name = _expand_qualifiers(*feminine_substrings)[1]

    # Restore any non-gender slash, if any:
    masculine_name = masculine_name.replace(_SLASH_PLACEHOLDER_CHARS, '/')
    feminine_name = feminine_name.replace(_SLASH_PLACEHOLDER_CHARS, '/')

    if masculine_name == feminine_name:  # catch cases with a non-gender slash
        return None
    return masculine_name, feminine_name
def genderize(raw_job_names):
    """Normalize the genderization of a Series of ROME job titles.

    Each raw job title is run through both the bracket-notation and the
    slash-notation parsers. A title that neither parser recognizes is
    returned unchanged as both its masculine and its feminine version.
    When both parsers recognize a title, the slash-notation result is the
    one that is kept.

    Args:
        raw_job_names: A pandas Series of strings representing job names
            with ambiguous genderization. It is not modified.

    Return:
        A pair of pandas Series, respectively containing the masculine
        and the feminine version of the job names passed as input.
    """
    # Run both parsers up front; each yields None for titles it does not
    # recognize and a (masculine, feminine) pair otherwise.
    parsed_by_notation = (
        raw_job_names.apply(_extract_bracket_notation),
        raw_job_names.apply(_extract_slash_notation),
    )

    # Start from masculine = feminine = raw name, then overwrite the rows a
    # parser succeeded on. Slash results are applied last, so they win.
    masculine_names = raw_job_names.copy()
    feminine_names = raw_job_names.copy()
    for parsed in parsed_by_notation:
        recognized = parsed.notnull()
        pairs = parsed[recognized]
        masculine_names[recognized] = pairs.apply(lambda pair: pair[0])
        feminine_names[recognized] = pairs.apply(lambda pair: pair[1])

    return masculine_names, feminine_names