Skip to content

Commit

Permalink
Merge pull request #25 from psibre/add-abbreviations
Browse files Browse the repository at this point in the history
Add abbreviation support
  • Loading branch information
psibre authored Aug 16, 2024
2 parents 5c887e4 + 63b655b commit ba27de3
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ Upper Sorbian language component for MaryTTS
[Unreleased]
------------

### Added

- Abbreviation expansion in preprocessing

[v0.2.0] - 2024-05-12
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,18 @@ class PreprocessIT {
}
assert actual == expected
}

@Test
void 'Given input with abbreviations, When text is converted to words, Then abbreviations are expanded correctly'() {
    // Input made of abbreviations, with a trailing full stop.
    def text = 'GHz l/km mpg cm³.'
    def expected = 'gigahertzow litrow na kilometer milow na galonu kubiknych centimetrow.'

    // Run the text through `mary` (fixture declared outside this method) and serialize the resulting XML.
    def serialized = mary.generateXML(text).documentElement.serialize()

    // Collect every <t> element of the serialized document.
    def tokenNodes = new XmlSlurper(false, false)
            .parseText(serialized)
            .depthFirst()
            .findAll { node -> node.name() == 't' }

    // Join the token texts with single spaces, except that nothing is inserted
    // before the first token or before a token that is a single punctuation character.
    def actual = tokenNodes.inject('') { joined, node ->
        def needsSeparator = !(joined.isEmpty() || node ==~ /\p{Punct}/)
        needsSeparator ? joined + ' ' + node : joined + node
    }

    assert actual == expected
}
}
42 changes: 42 additions & 0 deletions src/main/java/marytts/language/hsb/Preprocess.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
public class Preprocess extends InternalModule {

static final ULocale locale = new ULocale.Builder().setLanguage("hsb").build();
private Map<String, String> abbreviations;
private Map<String, String> symbols;
private RuleBasedNumberFormat ruleBasedNumberFormat;
private NumberFormat numberFormat;
Expand All @@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale());
initNumberExpansion("formatRules.txt");
initSymbolExpansion("symbols.csv");
initAbbreviationExpansion("abbreviations.csv");
}

/**
 * Loads the abbreviation table from a CSV classpath resource into {@link #abbreviations}.
 * <p>
 * The resource is resolved relative to this object's runtime class and read as UTF-8. Each
 * record is interpreted as two columns: the abbreviation (key) and its expansion (value).
 * If the same abbreviation occurs more than once, the last record wins.
 *
 * @param resourceName name of the CSV resource, as passed to {@code Class.getResourceAsStream}
 * @throws MaryConfigurationException if the resource is missing, cannot be read, or cannot be parsed
 */
private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException {
    Map<String, String> loadedAbbreviations = new HashMap<>();
    // try-with-resources guarantees the stream and parser are closed even when parsing fails.
    try (InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName)) {
        if (abbreviationsStream == null) {
            // getResourceAsStream signals a missing resource with null; report it explicitly
            // instead of letting the reader constructor fail with a bare NullPointerException.
            throw new java.io.FileNotFoundException(resourceName);
        }
        try (CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT)
                .setHeader("abbreviation", "expansion")
                .build()
                .parse(new InputStreamReader(abbreviationsStream, Charsets.UTF_8))) {
            for (CSVRecord record : csv) {
                loadedAbbreviations.put(record.get("abbreviation"), record.get("expansion"));
            }
        }
    } catch (Exception exception) {
        throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception);
    }
    // Publish the table only after it has been loaded completely.
    abbreviations = loadedAbbreviations;
}

private void initSymbolExpansion(String resourceName) throws MaryConfigurationException {
Expand Down Expand Up @@ -73,13 +94,34 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx

/**
 * Expands abbreviations, then symbols, then numbers in the document carried by the input
 * and wraps that same document in a new {@code MaryData} of this module's output type.
 * <p>
 * The document is modified in place; the returned data refers to the same {@code Document}
 * instance as the input.
 *
 * @param d input data whose document is to be preprocessed
 * @return new data with this module's output type, the input's locale and the processed document
 */
public MaryData process(MaryData d) {
    final Document document = d.getDocument();

    // Expansion passes run in this fixed order.
    expandAllAbbreviations(document);
    expandAllSymbols(document);
    expandAllNumbers(document);

    final MaryData processed = new MaryData(getOutputType(), d.getLocale());
    processed.setDocument(document);
    return processed;
}

/**
 * Replaces the text of every token element that is a known abbreviation with its expansion.
 * <p>
 * Walks the elements accepted by {@code NameNodeFilter(MaryXML.TOKEN)}, looks each token's
 * text up via {@link #expandAbbreviation(String)} and rewrites the token only when the
 * expansion differs from the original text.
 *
 * @param document document whose tokens are expanded in place
 */
private void expandAllAbbreviations(Document document) {
    TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    Element token;
    while ((token = (Element) treeWalker.nextNode()) != null) {
        String tokenText = MaryDomUtils.tokenText(token);
        String expandedAbbreviation = expandAbbreviation(tokenText);
        // Compare by content, not by reference: the previous identity check only worked
        // because expandAbbreviation happens to return the very same instance for unknown
        // tokens, which an overriding subclass is not obliged to do.
        if (expandedAbbreviation != null && !expandedAbbreviation.equals(tokenText)) {
            MaryDomUtils.setTokenText(token, expandedAbbreviation);
        }
    }
}

/**
 * Looks up a single abbreviation in the loaded abbreviation table.
 * The lookup is an exact, case-sensitive match on the whole string.
 *
 * @param abbreviation text to look up
 * @return the mapped expansion if the table contains the text as a key, otherwise the
 *         argument itself, unchanged
 */
protected String expandAbbreviation(String abbreviation) {
    return abbreviations.getOrDefault(abbreviation, abbreviation);
}

private void expandAllSymbols(Document document) {
TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Expand Down
131 changes: 131 additions & 0 deletions src/main/resources/marytts/language/hsb/abbreviations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"kwart.","kwartal"
"kw.","kwartal"
"měs.","měsac"
"tydź.","tydźeń"
"hodź.","hodźina"
"min.","mjeńšin"
"sek.","sekundow"
"př.Chr.n.","před Chrystowym narodźenjom"
"př.n.l.č.","před našim ličenjom časa"
"po Chr.n.","po Chrystowym narodźenju"
"n.l.č.","našeho ličenja časa"
"jan.","januar"
"feb.","februar"
"měr.","měrc"
"apr.","apryl"
"mej.","meja"
"jun.","junij"
"jul.","julij"
"awg.","awgust"
"sep.","september"
"okt.","oktober"
"now.","nowember"
"dec.","december"
"dop.","dopołdnja"
"přip.","připołdnju"
"pop.","popołdnju"
"nje.","njedźela"
"pón.","póndźela"
"wut.","wutora"
"srj.","srjeda"
"štw.","štwórtk"
"pja.","pjatk"
"sob.","sobota"
"nj.","njedźela"
"pó.","póndźela"
"wu.","wutora"
"sr.","srjeda"
"št.","štwórtk"
"pj.","pjatk"
"tys.","tysac"
"mil.","milionow"
"mrd.","miliardow"
"bil.","bilionow"
"mio.","milionow"
"PLN","pólskich złotych"
"zł","pólskich złotych"
"Kč","čěskich krónow"
"EUR","eurow"
"DKK","danskich krónow"
"DM","hriwnow"
"CHF","šwicarskich frankow"
"CZK","čěskich krónow"
"HUF","madźarskich forintow"
"PLZ","pej el cet"
"RUB","ruskich rublow"
"CNY","chinskich yuanow"
"CN¥","chinskich yuanow"
"JPY","japanskich yenow"
"AUD","awstralskich dolarow"
"NZ$","nowoseelandskich dolarow"
"dn.","dnjow"
"dn.","dnjow"
"cm","centimetrow"
"dm","decimetrow"
"ft","stopow"
"km","kilometrow"
"m","metrow"
"μm","mikrometrow"
"mm","milimetrow"
"nm","nanometrow"
"nmi","nawtiskich milow"
"cm²","kwadratnych centimetrow"
"ft²","kwadratnych stopow"
"in²","kwadratnych cólow"
"km²","kwadratnych kilometrow"
"m²","kwadratnych metrow"
"mi²","kwadratnych milow"
"yd²","kwadratnych yardow"
"cm³","kubiknych centimetrow"
"ft³","kubiknych stopow"
"in³","kubiknych cólow"
"km³","kubiknych kilometrow"
"m³","kubiknych metrow"
"mi³","kubiknych milow"
"yd³","kubiknych yardow"
"fl. oz.","běžitych uncow"
"łž.","łžicow"
"łžk.","łžičkow"
"m/s²","metrow na kwadratnu sekundu"
"km/h","kilometrow na hodźinu"
"m/s","metrow na sekundu"
"mph","milow na hodźinu"
"kg","kilogramow"
"µg","mikrogramow"
"mg","miligramow"
"oz","uncow"
"oz. tr.","trojskich uncow"
"lb","puntow"
"cal","kalorijow"
"kcal","kilokalorijow"
"kJ","kilodźulow"
"kWh","kilowattowych hodźin"
"GW","gigawattow"
"PS","konjacych mocow"
"kW","kilowattow"
"MW","megawattow"
"mW","miliwattow"
"mA","miliamperow"
"Ω","ohmow"
"GHz","gigahertzow"
"Hz","hertzow"
"kHz","kilohertzow"
"MHz","megahertzow"
"hPa","hektopascalow"
"inHg","cólow žiwoslěbroweho stołpika"
"mbar","milibarow"
"mm Hg","milimetrow žiwoslěbroweho stołpika"
"°C","stopnjow Celsiusa"
"°F","stopnjow Fahrenheita"
"bit","bitow"
"byte","byteow"
"Gb","gigabitow"
"GB","gigabyteow"
"kb","kilobitow"
"kB","kilobyteow"
"Mb","megabitow"
"MB","megabyteow"
"Tb","terabitow"
"TB","terabyteow"
"l/km","litrow na kilometer"
"mpg","milow na galonu"
5 changes: 0 additions & 5 deletions src/main/resources/marytts/language/hsb/symbols.csv
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,8 @@ $,dolarow
=,runa so
>,wjetše hač
@,et
[,wotewrjena róžkata spinka
\,beksleš
],zawrjena róžkata spinka
^,wyše
{,wotewrjena wuzhibowana spinka
|,padoruna smuha
},zawrjena wuzhibowana spinka
~,tilda
°,stopnjow
"°C",stopnjow Celsius
Expand Down

0 comments on commit ba27de3

Please sign in to comment.