-
Notifications
You must be signed in to change notification settings - Fork 0
/
dta-txt.sh
33 lines (33 loc) · 1.01 KB
/
dta-txt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/bin/bash
#
# dta-txt.sh - extract plain text from the TCF files of a DTA corpus snapshot.
#
# Usage: ./dta-txt.sh
#
# Works relative to the current directory: enters an already unpacked
# dta_komplett_*/ directory, or downloads and unpacks the pinned snapshot
# first. Requires xsltproc, plus wget and unzip when a download is needed.
set -euo pipefail

readonly DTA_SNAPSHOT='dta_komplett_2017-09-01'
readonly DTA_URL="http://media.dwds.de/dta/download/${DTA_SNAPSHOT}.zip"

# Fail early, with a readable message, if the XSLT processor is missing.
if ! command -v xsltproc >/dev/null; then
  printf '%s: xsltproc is required but was not found in PATH\n' "${0##*/}" >&2
  exit 1
fi

# Reuse an already unpacked snapshot if there is one. The glob may match
# several directories (or none, in which case it stays literal), so every
# candidate is tested on its own instead of handing the whole expansion to a
# single `test -d`, which errors out on more than one argument.
corpus_dir=
for candidate in dta_komplett_*/; do
  if [[ -d "$candidate" ]]; then
    corpus_dir=$candidate
    break
  fi
done

if [[ -z "$corpus_dir" ]]; then
  # Nothing unpacked yet: fetch and unpack the pinned snapshot. These run as
  # plain statements (not inside an && / || list) so errexit stops the script
  # at the step that actually failed.
  wget "$DTA_URL"
  unzip "${DTA_SNAPSHOT}.zip"
  corpus_dir=$DTA_SNAPSHOT
fi

# Everything after this point uses paths relative to the corpus directory.
cd "$corpus_dir"
# Directory that receives one .txt file per converted input document.
mkdir -p txt

# Write the XSLT stylesheet used for the extraction, unless one is already
# present (an existing file is kept untouched). The stylesheet outputs the
# tcf:text child of each tcf:TextCorpus element as plain text and suppresses
# every other text node.
# The here-document delimiter is quoted so the shell copies the stylesheet
# literally: XSLT uses `$name` for variable references, which an unquoted
# here-document would silently expand.
if [[ ! -f tcf-extract-txt.xsl ]]; then
  cat > tcf-extract-txt.xsl <<'EOF'
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:tcf="http://www.dspin.de/data/textcorpus">
<!-- rid of xml syntax: -->
<xsl:output
method="text"
standalone="yes"
omit-xml-declaration="yes"/>
<!-- copy text element verbatim: -->
<xsl:template match="tcf:TextCorpus">
<xsl:value-of select="tcf:text" disable-output-escaping="yes"/>
<xsl:apply-templates/>
</xsl:template>
<!-- override implicit rules copying elements and attributes: -->
<xsl:template match="text()"/>
</xsl:stylesheet>
EOF
fi
#######################################
# Convert every simple/*.xml file into txt/<same basename>.txt by running it
# through xsltproc with the tcf-extract-txt.xsl stylesheet.
# Arguments: none (paths are relative to the current directory)
# Outputs:   one .txt file per input under txt/; diagnostics on stderr
# Returns:   0 if at least one file was converted, 1 if there was no input;
#            under errexit a failing xsltproc run aborts the script
#######################################
extract_all_txt() {
  local file outfile converted=0
  for file in simple/*.xml; do
    # An unmatched glob stays literal; skip it rather than passing the
    # pattern to xsltproc and creating a stray 'txt/*.txt' output file.
    [[ -f "$file" ]] || continue
    outfile="txt/${file#simple/}"
    # Quoted on both sides so names containing whitespace or glob characters
    # are treated as a single input and a single redirect target.
    xsltproc tcf-extract-txt.xsl "$file" > "${outfile%.xml}.txt"
    converted=1
  done
  if (( ! converted )); then
    printf 'no input files matching simple/*.xml\n' >&2
    return 1
  fi
}
extract_all_txt