-
Notifications
You must be signed in to change notification settings - Fork 0
/
docx_tei.php
72 lines (67 loc) · 2.76 KB
/
docx_tei.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
<?php declare(strict_types=1);
include_once(__DIR__ . '/vendor/autoload.php');
use Psr\Log\LogLevel;
use Oeuvres\Kit\{Filesys, Log, Xt};
use Oeuvres\Kit\Logger\{LoggerCli};
use Oeuvres\Teinte\Format\{Docx};
use Oeuvres\Xsl\{Xpack};
// Command-line driver: converts each docx file matched by the glob patterns
// given as arguments into a TEI XML file under ../galenus_cts/data/, running
// the Docx pipeline (pkg > teilike > pcre > tmpl) and then a final XSLT that
// receives the path of a matching Greek file as a parameter.
// Intermediate states are dumped to ./out/ for debugging.
Log::setLogger(new LoggerCli(LogLevel::DEBUG));
if (!isset($argv[1])) {
    // die() with a string prints to stdout and exits with status 0; report
    // the usage error on STDERR with a non-zero status so callers can see it.
    fwrite(STDERR, "usage: php docx_tei.php examples/*.docx" . PHP_EOL);
    exit(1);
}
// drop $argv[0], $argv[1…] should be file
array_shift($argv);
// directory receiving the intermediate debug dumps (*_teilike.xml, *_pcre.xml)
$tmp_dir = __DIR__ . '/out/';
Filesys::mkdir($tmp_dir);
$source = new Docx();
// local xml template
$source->user_template(__DIR__ . '/galenus_tmpl_lat.xml');
// regex program to insert
$source->user_pcre(__DIR__ . '/galenus_pcre.tsv');
// when true, regenerate a destination even if it is newer than its docx source
$force = true;
// root of the destination tree, in a directory sibling to this one
$cts_data_dir = dirname(__DIR__) . '/galenus_cts/data/';
// [search, replace] pairs applied to the source name to locate the Greek
// file, tried in this order; the first existing file wins
$grc_renames = [
    ['verbatim-lat', '1st1K-grc'],
    ['verbatim-lat', 'verbatim-grc'],
    ['verbatim-lat1', '1st1K-grc2'],
];
// loop on arguments to get files of globs
foreach ($argv as $glob) {
    Log::info($glob);
    $docx_files = glob($glob);
    if ($docx_files === false) {
        // glob() returns false on error; nothing to iterate for this pattern
        continue;
    }
    foreach ($docx_files as $docx_file) {
        $src_name = pathinfo($docx_file, PATHINFO_FILENAME);
        // the first two dot-separated parts of the name give the two
        // directory levels of the destination
        $split = explode('.', $src_name);
        if (count($split) < 2) {
            echo "[skip] $docx_file: file name needs at least two dot-separated parts\n";
            continue;
        }
        $dst_dir = $cts_data_dir . $split[0] . '/' . $split[1] . '/';
        $dst_file = $dst_dir . $src_name . '.xml';
        if (!$force && file_exists($dst_file) && filemtime($docx_file) < filemtime($dst_file)) {
            continue;
        }
        Filesys::mkdir(dirname($dst_file));
        Log::info($docx_file . " > " . $dst_file);
        $source->open($docx_file);
        // for debug
        $source->pkg(); // open the docx
        $source->teilike(); // apply a first tei layer
        file_put_contents($tmp_dir . $src_name . '_teilike.xml', $source->teiXML());
        $source->pcre(); // apply regex, custom re may break XML
        // for debug write this step
        file_put_contents($tmp_dir . $src_name . '_pcre.xml', $source->teiXML());
        $source->tmpl();
        // look for the Greek file next to the destination; after the loop
        // $grc_file holds either the first hit or the last candidate tried
        $grc_file = '';
        $grc_found = false;
        foreach ($grc_renames as [$search, $replace]) {
            $grc_file = $dst_dir . str_replace($search, $replace, $src_name) . '.xml';
            if (file_exists($grc_file)) {
                $grc_found = true;
                break;
            }
        }
        if (!$grc_found) {
            echo "[404] $grc_file\n";
            // remove any output left by an earlier run (presumably so no TEI
            // file remains without its Greek file); guarded because
            // unlink() warns when the file does not exist
            if (file_exists($dst_file)) {
                unlink($dst_file);
            }
            continue;
        }
        // finalize with personal xslt
        $xml = Xt::transformToXml(
            __DIR__ . '/galenusgrc.xsl',
            $source->teiDOM(),
            [
                'filename' => $src_name,
                // file URI with forward slashes, also for Windows paths
                'grc_file' => 'file:///' . str_replace('\\', '/', $grc_file),
            ]
        );
        if (file_put_contents($dst_file, $xml) === false) {
            echo "[error] write failed: $dst_file\n";
        }
    }
}