forked from IntelligenceX/fileconversion
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ODT 2 Text.go
130 lines (110 loc) · 2.77 KB
/
ODT 2 Text.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
File Name: ODT 2 Text.go
Copyright: 2019 Kleissner Investments s.r.o.
Author: Peter Kleissner
Fork from https://github.com/lu4p/cat/blob/master/odtxt/odtreader.go.
The extraction discards all formatting. Currently the output is a single large string without newlines.
*/
package fileconversion
import (
"archive/zip"
"errors"
"io"
"io/ioutil"
"github.com/IntelligenceX/fileconversion/html2text"
)
// ODT2Text extracts the text of an OpenDocument Text file and hands it to writer.
//
// file and size describe the input; size is the full size of the input file.
// The archive is opened with odtNewReader, the text is obtained via GetTxt and
// then passed to writeOutput together with pointers to written and limit.
// NOTE(review): limit is presumably the maximum number of bytes to emit; its
// exact semantics are defined by writeOutput - confirm there.
//
// If opening the archive or extracting the text fails, 0 and the error are returned.
func ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) {
	doc, openErr := odtNewReader(file, size)
	if openErr != nil {
		return 0, openErr
	}

	plain, extractErr := doc.GetTxt()
	if extractErr != nil {
		return 0, extractErr
	}

	// writeOutput updates written (and possibly limit) through the pointers.
	err = writeOutput(writer, []byte(plain), &written, &limit)
	return written, err
}
// odt holds an opened OpenDocument Text file, which is a ZIP archive.
// It is populated by odtNewReader.
type odt struct {
// zipFileReader is the ZIP reader created for the input.
zipFileReader *zip.Reader
// Files is the list of entries of the archive (taken from zipFileReader.File).
Files []*zip.File
// FilesContent maps an entry name to the bytes read from that entry.
FilesContent map[string][]byte
// Content is not set by any code visible in this file.
// NOTE(review): possibly a leftover from the forked reader - confirm before removing.
Content string
}
// odtNewReader opens the ZIP container of an ODT file and reads the contents
// of every entry into memory (FilesContent, keyed by entry name).
//
// file and size are passed to zip.NewReader; an error from it is returned as is.
//
// Entries other than "content.xml" are read on a best-effort basis: a read
// failure is ignored and whatever was read is stored. A failure to read
// "content.xml" is returned, because that entry is the one GetTxt extracts the
// text from and silently dropping it would produce empty output without any
// error.
func odtNewReader(file io.ReaderAt, size int64) (*odt, error) {
	reader, err := zip.NewReader(file, size)
	if err != nil {
		return nil, err
	}

	odtDoc := odt{
		zipFileReader: reader,
		Files:         reader.File,
		FilesContent:  make(map[string][]byte, len(reader.File)),
	}

	for _, f := range odtDoc.Files {
		contents, err := odtDoc.retrieveFileContents(f.Name)
		if err != nil && f.Name == "content.xml" {
			// The document body is unreadable; there is nothing useful to extract.
			return nil, err
		}
		// Ancillary entries (styles, thumbnails, ...) are not needed for text
		// extraction, so their read errors are deliberately not fatal.
		odtDoc.FilesContent[f.Name] = contents
	}

	return &odtDoc, nil
}
// retrieveFileContents returns the contents of the first archive entry in
// d.Files whose name equals filename.
//
// It returns an error if no such entry exists, if the entry cannot be opened,
// or if reading it fails. The entry's reader is closed before returning.
func (d *odt) retrieveFileContents(filename string) ([]byte, error) {
	var file *zip.File
	for _, f := range d.Files {
		if f.Name == filename {
			file = f
			break
		}
	}

	if file == nil {
		return nil, errors.New(filename + " file not found")
	}

	reader, err := file.Open()
	if err != nil {
		return nil, err
	}
	// Release the entry's reader once the data has been read; previously it
	// was never closed.
	defer reader.Close()

	return ioutil.ReadAll(reader)
}
// GetTxt returns the text of the document by passing the cached bytes of the
// "content.xml" entry to xml2Text. If that entry is not present in
// FilesContent, the map lookup yields nil and xml2Text receives nil data.
//
// The struct-based listP extraction is disabled and no longer used here.
func (d *odt) GetTxt() (content string, err error) {
	content, err = xml2Text(d.FilesContent["content.xml"])
	return content, err
}
/*
// listP for w:p tag value
func (d *odt) listP(data []byte) (string, error) {
v := new(odtQuery)
err := xml.Unmarshal(data, &v)
if err != nil {
return "", err
}
var result string
for _, text := range v.Body.Text {
for _, line := range text.P {
if line == "" {
continue
}
result += line + "\n"
}
}
return result, nil
}
type odtQuery struct {
XMLName xml.Name `xml:"document-content"`
Body odtBody `xml:"body"`
}
type odtBody struct {
Text []odtText `xml:"text"`
}
type odtText struct {
P []string `xml:"p"`
}
*/
// xml2Text extracts the text from XML data by delegating to html2text.FromString.
// Formatting is not preserved; per the original author's note, the output is
// one large string without new-lines.
func xml2Text(data []byte) (string, error) {
	markup := string(data)
	text, err := html2text.FromString(markup)
	return text, err
}