// buildindex.scala
import scala.io.Source
import scala.math.Ordered
import java.io._
import julien.access._
import julien.galago.core.util.ExtentArray
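// Overview: this script sketches a small positional index build over
// TREC-style <DOC> files as a chain of collection transformations:
// count documents per file -> assign id offsets -> parse/tokenize/normalize
// -> write lengths, names, and postings.

// Wraps an IndexFileWriter so it can be used as a plain T => Unit function;
// each item is appended under its own key.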
class IndexWriterFunction[T <: KeyedData](
  val dst: String,
  val suffix: String,
  val codec: Codec[T]
) extends Function1[T, Unit] {
  @transient lazy val writer = new IndexFileWriter(dst + suffix, codec)
  def close = writer.close
  def apply(data: T) = writer.append(data.key, data)
}
// Base class for the plain-text writers below: lazily opens a FileWriter on
// dst + suffix and leaves the per-record formatting to apply.
abstract class Writer[T1, R](
  val dst: String,
  val suffix: String
) extends Function1[T1, R] with Serializable {
  @transient lazy val dstFile = new FileWriter(dst + suffix)
  def close = dstFile.close
  def apply(obj: T1): R
}
case class CountedSplit(val filename: String, val count: Int)
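// Count the <DOC> headers in one file so that each file's documents can be
// assigned a contiguous range of ids.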
val count = (fn: String) => {
  val headers =
    Source.fromFile(fn).getLines.filter { l =>
      l == "<DOC>"
    }
  CountedSplit(fn, headers.size)
}
// We start with a list of files
// DocumentSource
val prefix = "/usr/ayr/tmp1/irmarc/projects/thesis/code/julien/data/"
val files =
  Seq(prefix + "test1", prefix + "test2")
// Fan them out for counting
// ParserCounter
case class OffsetSplit(val file: String, val count: Int, val start: Int)
val counted = files.par.map(count)
// Offsetting function: turns per-file counts into starting document ids by
// keeping a running total. It is stateful, so it must be applied sequentially
// (hence the .seq below).
val offset = new Function1[CountedSplit, OffsetSplit] {
  var total = 0
  def apply(cs: CountedSplit): OffsetSplit = {
    val toReturn = OffsetSplit(cs.filename, cs.count, total)
    total += cs.count
    toReturn
  }
}
// OffsetSplitter
val shifted = counted.seq.map(offset)
// Three transformations in a row!
// ParserSelector/UniversalParser & TagTokenizer & A lemmatizer
case class Doc(
  val id: Int,
  val name: String,
  val content: String,
  val text: Seq[String]
)
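// Split a file on </DOC> boundaries, pull the DOCNO out as the document name,
// and assign globally unique ids using the precomputed offsets.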
val intoDocuments = (of: OffsetSplit) => {
  val rawContent =
    Source.fromFile(of.file).getLines.mkString("\n").split("</DOC>")
  val numbered = rawContent.zipWithIndex.map { case (d, i) =>
    val id = i + of.start
    val name =
      """<DOCNO>(.+)</DOCNO>""".r.findFirstMatchIn(d).get.group(1).trim
    Doc(id, name, d, Seq.empty)
  }
  numbered.toSeq
}
// Split on runs of whitespace so consecutive spaces do not produce empty tokens.
val tokenize = (d: Doc) => d.copy(text = """\s+""".r.split(d.content).toSeq)
val normalize = (d: Doc) => d.copy(text = d.text.map(_.toLowerCase))
val parsedDocuments =
  shifted.par.flatMap(intoDocuments).map(tokenize).map(normalize)
case class IdLength(val id: Int, val length: Int) extends KeyedData {
  val key = id.toString
}
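// Two lengths writers are defined: a plain-text one (id \t length) and a
// binary encoder. Only the binary encoder is applied below.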
val writeLengths = new Writer[IdLength, Unit](prefix, "lengths") {
  def apply(p: IdLength): Unit = dstFile.write(s"${p.id}\t${p.length}\n")
}
val encodeLengths = new IndexWriterFunction[IdLength](prefix, "lengths",
  new LengthsCodec())
parsedDocuments.map {
  doc => IdLength(doc.id, doc.text.length)
}.seq.sortBy(_.id).foreach(encodeLengths)
encodeLengths.close
// Document names: id -> DOCNO mapping, written as a plain-text file.
case class IdName(val id: Int, val name: String)
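// IdName does not extend KeyedData, so the binary IndexWriterFunction cannot
// be used for it; the plain-text Writer is used instead.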
val writeNames = new Writer[IdName, Unit](prefix, "names") {
  def apply(p: IdName): Unit = dstFile.write(s"${p.id}\t${p.name}\n")
}
parsedDocuments.map {
  doc => IdName(doc.id, doc.name)
}.seq.sortBy(_.id).foreach(writeNames)
writeNames.close
// Postings: for each document, group term occurrences by term and record the
// in-document positions of each term.
val getPostings = (d: Doc) => {
  val termPositions = d.text.zipWithIndex.groupBy(_._1)
  val postings = termPositions.keys.map { term =>
    val pos = termPositions(term).map(_._2).sorted.toArray
    PositionsPosting(term, d.id, pos.length, new ExtentArray(pos))
  }
  postings
}
// Plain-text postings writer (term \t docid \t count \t comma-separated
// positions). Like writeLengths, it is defined but not applied below; the
// binary encoder is used instead.
val writePostings = new Writer[PositionsPosting, Unit](prefix, "postings") {
  def apply(p: PositionsPosting): Unit = {
    val builder = List.newBuilder[Int]
    p.positions.reset
    while (p.positions.hasNext) {
      builder += p.positions.next
    }
    val pStr = builder.result.mkString(",")
    val str = s"${p.term}\t${p.docid}\t${p.count}\t${pStr}\n"
    dstFile.write(str)
  }
}
val encodePostings =
  new IndexWriterFunction[PositionsPosting](prefix, "postings",
    new PositionsPostingCodec())
parsedDocuments.flatMap {
  doc => getPostings(doc)
}.seq.sorted.foreach(encodePostings)
// Boo - how to avoid this?
encodePostings.close
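// One possible answer to the question above, as a sketch rather than part of
// the original pipeline: a small loan-pattern helper that closes the writer
// automatically. Only names already defined in this file are assumed.
def withWriter[T <: KeyedData](w: IndexWriterFunction[T])(body: IndexWriterFunction[T] => Unit): Unit =
  try body(w) finally w.close
// Usage would then look roughly like:
// withWriter(new IndexWriterFunction[PositionsPosting](prefix, "postings",
//   new PositionsPostingCodec())) { enc =>
//   parsedDocuments.flatMap(getPostings).seq.sorted.foreach(enc)
// }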
// corpus -- fairly redundant - don't worry about this yet
//parsedDocuments.sorted.foreach(doc => write(doc))