-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreadChunks.lua
63 lines (46 loc) · 2.27 KB
/
readChunks.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
-- Module table. Deliberately a global (not `local`): nothing visible here
-- returns it, so other files presumably reach it by this name.
readChunks = {}

--- Read the list of corpus files, one path per line.
-- Opens the file named by the global LIST_OF_FILES_TO_READ, reads it whole
-- and splits it into its non-empty lines.
-- @treturn table array of the lines (file paths) in file order
-- @raise error if the list file cannot be opened
function readChunks.createListOfFiles()
  -- Use a private handle instead of io.input(): redirecting and then closing
  -- the default input stream would leave every later io.read() in the
  -- program pointing at a closed file.
  local handle, openError = io.open(LIST_OF_FILES_TO_READ, "r")
  if not handle then
    error("readChunks.createListOfFiles: cannot open list of files: " .. tostring(openError))
  end
  local contents = handle:read("*a")
  handle:close()

  local files = {}
  -- Split on both CR and LF so that a list saved with CRLF line endings does
  -- not produce paths carrying a trailing "\r".
  for line in string.gmatch(contents, "[^\r\n]+") do
    files[#files + 1] = line
  end
  return files
end
-- Paths of the corpus files, loaded once when this file is executed.
readChunks.files = readChunks.createListOfFiles()

-- Starts out empty; nothing in the visible code fills it.
readChunks.corpus = {}

-- Cursor over the corpus, advanced by readChunks.readNextChunk().
readChunks.corpusReading = {
  -- index of the last element of currentFileContents already handed out
  lastPosition = 1,
  -- contents of the file currently being read
  currentFileContents = {},
  -- index into readChunks.files; 0 means no file has been loaded yet
  currentFile = 0,
  -- batchIndex -> {file, offset, length} of the chunk last read for that index
  locationsOfLastBatchChunks = {},
}
--- Rewind the corpus cursor to its initial state.
-- Installs a brand-new table as readChunks.corpusReading: no file loaded,
-- empty contents, and no recorded batch chunk locations.
function readChunks.resetFileIterator()
  local freshState = {}
  freshState.lastPosition = 1
  freshState.currentFileContents = {}
  freshState.currentFile = 0
  freshState.locationsOfLastBatchChunks = {}
  readChunks.corpusReading = freshState
end
--- Tell whether any listed file comes after the one currently being read.
-- @treturn boolean true while the current file index is below the number of
--   entries in readChunks.files
function readChunks.hasNextFile()
  local currentIndex = readChunks.corpusReading.currentFile
  local fileCount = #readChunks.files
  return currentIndex < fileCount
end
--- Read the next chunk of params.seq_length elements from the corpus.
-- Advances to the next listed file whenever the current one has no more than
-- params.seq_length elements left after the cursor; that leftover tail is
-- skipped. When DOING_EVALUATION_OUTPUT is set, every newly loaded file is
-- padded with params.seq_length trailing 1s, which makes the chunks cover all
-- of the file's real elements.
-- NOTE(review): the `>=` below also skips a tail of exactly seq_length
-- elements -- possibly intended, confirm against the callers.
-- @treturn table the chunk (array of params.seq_length elements)
-- @treturn number index in readChunks.files of the file the chunk came from
-- @treturn number 1-based position of the chunk's first element in that file
-- @treturn number length of the chunk (params.seq_length)
-- @raise error when a new file is needed but the file list is used up, e.g.
--   because the last files are not longer than params.seq_length
function readChunks.readNextChunk()
  local state = readChunks.corpusReading
  local seqLength = params.seq_length

  -- read a new file if necessary
  while state.lastPosition + seqLength >= #state.currentFileContents do
    -- Stop with a clear message instead of indexing past the end of the file
    -- list and passing nil to readFiles.readAFile.
    if state.currentFile >= #readChunks.files then
      error("readChunks.readNextChunk: no more files to read "
        .. "(corpus exhausted or remaining files not longer than params.seq_length)")
    end
    state.lastPosition = 0
    state.currentFile = state.currentFile + 1
    state.currentFileContents = readFiles.readAFile(readChunks.files[state.currentFile])
    if DOING_EVALUATION_OUTPUT then
      for _ = 1, seqLength do
        table.insert(state.currentFileContents, 1)
      end
    end
  end

  local startPosition = state.lastPosition + 1
  local dataPointFromFile = {}
  for offset = 1, seqLength do
    dataPointFromFile[offset] = state.currentFileContents[state.lastPosition + offset]
  end
  state.lastPosition = state.lastPosition + seqLength
  return dataPointFromFile, state.currentFile, startPosition, seqLength
end
--- Read the next chunk and record where it came from for one batch slot.
-- The location is stored in
-- readChunks.corpusReading.locationsOfLastBatchChunks[batchIndex] as a table
-- with the fields file, offset and length.
-- @param batchIndex key under which the chunk's location is recorded
-- @return the chunk returned by readChunks.readNextChunk()
function readChunks.readNextChunkForBatchItem(batchIndex)
  local chunk, fileIndex, firstOffset, chunkLength = readChunks.readNextChunk()
  local location = {}
  location.file = fileIndex
  location.offset = firstOffset
  location.length = chunkLength
  readChunks.corpusReading.locationsOfLastBatchChunks[batchIndex] = location
  return chunk
end