Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CSV parsing for missing values in first row if non float column #21

Merged
merged 5 commits into from
Dec 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion changelog.org
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
- fix CSV parsing wrt. empty fields (treated as NaN) and explicit NaN
& Inf values
- fix CSV parsing of files with extraneous newlines
- add more parsing tests
- fix CSV parsing with missing values at the end of a line (becomes
=NaN=)
- fix CSV parsing of empty fields if missing in first row and element
is *not* float
- add more parsing tests
* v0.1.9
- add basic implementation of =spread= (inverse of =gather=; similar
to dplyr =pivot_wider=). The current implementation is rather basic
Expand Down
2 changes: 1 addition & 1 deletion datamancer.nimble
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Package

version = "0.1.8"
version = "0.1.10"
author = "Vindaar"
description = "A dataframe library with a dplyr like API"
license = "MIT"
Expand Down
4 changes: 2 additions & 2 deletions docs/docs.nim
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ proc getNimRootDir(): string =
import "$nim/testament/lib/stdtest/specialpaths.nim"
nimRootDir
]#
fmt"{currentSourcePath}".parentDir.parentDir.parentDir
getCurrentCompilerExe().parentDir.parentDir

const
DirSep = when defined(windows): '\\' else: '/'
Expand Down Expand Up @@ -94,7 +94,7 @@ proc buildDocs*(path: string, docPath: string,
## https://github.com/nim-lang/Nim/pull/11814 is required.
##
##
const gitUrl = "https://github.com/Vindaar/datamancer"
const gitUrl = "https://github.com/SciNim/datamancer"
## WARNING: this means `gen_docs` *only* works if you use `nimble develop` on
## the repository. Nimble cannot deal with ****. This is frustrating. Thanks.
let baseDir = execAction("nimble path datamancer").parentDir & $DirSep
Expand Down
52 changes: 36 additions & 16 deletions src/datamancer/io.nim
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,18 @@ template guessType(data: ptr UncheckedArray[char], buf: var string,
# only determine types for as many cols as in header
if col < numCols:
copyBuf(data, buf, idx, colStart)
if buf.isInt:
colTypes[col] = colInt
elif buf.isNumber:
colTypes[col] = colFloat
elif buf.isBool:
colTypes[col] = colBool
else:
colTypes[col] = colString
if colTypes[col] == colNone: # do not overwrite if already set
if buf.len == 0:
# inconclusive, need to look at next line
colTypes[col] = colNone
elif buf.isInt:
colTypes[col] = colInt
elif buf.isNumber:
colTypes[col] = colFloat
elif buf.isBool:
colTypes[col] = colBool
else:
colTypes[col] = colString

proc i64(c: char): int {.inline.} =
  ## Converts a single ASCII digit character (`'0'..'9'`) to its
  ## integer value, e.g. `'7'` becomes `7`.
  ## NOTE(review): no validation is performed — callers must ensure `c`
  ## is a digit, otherwise the result is out of the 0..9 range.
  c.ord - '0'.ord

Expand Down Expand Up @@ -340,6 +344,18 @@ template parseCol(data: ptr UncheckedArray[char], buf: var string,
raise newException(IOError, "Invalid column type to parse into: `colNone`. " &
"This shouldn't have happened! row = " & $row & ", col = " & $col)

template advanceToNextRow() {.dirty.} =
  ## The steps done after a line break is found & we advance to the next row.
  ##
  ## Stored in a dirty template as we also use it while guessing types.
  ## Being `{.dirty.}`, it expects the caller's scope to provide `row`,
  ## `col`, `data`, `idx`, `colStart`, `rowStart` and `lastWasSep`.
  inc row
  col = 0  # restart column counting for the new row
  if data[idx] == '\r' and data[idx + 1] == '\l':
    inc idx  # CRLF line ending: skip the '\l' so we don't count an empty row
  colStart = idx + 1  # next field begins right after the line break
  rowStart = idx + 1
  lastWasSep = false

template parseLine(data: ptr UncheckedArray[char], buf: var string,
sep: char,
quote: char,
Expand All @@ -359,13 +375,7 @@ template parseLine(data: ptr UncheckedArray[char], buf: var string,
rowStart = idx + 1
elif unlikely(data[idx] in {'\n', '\r', '\l'}):
fnToCall
inc row
col = 0
if data[idx] == '\r' and data[idx + 1] == '\l':
inc idx
colStart = idx + 1
rowStart = idx + 1
lastWasSep = false
advanceToNextRow()
when toBreak:
inc idx
break
Expand All @@ -391,6 +401,10 @@ template parseLine(data: ptr UncheckedArray[char], buf: var string,
discard
inc idx

proc allColTypesSet(colTypes: seq[ColKind]): bool =
  ## Returns `true` once every column type has been determined,
  ## i.e. no entry is still at the default `colNone`.
  for kind in colTypes:
    if kind == colNone:
      return false
  result = true

proc readCsvTypedImpl(data: ptr UncheckedArray[char],
size: int,
lineCnt: int,
Expand Down Expand Up @@ -451,21 +465,27 @@ proc readCsvTypedImpl(data: ptr UncheckedArray[char],
var colTypes = newSeq[ColKind](numCols)
var lastIdx = idx
var lastColStart = colStart
var lastRow = row
var dataColsIdx = 0
while idx < size:
parseLine(data, buf, sep, quote, col, idx, colStart, row, rowStart, lastWasSep, inQuote, toBreak = true):
guessType(data, buf, colTypes, col, idx, colStart, numCols)
# if we see the end of the line, store the current column number
if data[idx] in {'\n', '\r', '\l'}:
dataColsIdx = col
if not allColTypesSet(colTypes): # manually perform steps to go to next line and skip
# `when toBreak` logic
advanceToNextRow()
inc idx
continue

if dataColsIdx + 1 != numCols:
raise newException(IOError, "Input data contains " & $(dataColsIdx + 1) & " in the data portion, but " &
$numCols & " columns in the header.")
# 2a. revert the indices (make it a peek)
idx = lastIdx
colStart = lastColStart
dec row
row = lastRow
# 3. create the starting columns
var cols = newSeq[Column](numCols)
let dataLines = lineCnt - skippedLines
Expand Down
43 changes: 38 additions & 5 deletions tests/testDf.nim
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ NaN,N/A,0.3,300
check df["x"].kind == colObject # because of invalid floats
check df["y"].kind == colFloat
check df["z"].kind == colInt
echo df
check cmpElements(df["w", float].toRawSeq, @[1'f64, 2, NaN, 4])
check cmpElements(df["x", Value].toRawSeq, @[%~ 10, %~ "ERR", %~ "N/A", %~ 40])
check cmpElements(df["y", float].toRawSeq, @[0.1, Inf, 0.3, 0.4])
Expand Down Expand Up @@ -161,7 +160,7 @@ NaN,N/A,0.3,300
checkBlock()
removeFile(path)

test "Parsing with missing values":
test "Parsing with missing values, float":
let exp = """x,y,z
1,2,
4,,6
Expand All @@ -170,17 +169,51 @@ NaN,N/A,0.3,300
template checkBlock(): untyped {.dirty.} =
check df["x"].kind == colFloat
check df["y"].kind == colFloat
#check df["z"].kind == colFloat
echo df
check df["z"].kind == colFloat
check cmpElements(df["x", float].toRawSeq, @[1'f64,4,NaN])
check cmpElements(df["y", float].toRawSeq, @[2'f64,NaN,8])
#check df["z", float] == toTensor([NaN,6,9])
check cmpElements(df["z", float].toRawSeq, @[NaN,6,9])

block FromString:
let df = parseCsvString(exp)
checkBlock()

block FromFile:
let path = "/tmp/test_missing_datamancer.csv"
when defined(linux):
## XXX: use proper temp handling to check on other OSs
writeFile(path, exp)
let df = readCsv(path)
checkBlock()
removeFile(path)

test "Parsing with missing values, string":
let exp = """x,y,z
a,2,
aa,3,
b,,foo
,8,bar
"""
template checkBlock(): untyped {.dirty.} =
check df["x"].kind == colString
check df["y"].kind == colFloat
check df["z"].kind == colString
check cmpElements(df["x", string].toRawSeq, @["a", "aa", "b", ""])
check cmpElements(df["y", float].toRawSeq, @[2'f64,3,NaN,8])
check cmpElements(df["z", string].toRawSeq, @["","","foo","bar"])

block FromString:
let df = parseCsvString(exp)
checkBlock()

block FromFile:
let path = "/tmp/test_missing_string_datamancer.csv"
when defined(linux):
## XXX: use proper temp handling to check on other OSs
writeFile(path, exp)
let df = readCsv(path)
checkBlock()
removeFile(path)

suite "DataFrame tests":
test "Creation of DFs from seqs":
Expand Down