Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix CSV parsing for missing values in first row if non float column #21

Merged
merged 5 commits into from
Dec 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion changelog.org
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
- fix CSV parsing wrt. empty fields (treated as NaN) and explicit NaN
& Inf values
- fix CSV parsing of files with extraneous newlines
- add more parsing tests
- fix CSV parsing with missing values at the end of a line (becomes
=NaN=)
- fix CSV parsing of empty fields if missing in first row and element
is *not* float
- add more parsing tests
* v0.1.9
- add basic implementation of =spread= (inverse of =gather=; similar
to dplyr =pivot_wider=). The current implementation is rather basic
Expand Down
2 changes: 1 addition & 1 deletion datamancer.nimble
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Package

version = "0.1.8"
version = "0.1.10"
author = "Vindaar"
description = "A dataframe library with a dplyr like API"
license = "MIT"
Expand Down
4 changes: 2 additions & 2 deletions docs/docs.nim
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ proc getNimRootDir(): string =
import "$nim/testament/lib/stdtest/specialpaths.nim"
nimRootDir
]#
fmt"{currentSourcePath}".parentDir.parentDir.parentDir
getCurrentCompilerExe().parentDir.parentDir

const
DirSep = when defined(windows): '\\' else: '/'
Expand Down Expand Up @@ -94,7 +94,7 @@ proc buildDocs*(path: string, docPath: string,
## https://github.com/nim-lang/Nim/pull/11814 is required.
##
##
const gitUrl = "https://github.com/Vindaar/datamancer"
const gitUrl = "https://github.com/SciNim/datamancer"
## WARNING: this means `gen_docs` *only* works if you use `nimble develop` on
## the repository. Nimble cannot deal with ****. This is frustrating. Thanks.
let baseDir = execAction("nimble path datamancer").parentDir & $DirSep
Expand Down
52 changes: 36 additions & 16 deletions src/datamancer/io.nim
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,18 @@ template guessType(data: ptr UncheckedArray[char], buf: var string,
# only determine types for as many cols as in header
if col < numCols:
copyBuf(data, buf, idx, colStart)
if buf.isInt:
colTypes[col] = colInt
elif buf.isNumber:
colTypes[col] = colFloat
elif buf.isBool:
colTypes[col] = colBool
else:
colTypes[col] = colString
if colTypes[col] == colNone: # do not overwrite if already set
if buf.len == 0:
# inconclusive, need to look at next line
colTypes[col] = colNone
elif buf.isInt:
colTypes[col] = colInt
elif buf.isNumber:
colTypes[col] = colFloat
elif buf.isBool:
colTypes[col] = colBool
else:
colTypes[col] = colString

proc i64(c: char): int {.inline.} =
  ## Converts a single ASCII digit character (`'0'..'9'`) to its
  ## integer value, e.g. `'7'` becomes `7`.
  ## NOTE(review): no validation is performed — callers must ensure `c`
  ## is a digit, otherwise the result is out of the 0..9 range.
  c.ord - '0'.ord

Expand Down Expand Up @@ -340,6 +344,18 @@ template parseCol(data: ptr UncheckedArray[char], buf: var string,
raise newException(IOError, "Invalid column type to parse into: `colNone`. " &
"This shouldn't have happened! row = " & $row & ", col = " & $col)

template advanceToNextRow() {.dirty.} =
  ## The steps done after a line break is found & we advance to the next row.
  ##
  ## Stored in a dirty template as we also use it while guessing types.
  ## Being `{.dirty.}`, it expects the caller's scope to provide `row`,
  ## `col`, `data`, `idx`, `colStart`, `rowStart` and `lastWasSep`.
  inc row
  col = 0  # restart column counting for the new row
  if data[idx] == '\r' and data[idx + 1] == '\l':
    inc idx  # CRLF line ending: skip the '\l' so we don't count an empty row
  colStart = idx + 1  # next field begins right after the line break
  rowStart = idx + 1
  lastWasSep = false

template parseLine(data: ptr UncheckedArray[char], buf: var string,
sep: char,
quote: char,
Expand All @@ -359,13 +375,7 @@ template parseLine(data: ptr UncheckedArray[char], buf: var string,
rowStart = idx + 1
elif unlikely(data[idx] in {'\n', '\r', '\l'}):
fnToCall
inc row
col = 0
if data[idx] == '\r' and data[idx + 1] == '\l':
inc idx
colStart = idx + 1
rowStart = idx + 1
lastWasSep = false
advanceToNextRow()
when toBreak:
inc idx
break
Expand All @@ -391,6 +401,10 @@ template parseLine(data: ptr UncheckedArray[char], buf: var string,
discard
inc idx

proc allColTypesSet(colTypes: seq[ColKind]): bool =
  ## Returns `true` once every column type has been determined,
  ## i.e. no entry is still at the default `colNone`.
  for kind in colTypes:
    if kind == colNone:
      return false
  result = true

proc readCsvTypedImpl(data: ptr UncheckedArray[char],
size: int,
lineCnt: int,
Expand Down Expand Up @@ -451,21 +465,27 @@ proc readCsvTypedImpl(data: ptr UncheckedArray[char],
var colTypes = newSeq[ColKind](numCols)
var lastIdx = idx
var lastColStart = colStart
var lastRow = row
var dataColsIdx = 0
while idx < size:
parseLine(data, buf, sep, quote, col, idx, colStart, row, rowStart, lastWasSep, inQuote, toBreak = true):
guessType(data, buf, colTypes, col, idx, colStart, numCols)
# if we see the end of the line, store the current column number
if data[idx] in {'\n', '\r', '\l'}:
dataColsIdx = col
if not allColTypesSet(colTypes): # manually perform steps to go to next line and skip
# `when toBreak` logic
advanceToNextRow()
inc idx
continue

if dataColsIdx + 1 != numCols:
raise newException(IOError, "Input data contains " & $(dataColsIdx + 1) & " in the data portion, but " &
$numCols & " columns in the header.")
# 2a. revert the indices (make it a peek)
idx = lastIdx
colStart = lastColStart
dec row
row = lastRow
# 3. create the starting columns
var cols = newSeq[Column](numCols)
let dataLines = lineCnt - skippedLines
Expand Down
43 changes: 38 additions & 5 deletions tests/testDf.nim
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ NaN,N/A,0.3,300
check df["x"].kind == colObject # because of invalid floats
check df["y"].kind == colFloat
check df["z"].kind == colInt
echo df
check cmpElements(df["w", float].toRawSeq, @[1'f64, 2, NaN, 4])
check cmpElements(df["x", Value].toRawSeq, @[%~ 10, %~ "ERR", %~ "N/A", %~ 40])
check cmpElements(df["y", float].toRawSeq, @[0.1, Inf, 0.3, 0.4])
Expand Down Expand Up @@ -161,7 +160,7 @@ NaN,N/A,0.3,300
checkBlock()
removeFile(path)

test "Parsing with missing values":
test "Parsing with missing values, float":
let exp = """x,y,z
1,2,
4,,6
Expand All @@ -170,17 +169,51 @@ NaN,N/A,0.3,300
template checkBlock(): untyped {.dirty.} =
check df["x"].kind == colFloat
check df["y"].kind == colFloat
#check df["z"].kind == colFloat
echo df
check df["z"].kind == colFloat
check cmpElements(df["x", float].toRawSeq, @[1'f64,4,NaN])
check cmpElements(df["y", float].toRawSeq, @[2'f64,NaN,8])
#check df["z", float] == toTensor([NaN,6,9])
check cmpElements(df["z", float].toRawSeq, @[NaN,6,9])

block FromString:
let df = parseCsvString(exp)
checkBlock()

block FromFile:
let path = "/tmp/test_missing_datamancer.csv"
when defined(linux):
## XXX: use proper temp handling to check on other OSs
writeFile(path, exp)
let df = readCsv(path)
checkBlock()
removeFile(path)

test "Parsing with missing values, string":
let exp = """x,y,z
a,2,
aa,3,
b,,foo
,8,bar
"""
template checkBlock(): untyped {.dirty.} =
check df["x"].kind == colString
check df["y"].kind == colFloat
check df["z"].kind == colString
check cmpElements(df["x", string].toRawSeq, @["a", "aa", "b", ""])
check cmpElements(df["y", float].toRawSeq, @[2'f64,3,NaN,8])
check cmpElements(df["z", string].toRawSeq, @["","","foo","bar"])

block FromString:
let df = parseCsvString(exp)
checkBlock()

block FromFile:
let path = "/tmp/test_missing_string_datamancer.csv"
when defined(linux):
## XXX: use proper temp handling to check on other OSs
writeFile(path, exp)
let df = readCsv(path)
checkBlock()
removeFile(path)

suite "DataFrame tests":
test "Creation of DFs from seqs":
Expand Down