From bc3f9175b59b1b2ebfb6924799802b1f5c973c41 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 14 Sep 2022 15:25:24 +0200 Subject: [PATCH 01/12] add depedency for temporary, bump version number --- xls.cabal | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xls.cabal b/xls.cabal index 0acf162..cee814d 100644 --- a/xls.cabal +++ b/xls.cabal @@ -1,5 +1,5 @@ name: xls -version: 0.1.3 +version: 0.1.4 synopsis: Parse Microsoft Excel xls files (BIFF/Excel 97-2004) description: Parse Microsoft Excel spreadsheet files in @.xls@ file format @@ -50,9 +50,11 @@ library hs-source-dirs: lib exposed-modules: Data.Xls build-depends: base >= 4.7 && < 5 + , bytestring >= 0.10 , conduit >= 1.1 && < 1.4 , filepath >= 1.0 && < 1.5 , resourcet >= 0.3 && < 1.3 + , temporary , transformers >= 0.1 && < 0.6 c-sources: lib/libxls-wrapper.c, From 4925ca760f57052a911eb6e8392a396e7f7525e6 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 14 Sep 2022 15:27:03 +0200 Subject: [PATCH 02/12] add functions to decode ByteStrings --- lib/Data/Xls.hs | 102 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 9 deletions(-) diff --git a/lib/Data/Xls.hs b/lib/Data/Xls.hs index 6aeafd6..942eb61 100644 --- a/lib/Data/Xls.hs +++ b/lib/Data/Xls.hs @@ -21,22 +21,32 @@ module Data.Xls ( decodeXlsIO + , decodeXlsByteString + , decodeXlsByteString' , decodeXls , XlsException(..) + , XLSError(..) ) where -import Control.Exception (Exception, throwIO, bracket) +import Control.Exception (Exception, throwIO, bracket, catch) import Control.Monad.IO.Class -import Control.Monad (when, void) +import Control.Monad (void) import Control.Monad.Trans.Resource import Data.Conduit hiding (Conduit, Sink, Source) import Data.Data import Data.Int +import Data.Word (Word32) import Data.Maybe (catMaybes, fromJust, isJust, fromMaybe) +import Data.ByteString (hPut) +import Data.ByteString.Internal (ByteString(..)) import Foreign.C import Foreign.Ptr +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Storable (Storable(..)) +import Foreign.Marshal.Alloc (malloc) import Text.Printf +import System.IO.Temp (withSystemTempFile) #define CCALL(name,signature) \ foreign import ccall unsafe #name \ @@ -44,9 +54,43 @@ foreign import ccall unsafe #name \ -- Workbook accessor functions data XLSWorkbookStruct +-- | An enum returned by libxls. See libxls\/include\/xls.h +data XLSError = LIBXLS_OK + | LIBXLS_ERROR_OPEN + | LIBXLS_ERROR_SEEK + | LIBXLS_ERROR_READ + | LIBXLS_ERROR_PARSE + | LIBXLS_ERROR_MALLOC + deriving (Show,Eq,Enum) +instance Storable XLSError where + sizeOf = const (sizeOf (0 :: Word32)) -- TODO: sizeof enum is compiler and architecture dependent! + alignment = sizeOf -- okay for simple Storables + peek = fmap (toEnum . (fromIntegral :: Word32 -> Int)) . peek . castPtr + poke ptr e = poke (castPtr ptr) ((fromIntegral :: Int -> Word32).fromEnum $ e) type XLSWorkbook = Ptr XLSWorkbookStruct +type XLSErrorT = Ptr XLSError +type CBuffer = Ptr CUChar + +-- | Recall that +-- @ +-- ByteString ~ (ForeignPtr Char8,Int) +-- CUChar ~ Word8 +-- CSize ~ Word64 +-- @ +-- +-- So we need to marshal +-- +-- @ +-- (ForeignPtr Char8) -> Ptr CUChar +-- Int -> CSize +-- @ +toCBuffer :: ByteString -> IO (CBuffer,CSize) +toCBuffer (PS fPtr offset ilen) = do + withForeignPtr fPtr $ \ptrData -> do + return (plusPtr (castPtr ptrData) offset,CSize (fromIntegral ilen)) CCALL(xls_open, CString -> CString -> IO XLSWorkbook) +CCALL(xls_open_buffer, CBuffer -> CSize -> CString -> XLSErrorT -> IO XLSWorkbook) CCALL(xls_wb_sheetcount, XLSWorkbook -> IO CInt -- Int32) CCALL(xls_close_WB, XLSWorkbook -> IO ()) @@ -82,6 +126,13 @@ data XlsException = instance Exception XlsException +exceptionLeft :: XlsException -> Either XLSError a +exceptionLeft (XlsFileNotFound _) = Left LIBXLS_ERROR_OPEN +exceptionLeft (XlsParseError _) = Left LIBXLS_ERROR_PARSE + +catchXls :: IO a -> IO (Either XLSError a) +catchXls = flip catch (return.exceptionLeft) . fmap Right + -- | Parse a Microsoft excel xls workbook file into a Conduit yielding -- rows in a worksheet. Each row represented by a list of Strings, each String -- representing an individual cell. @@ -111,6 +162,27 @@ decodeXls file = count <- liftIO $ c_xls_wb_sheetcount pWB mapM_ (decodeOneWorkSheet file pWB) [0 .. count - 1] +-- | A work-around via temporary files and 'decodeXlsIO', +-- since this lib is lacking a pure function to decode the contents +-- of an XLS file. +-- Due to Erik Rybakken. +decodeXlsByteString :: ByteString -> IO [[[String]]] +decodeXlsByteString content = withSystemTempFile "decodeXlsByteString" + $ \filePath h -> do + hPut h content + decodeXlsIO filePath + +-- | Experimental: This function uses the @xls_open_buffer@ function of libxls. +decodeXlsByteString' :: ByteString -> IO (Either XLSError [[[String]]]) +decodeXlsByteString' bs = do + (buf,buflen) <- toCBuffer bs + enc <- newCString "UTF-8" + outError <- malloc + wb <- c_xls_open_buffer buf buflen enc outError + e <- peek outError + case e of + LIBXLS_OK -> decodeXLSWorkbook Nothing wb + _ -> return (Left e) -- | Parse a Microsoft excel xls workbook file into a list of worksheets, each -- worksheet consists of a list of rows and each row consists of a list of @@ -124,13 +196,25 @@ decodeXlsIO decodeXlsIO file = do file' <- newCString file pWB <- newCString "UTF-8" >>= c_xls_open file' - when (pWB == nullPtr) $ - throwIO $ XlsFileNotFound - $ "XLS file " ++ file ++ " not found." - count <- liftIO $ c_xls_wb_sheetcount pWB - results <- mapM (decodeOneWorkSheetIO file pWB) [0 .. count - 1] - void $ c_xls_close_WB pWB - return results + parseResult <- decodeXLSWorkbook (Just file) pWB + case parseResult of + Right results -> return results + Left e -> case e of + LIBXLS_ERROR_OPEN -> throwIO $ XlsFileNotFound $ + "XLS file " ++ file ++ " not found." + _ -> throwIO $ XlsParseError $ + "XLS file " ++ file ++ " could not be parsed." + +-- helper function for decoding both file and buffer +decodeXLSWorkbook :: Maybe FilePath -> XLSWorkbook -> IO (Either XLSError [[[String]]]) +decodeXLSWorkbook mFile pWB = if pWB == nullPtr + then return (Left LIBXLS_ERROR_OPEN) + else catchXls $ do + count <- liftIO $ c_xls_wb_sheetcount pWB + results <- mapM (decodeOneWorkSheetIO (maybe "buffer" id mFile) pWB) [0 .. count - 1] + void $ c_xls_close_WB pWB + return results + decodeOneWorkSheet :: MonadResource m From 5e1a09a4e27dad9b531921dbc8f46b58783c41a2 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:59:09 +0200 Subject: [PATCH 03/12] expose Data.XlsCell module --- xls.cabal | 1 + 1 file changed, 1 insertion(+) diff --git a/xls.cabal b/xls.cabal index cee814d..c9c8ca4 100644 --- a/xls.cabal +++ b/xls.cabal @@ -49,6 +49,7 @@ library hs-source-dirs: lib exposed-modules: Data.Xls + , Data.XlsCell build-depends: base >= 4.7 && < 5 , bytestring >= 0.10 , conduit >= 1.1 && < 1.4 From e899008dccb3c0be16c0d5301bfdc973d9afa9d9 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:00:31 +0200 Subject: [PATCH 04/12] make decodeXlsIO return Cell instead of String --- lib/Data/Xls.hs | 49 ++++++++++++++++++++++++--------------------- lib/Data/XlsCell.hs | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 23 deletions(-) create mode 100644 lib/Data/XlsCell.hs diff --git a/lib/Data/Xls.hs b/lib/Data/Xls.hs index 942eb61..acf3e85 100644 --- a/lib/Data/Xls.hs +++ b/lib/Data/Xls.hs @@ -37,15 +37,15 @@ import Data.Conduit hiding (Conduit, Sink, Source) import Data.Data import Data.Int import Data.Word (Word32) -import Data.Maybe (catMaybes, fromJust, isJust, fromMaybe) +import Data.Maybe (catMaybes, fromJust, isJust) import Data.ByteString (hPut) import Data.ByteString.Internal (ByteString(..)) +import Data.XlsCell (CellF(..),Cell,cellToString) import Foreign.C import Foreign.Ptr import Foreign.ForeignPtr (withForeignPtr) import Foreign.Storable (Storable(..)) import Foreign.Marshal.Alloc (malloc) -import Text.Printf import System.IO.Temp (withSystemTempFile) #define CCALL(name,signature) \ @@ -166,14 +166,14 @@ decodeXls file = -- since this lib is lacking a pure function to decode the contents -- of an XLS file. -- Due to Erik Rybakken. -decodeXlsByteString :: ByteString -> IO [[[String]]] +decodeXlsByteString :: ByteString -> IO [[[Cell]]] decodeXlsByteString content = withSystemTempFile "decodeXlsByteString" $ \filePath h -> do hPut h content decodeXlsIO filePath -- | Experimental: This function uses the @xls_open_buffer@ function of libxls. -decodeXlsByteString' :: ByteString -> IO (Either XLSError [[[String]]]) +decodeXlsByteString' :: ByteString -> IO (Either XLSError [[[Cell]]]) decodeXlsByteString' bs = do (buf,buflen) <- toCBuffer bs enc <- newCString "UTF-8" @@ -192,7 +192,7 @@ decodeXlsByteString' bs = do -- decodeXlsIO :: FilePath - -> IO [[[String]]] + -> IO [[[Cell]]] decodeXlsIO file = do file' <- newCString file pWB <- newCString "UTF-8" >>= c_xls_open file' @@ -206,7 +206,7 @@ decodeXlsIO file = do "XLS file " ++ file ++ " could not be parsed." -- helper function for decoding both file and buffer -decodeXLSWorkbook :: Maybe FilePath -> XLSWorkbook -> IO (Either XLSError [[[String]]]) +decodeXLSWorkbook :: Maybe FilePath -> XLSWorkbook -> IO (Either XLSError [[[Cell]]]) decodeXLSWorkbook mFile pWB = if pWB == nullPtr then return (Left LIBXLS_ERROR_OPEN) else catchXls $ do @@ -239,7 +239,7 @@ decodeOneWorkSheetIO :: FilePath -> XLSWorkbook -> CInt - -> IO [[String]] + -> IO [[Cell]] decodeOneWorkSheetIO file pWB index = bracket alloc cleanup decodeRowsIO where @@ -263,7 +263,7 @@ decodeRows pWS = do decodeRowsIO :: XLSWorksheet - -> IO [[String]] + -> IO [[Cell]] decodeRowsIO pWS = do rows <- c_xls_ws_rowcount pWS cols <- c_xls_ws_colcount pWS @@ -281,22 +281,27 @@ decodeOneRowIO :: XLSWorksheet -> Int16 -> Int16 - -> IO [String] + -> IO [Cell] decodeOneRowIO pWS cols rowindex = mapM (c_xls_cell pWS rowindex) [0 .. cols - 1] - >>= mapM decodeOneCell - >>= pure . (map $ fromMaybe "") + >>= mapM decodeOneCell' data CellType = Numerical | Formula | Str | Other decodeOneCell :: XLSCell -> IO (Maybe String) -decodeOneCell cellPtr = do +decodeOneCell = fmap maybeString . decodeOneCell' where + maybeString (OtherCell _) = Nothing + maybeString c = Just (cellToString c) + +decodeOneCell' :: XLSCell -> IO Cell +decodeOneCell' cellPtr = do nil <- isNullCell cellPtr if nil then - return Nothing - else cellValue cellPtr >>= return . Just + return (OtherCell ()) + else cellValue cellPtr where + emptyCell = OtherCell () isNullCell ptr = if ptr == nullPtr then return True @@ -321,21 +326,19 @@ decodeOneCell cellPtr = do return Nothing return $ case cellType typ ftype strval of - Numerical -> outputNum numval + Numerical -> let (CDouble d) = numval in NumericalCell d Formula -> decodeFormula strval numval - Str -> fromJust strval - Other -> "" -- we don't decode anything else + Str -> (TextCell . fromJust) strval + Other -> emptyCell -- we don't decode anything else decodeFormula str numval = case str of Just "bool" -> outputBool numval - Just "error" -> "*error*" - Just x -> x - Nothing -> "" -- is it possible? + Just "error" -> TextCell "*error*" + Just x -> TextCell x + Nothing -> emptyCell -- is it possible? - outputNum d = printf "%.15g" (uncurry encodeFloat (decodeFloat d) - :: Double) - outputBool d = if d == 0 then "false" else "true" + outputBool d = BoolCell (if d == 0 then False else True) cellType t ftype strval = if t == 0x27e || t == 0x0BD || t == 0x203 then diff --git a/lib/Data/XlsCell.hs b/lib/Data/XlsCell.hs new file mode 100644 index 0000000..60544f7 --- /dev/null +++ b/lib/Data/XlsCell.hs @@ -0,0 +1,35 @@ +-- | +-- Module : Data.XlsCell +-- Copyright : (c) 2022 Olaf.Klinke +-- +-- License : BSD-style +-- Maintainer : olaf.klinke@phymetric.de +-- Stability : experimental +-- Portability : GHC +-- +-- Static Excel cell values +-- +{-# LANGUAGE DeriveFunctor, FlexibleInstances #-} +module Data.XlsCell (CellF(..),Cell,cellToString) where +import Data.String (IsString(..)) +import Text.Printf (printf) + +-- | extensible 'Cell' type +data CellF o = NumericalCell Double + | TextCell String + | BoolCell Bool + | OtherCell o + deriving (Functor) +instance IsString (CellF o) where + fromString = TextCell + +-- | static 'Cell's in Excel files can hold +-- numbers, test or booleans. +type Cell = CellF () + +-- | convert to 'String'. Not the inverse of 'fromString'! +cellToString :: Cell -> String +cellToString (NumericalCell d) = printf "%.15g" d +cellToString (TextCell txt) = txt +cellToString (BoolCell b) = if b then "True" else "False" +cellToString (OtherCell _) = "" From 4e7a2ba639cc72678e4e40a06afd3c481d63f2e3 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:01:12 +0200 Subject: [PATCH 05/12] use cellToString --- bin/xls2csv.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/xls2csv.hs b/bin/xls2csv.hs index 990f4e4..1182d83 100755 --- a/bin/xls2csv.hs +++ b/bin/xls2csv.hs @@ -3,6 +3,7 @@ import Data.List (intercalate) import Data.Xls (decodeXlsIO) +import Data.XlsCell (cellToString) import WithCli (withCli) -- TODO need to escape the separator and the escaping quotes themselves @@ -10,7 +11,7 @@ import WithCli (withCli) xlsToCSV :: String -> IO () xlsToCSV file = do worksheets <- decodeXlsIO file - mapM_ (mapM_ (putStrLn . intercalate ",")) worksheets + mapM_ (mapM_ (putStrLn . intercalate "," . fmap cellToString)) worksheets main :: IO () main = withCli xlsToCSV From f25086e5d370d2f5846bb21cd678406dbb04cfd3 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 9 Nov 2022 15:34:15 +0100 Subject: [PATCH 06/12] adjust expected data to CellF type --- test/Spec.hs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/Spec.hs b/test/Spec.hs index b644fb7..9e25f75 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -1,4 +1,5 @@ import Data.Xls +import Data.XlsCell (CellF(..),Cell) import Test.Hspec main :: IO () @@ -7,5 +8,8 @@ main = hspec $ describe "Sanity check" $ do content <- decodeXlsIO "test/data/test.xls" content `shouldBe` testFileContent -testFileContent :: [[[String]]] -testFileContent = [[["1.000000000000000","2.3","text"]],[["1.000000000000000","2.3","text"]]] +testFileContent :: [[[Cell]]] +testFileContent = [ + [[NumericalCell 1.0,TextCell "2.3",TextCell "text"]], + [[NumericalCell 1.0,TextCell "2.3",TextCell "text"]] + ] From a76b586f8c12a1b13ecd3e62e7e65ac1fdd9b31c Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Wed, 9 Nov 2022 15:35:42 +0100 Subject: [PATCH 07/12] derive Show, Eq instances to CellF --- lib/Data/XlsCell.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Data/XlsCell.hs b/lib/Data/XlsCell.hs index 60544f7..d980ef2 100644 --- a/lib/Data/XlsCell.hs +++ b/lib/Data/XlsCell.hs @@ -19,7 +19,7 @@ data CellF o = NumericalCell Double | TextCell String | BoolCell Bool | OtherCell o - deriving (Functor) + deriving (Functor,Show,Eq) instance IsString (CellF o) where fromString = TextCell From 49059e299018338d175e86bb8644e2483ec15ce1 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Fri, 24 Mar 2023 14:17:33 +0100 Subject: [PATCH 08/12] make Data.XlsCell an internal module --- xls.cabal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xls.cabal b/xls.cabal index c9c8ca4..5208525 100644 --- a/xls.cabal +++ b/xls.cabal @@ -49,7 +49,7 @@ library hs-source-dirs: lib exposed-modules: Data.Xls - , Data.XlsCell + other-modules: Data.XlsCell build-depends: base >= 4.7 && < 5 , bytestring >= 0.10 , conduit >= 1.1 && < 1.4 From 92828dbf45348e754ba2804c9a110bf3f7381526 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Fri, 24 Mar 2023 14:18:15 +0100 Subject: [PATCH 09/12] re-export Data.XlsCell --- lib/Data/Xls.hs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/Data/Xls.hs b/lib/Data/Xls.hs index acf3e85..bc301ed 100644 --- a/lib/Data/Xls.hs +++ b/lib/Data/Xls.hs @@ -20,7 +20,12 @@ #endif module Data.Xls - ( decodeXlsIO + ( -- * Cell data type + CellF(..) + , Cell + , cellToString + -- * decoding Xls files + , decodeXlsIO , decodeXlsByteString , decodeXlsByteString' , decodeXls @@ -162,9 +167,9 @@ decodeXls file = count <- liftIO $ c_xls_wb_sheetcount pWB mapM_ (decodeOneWorkSheet file pWB) [0 .. count - 1] --- | A work-around via temporary files and 'decodeXlsIO', --- since this lib is lacking a pure function to decode the contents --- of an XLS file. +-- | A work-around via temporary files and 'decodeXlsIO'. +-- Since this library lacks a pure function to decode from a buffer, +-- we just write the buffer to a temporary file and decode the file. -- Due to Erik Rybakken. decodeXlsByteString :: ByteString -> IO [[[Cell]]] decodeXlsByteString content = withSystemTempFile "decodeXlsByteString" From f56ee77031f8e44f14acfac9dcf1000087f54e50 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Fri, 24 Mar 2023 14:19:51 +0100 Subject: [PATCH 10/12] import cellToString from Data.Xls --- bin/xls2csv.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/xls2csv.hs b/bin/xls2csv.hs index 1182d83..7ec43b1 100755 --- a/bin/xls2csv.hs +++ b/bin/xls2csv.hs @@ -2,8 +2,7 @@ -- stack --resolver lts runhaskell --package getopt-generics import Data.List (intercalate) -import Data.Xls (decodeXlsIO) -import Data.XlsCell (cellToString) +import Data.Xls (decodeXlsIO,cellToString) import WithCli (withCli) -- TODO need to escape the separator and the escaping quotes themselves From 63a87cafa4541f301191c63c03dfd4d15adc1f50 Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Fri, 24 Mar 2023 14:31:07 +0100 Subject: [PATCH 11/12] remove import of now internal module Data.XlsCell --- test/Spec.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Spec.hs b/test/Spec.hs index 9e25f75..5f5ba22 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -1,5 +1,4 @@ import Data.Xls -import Data.XlsCell (CellF(..),Cell) import Test.Hspec main :: IO () From 068856dd078826f59e3ee3fb2657feafa7e43d9f Mon Sep 17 00:00:00 2001 From: olafklinke <84761329+olafklinke@users.noreply.github.com> Date: Fri, 24 Mar 2023 14:33:27 +0100 Subject: [PATCH 12/12] make XLSError an instance of Exception --- lib/Data/Xls.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Data/Xls.hs b/lib/Data/Xls.hs index bc301ed..7c9188f 100644 --- a/lib/Data/Xls.hs +++ b/lib/Data/Xls.hs @@ -72,6 +72,8 @@ instance Storable XLSError where alignment = sizeOf -- okay for simple Storables peek = fmap (toEnum . (fromIntegral :: Word32 -> Int)) . peek . castPtr poke ptr e = poke (castPtr ptr) ((fromIntegral :: Int -> Word32).fromEnum $ e) +instance Exception XLSError where + type XLSWorkbook = Ptr XLSWorkbookStruct type XLSErrorT = Ptr XLSError type CBuffer = Ptr CUChar