Skip to content

Commit

Permalink
adapted genoconvert and forge
Browse files Browse the repository at this point in the history
  • Loading branch information
stschiff committed Nov 8, 2024
1 parent ac97fdf commit 443e7b7
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 62 deletions.
2 changes: 2 additions & 0 deletions src-executables/Main-trident.hs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ forgeOptParser = ForgeOptions <$> parseGenoDataSources
<*> parseIntersect
<*> parseOutGenotypeFormat True
<*> parseForgeOutMode
<*> parseZipOut
<*> parseOutPackagePath
<*> parseMaybeOutPackageName
<*> parsePackageWise
Expand All @@ -225,6 +226,7 @@ genoconvertOptParser = GenoconvertOptions <$> parseGenoDataSources
<*> parseRemoveOld
<*> parseOutputPlinkPopMode
<*> parseOnlyLatest
<*> parseZipOut

summariseOptParser :: OP.Parser SummariseOptions
summariseOptParser = SummariseOptions <$> parseBasePaths
Expand Down
151 changes: 89 additions & 62 deletions src/Poseidon/CLI/Genoconvert.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import Poseidon.Utils (PoseidonException (..), PoseidonIO,

import Control.Exception (catch, throwIO)
import Control.Monad (unless, when)
import Data.List ((\\))
import Data.Maybe (isJust)
import Data.Time (getCurrentTime)
import Pipes (MonadIO (liftIO), runEffect, (>->))
Expand All @@ -29,7 +30,8 @@ import SequenceFormats.Eigenstrat (writeEigenstrat)
import SequenceFormats.Plink (PlinkPopNameMode,
eigenstratInd2PlinkFam, writePlink)
import System.Directory (createDirectoryIfMissing,
doesFileExist, removeFile)
doesFileExist, removeFile,
renameFile)
import System.FilePath (dropTrailingPathSeparator, (<.>),
(</>))

Expand All @@ -46,89 +48,114 @@ data GenoconvertOptions = GenoconvertOptions
}

runGenoconvert :: GenoconvertOptions -> PoseidonIO ()
runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath removeOld outPlinkPopMode onlyLatest) = do
runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath
removeOld outPlinkPopMode onlyLatest outZip) = do
let pacReadOpts = defaultPackageReadOptions {
_readOptIgnoreChecksums = True
, _readOptIgnoreGeno = False
, _readOptGenoCheck = True
, _readOptOnlyLatest = onlyLatest
}
-- load packages
properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDir x | x@PacBaseDir {} <- genoSources]
pseudoPackages <- mapM makePseudoPackageFromGenotypeData [getGenoDirect x | x@GenoDirect {} <- genoSources]
properPackages <- readPoseidonPackageCollection pacReadOpts $
[getPacBaseDir x | x@PacBaseDir {} <- genoSources]
pseudoPackages <- mapM makePseudoPackageFromGenotypeData
[getGenoDirect x | x@GenoDirect {} <- genoSources]

logInfo $ "Unpackaged genotype data files loaded: " ++ show (length pseudoPackages)
-- convert
mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode) properPackages
mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode) pseudoPackages
mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip) properPackages
mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode outZip) pseudoPackages

convertGenoTo :: String -> Bool -> Maybe FilePath -> Bool ->
PlinkPopNameMode -> PoseidonPackage -> PoseidonIO ()
convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode pac = do
PlinkPopNameMode -> Bool -> PoseidonPackage -> PoseidonIO ()
convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip pac = do
-- start message
logInfo $
"Converting genotype data in "
++ show (posPacNameAndVersion pac)
++ " to format "
++ show outFormat
++ ":"
++ if outZip then "(gzipped):" else ":"
-- compile file names paths
let outName = getPacName . posPacNameAndVersion $ pac
(outInd, outSnp, outGeno) <- case (outFormat, outZip) of
("EIGENSTRAT", False) -> return (outName <.> ".ind", outName <.> ".snp" , outName <.> ".geno" )
("EIGENSTRAT", True ) -> return (outName <.> ".ind", outName <.> ".snp.gz", outName <.> ".geno.gz")
("PLINK", False) -> return (outName <.> ".fam", outName <.> ".bim" , outName <.> ".bed" )
("PLINK", True ) -> return (outName <.> ".fam", outName <.> ".bim.gz", outName <.> ".bed.gz" )
_ -> liftIO . throwIO $
PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
-- check if genotype data needs conversion
if getFormat (genotypeFileSpec (posPacGenotypeData pac)) == (outFormat, outZip)
then logWarning "The genotype data is already in the requested format"
("EIGENSTRAT", False) -> return
(outName <.> ".ind", outName <.> ".snp" , outName <.> ".geno" )
("EIGENSTRAT", True ) -> return
(outName <.> ".ind", outName <.> ".snp.gz", outName <.> ".geno.gz")
("PLINK", False) -> return
(outName <.> ".fam", outName <.> ".bim" , outName <.> ".bed" )
("PLINK", True ) -> return
(outName <.> ".fam", outName <.> ".bim.gz", outName <.> ".bed.gz" )
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"
-- create new genotype data files
newBaseDir <- case outPath of
Just x -> do
-- create new directory
logInfo $ "Writing to directory (will be created if missing): " ++ x
liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x)
return x
Nothing -> return $ posPacBaseDir pac
let (outG, outS, outI) = (newBaseDir </> outGeno, newBaseDir </> outSnp, newBaseDir </> outInd)
allExists <- and <$> mapM checkFile [outG, outS, outI]
if allExists
then logWarning $ "Package already in desired file-type, skipping genotype conversion for " ++
show (posPacNameAndVersion pac)
else do
-- create new genotype data files
newBaseDir <- case outPath of
Just x -> do
-- create new directory
logInfo $ "Writing to directory (will be created if missing): " ++ x
liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x)
return x
Nothing -> return $ posPacBaseDir pac
let (outG, outS, outI) = (newBaseDir </> outGeno, newBaseDir </> outSnp, newBaseDir </> outInd)
anyExists <- or <$> mapM checkFile [outG, outS, outI]
if anyExists
then logWarning ("skipping genotype conversion for " ++ show (posPacNameAndVersion pac))
else do
logInfo "Processing SNPs..."
logA <- envLogAction
currentTime <- liftIO getCurrentTime
errLength <- envErrorLength
let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac
liftIO $ catch (
runSafeT $ do
eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac)
let outConsumer = case outFormat of
"EIGENSTRAT" -> writeEigenstrat outG outS outI eigenstratIndEntries
"PLINK" -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries)
_ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer
) (throwIO . PoseidonGenotypeExceptionForward errLength)
logInfo "Done"
-- overwrite genotype data field in POSEIDON.yml file
unless (onlyGeno || isJust outPath) $ do
gFileSpec <- case outFormat of
"EIGENSTRAT" -> return $ GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing
"PLINK" -> return $ GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing
_ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment")
let genotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac)
newPac = pac { posPacGenotypeData = genotypeData }
logInfo "Adjusting POSEIDON.yml..."
liftIO $ writePoseidonPackage newPac
-- delete now replaced input genotype data
let filesToDelete = case genotypeFileSpec . posPacGenotypeData $ pac of
GenotypeEigenstrat g _ s _ i _ -> [g, s, i]
GenotypePlink g _ s _ i _ -> [g, s, i]
GenotypeVCF g _ -> [g]
when removeOld . liftIO . mapM_ (removeFile . (posPacBaseDir pac </>)) $ filesToDelete
logInfo "Processing SNPs..."
logA <- envLogAction
currentTime <- liftIO getCurrentTime
errLength <- envErrorLength
let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac
liftIO $ catch (
runSafeT $ do
eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac)
let outConsumer = case outFormat of
"EIGENSTRAT" -> writeEigenstrat (outG ++ ".gconvert")
(outS ++ ".gconvert")
(outI ++ ".gconvert")
eigenstratIndEntries
"PLINK" -> writePlink (outG ++ ".gconvert")
(outS ++ ".gconvert")
(outI ++ ".gconvert")
(map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries)
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"
runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer
) (throwIO . PoseidonGenotypeExceptionForward errLength)
-- the following will just override if the file already exists.
liftIO $ renameFile (outG ++ ".gconvert") outG
liftIO $ renameFile (outS ++ ".gconvert") outS
liftIO $ renameFile (outI ++ ".gconvert") outI
logInfo "Done"
-- overwrite genotype data field in POSEIDON.yml file
unless (onlyGeno || isJust outPath) $ do
gFileSpec <- case outFormat of
"EIGENSTRAT" -> return $
GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing
"PLINK" -> return $
GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing
_ -> liftIO . throwIO . PoseidonGenericException $
"Illegal outFormat " ++ outFormat ++
". Only Outformats EIGENSTRAT or PLINK are allowed at the moment"
let oldGenotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac)
newPac = pac { posPacGenotypeData = oldGenotypeData }
logInfo "Adjusting POSEIDON.yml..."
liftIO $ writePoseidonPackage newPac
-- delete now replaced input genotype data
let oldBaseDir = posPacBaseDir pac
oldGenoFiles <- case genotypeFileSpec . posPacGenotypeData $ pac of
GenotypeEigenstrat g _ s _ i _ -> return [oldBaseDir </> g, oldBaseDir </> s, oldBaseDir </> i]
GenotypePlink g _ s _ i _ -> return [oldBaseDir </> g, oldBaseDir </> s, oldBaseDir </> i]
GenotypeVCF g _ -> return [oldBaseDir </> g]
let newGenoFiles = [outG, outS, outI]

let filesToDelete = oldGenoFiles \\ newGenoFiles
when removeOld . liftIO . mapM_ removeFile $ filesToDelete
where
checkFile :: FilePath -> PoseidonIO Bool
checkFile fn = do
Expand Down
7 changes: 7 additions & 0 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -899,3 +899,10 @@ parseOutputOrdered = OP.switch (
OP.long "ordered" <>
OP.help "With this option, the output of forge is ordered according to the entities given."
)

parseZipOut :: OP.Parser Bool
parseZipOut = OP.switch (
OP.long "zip" <>
OP.short 'z' <>
OP.help "whether the resultung genotype- and snp-files should be gzipped"
)
Loading

0 comments on commit 443e7b7

Please sign in to comment.