From 443e7b7acc3b768d1fa2302360850d04953433d7 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Fri, 8 Nov 2024 19:54:15 +0100 Subject: [PATCH] adapted genoconvert and forge --- src-executables/Main-trident.hs | 2 + src/Poseidon/CLI/Genoconvert.hs | 151 +++++++++++------- .../CLI/OptparseApplicativeParsers.hs | 7 + .../GoldenTestsRunCommands.hs | 26 +++ 4 files changed, 124 insertions(+), 62 deletions(-) diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs index aa2ff901..8971ab5c 100644 --- a/src-executables/Main-trident.hs +++ b/src-executables/Main-trident.hs @@ -211,6 +211,7 @@ forgeOptParser = ForgeOptions <$> parseGenoDataSources <*> parseIntersect <*> parseOutGenotypeFormat True <*> parseForgeOutMode + <*> parseZipOut <*> parseOutPackagePath <*> parseMaybeOutPackageName <*> parsePackageWise @@ -225,6 +226,7 @@ genoconvertOptParser = GenoconvertOptions <$> parseGenoDataSources <*> parseRemoveOld <*> parseOutputPlinkPopMode <*> parseOnlyLatest + <*> parseZipOut summariseOptParser :: OP.Parser SummariseOptions summariseOptParser = SummariseOptions <$> parseBasePaths diff --git a/src/Poseidon/CLI/Genoconvert.hs b/src/Poseidon/CLI/Genoconvert.hs index 71ed0557..5ff1f8f0 100644 --- a/src/Poseidon/CLI/Genoconvert.hs +++ b/src/Poseidon/CLI/Genoconvert.hs @@ -21,6 +21,7 @@ import Poseidon.Utils (PoseidonException (..), PoseidonIO, import Control.Exception (catch, throwIO) import Control.Monad (unless, when) +import Data.List ((\\)) import Data.Maybe (isJust) import Data.Time (getCurrentTime) import Pipes (MonadIO (liftIO), runEffect, (>->)) @@ -29,7 +30,8 @@ import SequenceFormats.Eigenstrat (writeEigenstrat) import SequenceFormats.Plink (PlinkPopNameMode, eigenstratInd2PlinkFam, writePlink) import System.Directory (createDirectoryIfMissing, - doesFileExist, removeFile) + doesFileExist, removeFile, + renameFile) import System.FilePath (dropTrailingPathSeparator, (<.>), ()) @@ -46,7 +48,8 @@ data GenoconvertOptions = GenoconvertOptions } runGenoconvert :: GenoconvertOptions -> PoseidonIO () -runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath removeOld outPlinkPopMode onlyLatest) = do +runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath + removeOld outPlinkPopMode onlyLatest outZip) = do let pacReadOpts = defaultPackageReadOptions { _readOptIgnoreChecksums = True , _readOptIgnoreGeno = False @@ -54,81 +57,105 @@ runGenoconvert (GenoconvertOptions genoSources outFormat onlyGeno outPath remove , _readOptOnlyLatest = onlyLatest } -- load packages - properPackages <- readPoseidonPackageCollection pacReadOpts $ [getPacBaseDir x | x@PacBaseDir {} <- genoSources] - pseudoPackages <- mapM makePseudoPackageFromGenotypeData [getGenoDirect x | x@GenoDirect {} <- genoSources] + properPackages <- readPoseidonPackageCollection pacReadOpts $ + [getPacBaseDir x | x@PacBaseDir {} <- genoSources] + pseudoPackages <- mapM makePseudoPackageFromGenotypeData + [getGenoDirect x | x@GenoDirect {} <- genoSources] logInfo $ "Unpackaged genotype data files loaded: " ++ show (length pseudoPackages) -- convert - mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode) properPackages - mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode) pseudoPackages + mapM_ (convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip) properPackages + mapM_ (convertGenoTo outFormat True outPath removeOld outPlinkPopMode outZip) pseudoPackages convertGenoTo :: String -> Bool -> Maybe FilePath -> Bool -> - PlinkPopNameMode -> PoseidonPackage -> PoseidonIO () -convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode pac = do + PlinkPopNameMode -> Bool -> PoseidonPackage -> PoseidonIO () +convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip pac = do -- start message logInfo $ "Converting genotype data in " ++ show (posPacNameAndVersion pac) ++ " to format " ++ show outFormat - ++ ":" + ++ if outZip then "(gzipped):" else ":" -- compile file names paths let outName = getPacName . posPacNameAndVersion $ pac (outInd, outSnp, outGeno) <- case (outFormat, outZip) of - ("EIGENSTRAT", False) -> return (outName <.> ".ind", outName <.> ".snp" , outName <.> ".geno" ) - ("EIGENSTRAT", True ) -> return (outName <.> ".ind", outName <.> ".snp.gz", outName <.> ".geno.gz") - ("PLINK", False) -> return (outName <.> ".fam", outName <.> ".bim" , outName <.> ".bed" ) - ("PLINK", True ) -> return (outName <.> ".fam", outName <.> ".bim.gz", outName <.> ".bed.gz" ) - _ -> liftIO . throwIO $ - PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") - -- check if genotype data needs conversion - if getFormat (genotypeFileSpec (posPacGenotypeData pac)) == (outFormat, outZip) - then logWarning "The genotype data is already in the requested format" + ("EIGENSTRAT", False) -> return + (outName <.> ".ind", outName <.> ".snp" , outName <.> ".geno" ) + ("EIGENSTRAT", True ) -> return + (outName <.> ".ind", outName <.> ".snp.gz", outName <.> ".geno.gz") + ("PLINK", False) -> return + (outName <.> ".fam", outName <.> ".bim" , outName <.> ".bed" ) + ("PLINK", True ) -> return + (outName <.> ".fam", outName <.> ".bim.gz", outName <.> ".bed.gz" ) + _ -> liftIO . throwIO . PoseidonGenericException $ + "Illegal outFormat " ++ outFormat ++ + ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment" + -- create new genotype data files + newBaseDir <- case outPath of + Just x -> do + -- create new directory + logInfo $ "Writing to directory (will be created if missing): " ++ x + liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x) + return x + Nothing -> return $ posPacBaseDir pac + let (outG, outS, outI) = (newBaseDir outGeno, newBaseDir outSnp, newBaseDir outInd) + allExists <- and <$> mapM checkFile [outG, outS, outI] + if allExists + then logWarning $ "Package already in desired file-type, skipping genotype conversion for " ++ + show (posPacNameAndVersion pac) else do - -- create new genotype data files - newBaseDir <- case outPath of - Just x -> do - -- create new directory - logInfo $ "Writing to directory (will be created if missing): " ++ x - liftIO $ createDirectoryIfMissing True (dropTrailingPathSeparator x) - return x - Nothing -> return $ posPacBaseDir pac - let (outG, outS, outI) = (newBaseDir outGeno, newBaseDir outSnp, newBaseDir outInd) - anyExists <- or <$> mapM checkFile [outG, outS, outI] - if anyExists - then logWarning ("skipping genotype conversion for " ++ show (posPacNameAndVersion pac)) - else do - logInfo "Processing SNPs..." - logA <- envLogAction - currentTime <- liftIO getCurrentTime - errLength <- envErrorLength - let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac - liftIO $ catch ( - runSafeT $ do - eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) - let outConsumer = case outFormat of - "EIGENSTRAT" -> writeEigenstrat outG outS outI eigenstratIndEntries - "PLINK" -> writePlink outG outS outI (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries) - _ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") - runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer - ) (throwIO . PoseidonGenotypeExceptionForward errLength) - logInfo "Done" - -- overwrite genotype data field in POSEIDON.yml file - unless (onlyGeno || isJust outPath) $ do - gFileSpec <- case outFormat of - "EIGENSTRAT" -> return $ GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing - "PLINK" -> return $ GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing - _ -> liftIO . throwIO $ PoseidonGenericException ("Illegal outFormat " ++ outFormat ++ ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment") - let genotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac) - newPac = pac { posPacGenotypeData = genotypeData } - logInfo "Adjusting POSEIDON.yml..." - liftIO $ writePoseidonPackage newPac - -- delete now replaced input genotype data - let filesToDelete = case genotypeFileSpec . posPacGenotypeData $ pac of - GenotypeEigenstrat g _ s _ i _ -> [g, s, i] - GenotypePlink g _ s _ i _ -> [g, s, i] - GenotypeVCF g _ -> [g] - when removeOld . liftIO . mapM_ (removeFile . (posPacBaseDir pac )) $ filesToDelete + logInfo "Processing SNPs..." + logA <- envLogAction + currentTime <- liftIO getCurrentTime + errLength <- envErrorLength + let eigenstratIndEntries = jannoRows2EigenstratIndEntries . posPacJanno $ pac + liftIO $ catch ( + runSafeT $ do + eigenstratProd <- loadGenotypeData (posPacBaseDir pac) (posPacGenotypeData pac) + let outConsumer = case outFormat of + "EIGENSTRAT" -> writeEigenstrat (outG ++ ".gconvert") + (outS ++ ".gconvert") + (outI ++ ".gconvert") + eigenstratIndEntries + "PLINK" -> writePlink (outG ++ ".gconvert") + (outS ++ ".gconvert") + (outI ++ ".gconvert") + (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries) + _ -> liftIO . throwIO . PoseidonGenericException $ + "Illegal outFormat " ++ outFormat ++ + ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment" + runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer + ) (throwIO . PoseidonGenotypeExceptionForward errLength) + -- the following will just override if the file already exists. + liftIO $ renameFile (outG ++ ".gconvert") outG + liftIO $ renameFile (outS ++ ".gconvert") outS + liftIO $ renameFile (outI ++ ".gconvert") outI + logInfo "Done" + -- overwrite genotype data field in POSEIDON.yml file + unless (onlyGeno || isJust outPath) $ do + gFileSpec <- case outFormat of + "EIGENSTRAT" -> return $ + GenotypeEigenstrat outGeno Nothing outSnp Nothing outInd Nothing + "PLINK" -> return $ + GenotypePlink outGeno Nothing outSnp Nothing outInd Nothing + _ -> liftIO . throwIO . PoseidonGenericException $ + "Illegal outFormat " ++ outFormat ++ + ". Only Outformats EIGENSTRAT or PLINK are allowed at the moment" + let oldGenotypeData = GenotypeDataSpec gFileSpec (genotypeSnpSet . posPacGenotypeData $ pac) + newPac = pac { posPacGenotypeData = oldGenotypeData } + logInfo "Adjusting POSEIDON.yml..." + liftIO $ writePoseidonPackage newPac + -- delete now replaced input genotype data + let oldBaseDir = posPacBaseDir pac + oldGenoFiles <- case genotypeFileSpec . posPacGenotypeData $ pac of + GenotypeEigenstrat g _ s _ i _ -> return [oldBaseDir g, oldBaseDir s, oldBaseDir i] + GenotypePlink g _ s _ i _ -> return [oldBaseDir g, oldBaseDir s, oldBaseDir i] + GenotypeVCF g _ -> return [oldBaseDir g] + let newGenoFiles = [outG, outS, outI] + + let filesToDelete = oldGenoFiles \\ newGenoFiles + when removeOld . liftIO . mapM_ removeFile $ filesToDelete where checkFile :: FilePath -> PoseidonIO Bool checkFile fn = do diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index f260eb07..6e3eb48f 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -899,3 +899,10 @@ parseOutputOrdered = OP.switch ( OP.long "ordered" <> OP.help "With this option, the output of forge is ordered according to the entities given." ) + +parseZipOut :: OP.Parser Bool +parseZipOut = OP.switch ( + OP.long "zip" <> + OP.short 'z' <> + OP.help "whether the resultung genotype- and snp-files should be gzipped" + ) \ No newline at end of file diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index af6fce0f..c748c248 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -422,6 +422,7 @@ testPipelineGenoconvert testDir checkFilePath = do , _genoconvertRemoveOld = False , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily , _genoconvertOnlyLatest = False + , _genoconvertOutZip = False } runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts1) "genoconvert" [ "genoconvert" "Schiffels" "Schiffels_2016.bed" @@ -437,6 +438,7 @@ testPipelineGenoconvert testDir checkFilePath = do , _genoconvertRemoveOld = False , _genoconvertOutPlinkPopMode = PlinkPopNameAsPhenotype , _genoconvertOnlyLatest = False + , _genoconvertOutZip = False } runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts2) "genoconvert" [ "genoconvert" "Schiffels_otherPlinkEncoding" "Schiffels_2016.bed" @@ -453,6 +455,7 @@ testPipelineGenoconvert testDir checkFilePath = do , _genoconvertRemoveOld = False , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily , _genoconvertOnlyLatest = False + , _genoconvertOutZip = False } runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts3) "genoconvert" [ "init" "Wang" "Wang.geno" @@ -481,6 +484,7 @@ testPipelineGenoconvert testDir checkFilePath = do , _genoconvertRemoveOld = False , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily , _genoconvertOnlyLatest = False + , _genoconvertOutZip = False } runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts4) "genoconvert" [ "init" "Schiffels" "geno.bed" @@ -505,6 +509,7 @@ testPipelineGenoconvert testDir checkFilePath = do , _genoconvertRemoveOld = False , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily , _genoconvertOnlyLatest = False + , _genoconvertOutZip = False } runAndChecksumFiles checkFilePath testDir (testLog $ runGenoconvert genoconvertOpts5) "genoconvert" [ "init_vcf" "Schiffels_vcf" "geno.bed" @@ -570,6 +575,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac1" , _forgeOutPacName = Just "ForgePac1" , _forgePackageWise = False @@ -593,6 +599,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac1_vcf" , _forgeOutPacName = Just "ForgePac1_vcf" , _forgePackageWise = False @@ -616,6 +623,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "PLINK" , _forgeOutMode = MinimalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac2" , _forgeOutPacName = Nothing , _forgePackageWise = False @@ -636,6 +644,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac3" , _forgeOutPacName = Nothing , _forgePackageWise = False @@ -660,6 +669,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "PLINK" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac4" , _forgeOutPacName = Nothing , _forgePackageWise = False @@ -683,6 +693,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac5" , _forgeOutPacName = Just "ForgePac5" , _forgePackageWise = False @@ -730,6 +741,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = GenoOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac6" , _forgeOutPacName = Just "ForgePac6" , _forgePackageWise = False @@ -765,6 +777,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac7" , _forgeOutPacName = Just "ForgePac7" , _forgePackageWise = False @@ -788,6 +801,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac8" , _forgeOutPacName = Just "ForgePac8" , _forgePackageWise = False @@ -808,6 +822,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac9" , _forgeOutPacName = Just "ForgePac9" , _forgePackageWise = False @@ -829,6 +844,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac10" , _forgeOutPacName = Just "ForgePac10" , _forgePackageWise = False @@ -851,6 +867,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac11" , _forgeOutPacName = Just "ForgePac11" , _forgePackageWise = True @@ -872,6 +889,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac12" , _forgeOutPacName = Just "ForgePac12" , _forgePackageWise = False @@ -891,6 +909,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac13" , _forgeOutPacName = Just "ForgePac13" , _forgePackageWise = False @@ -911,6 +930,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac14" , _forgeOutPacName = Just "ForgePac14" , _forgePackageWise = False @@ -931,6 +951,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac15" , _forgeOutPacName = Just "ForgePac15" , _forgePackageWise = False @@ -951,6 +972,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac16" , _forgeOutPacName = Just "ForgePac16" , _forgePackageWise = False @@ -971,6 +993,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "EIGENSTRAT" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac17" , _forgeOutPacName = Just "ForgePac17" , _forgePackageWise = False @@ -989,6 +1012,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "PLINK" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac18" , _forgeOutPacName = Just "ForgePac18" , _forgePackageWise = False @@ -1009,6 +1033,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "PLINK" , _forgeOutMode = PreservePymlOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac19" , _forgeOutPacName = Just "ForgePac19" , _forgePackageWise = False @@ -1038,6 +1063,7 @@ testPipelineForge testDir checkFilePath = do , _forgeIntersect = False , _forgeOutFormat = "PLINK" , _forgeOutMode = NormalOut + , _forgeOutZip = False , _forgeOutPacPath = testDir "forge" "ForgePac20" , _forgeOutPacName = Just "ForgePac20" , _forgePackageWise = False