diff --git a/.travis.yml b/.travis.yml index 0ff5933..0b8bbfb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,16 +3,15 @@ language: julia os: - linux - osx +env: + - DATADEPS_ALWAYS_ACCEPT=true julia: # 1.0 should also work, but Pkg.test hit some chmod issues on 1.0 in docker containers - 1.3 + - 1.5 - nightly notifications: email: false -matrix: - fast_finish: true - allow_failures: - - julia: nightly # uncomment the following lines to override the default test script #script: # - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi @@ -20,9 +19,12 @@ matrix: after_success: - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' jobs: + fast_finish: true + allow_failures: + - julia: nightly include: - stage: "Documentation" - julia: 1.0 + julia: 1.5 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' diff --git a/Project.toml b/Project.toml index e58ad5a..80e8eae 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,14 @@ authors = ["Invenia Technical Computing"] version = "0.6.0" [deps] +BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -16,10 +20,14 @@ TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -AxisKeys = "0.1.5" +AxisKeys = "0.1" +BSON = "0.2" +CSV = "0.6, 0.7" +DataDeps = "0.7" Distances = "0.8, 0.9" IterTools = "1.2, 1.3" Missings = "0.4" +NamedDims = "0.2" NearestNeighbors = 
"0.4" StatsBase = "0.32" TableOperations = "0.2" @@ -33,8 +41,8 @@ Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["AxisArrays", "AxisKeys", "Combinatorics", "DataFrames", "Dates", "Distances", "RDatasets", "Test"] +test = ["AxisArrays", "AxisKeys", "Combinatorics", "DataFrames", "Dates", "Distances", "Documenter", "Test"] diff --git a/README.md b/README.md index 3741155..ba6d643 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,13 @@ julia> using Pkg; Pkg.add("Impute") ## Quickstart Let's start by loading our dependencies: ```julia -julia> using DataFrames, RDatasets, Impute +julia> using DataFrames, Impute ``` We'll also want some test data containing missings to work with: ```julia -julia> df = dataset("boot", "neuro") +julia> df = Impute.dataset("test/table/neuro") |> DataFrame 469×6 DataFrames.DataFrame │ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ │ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ @@ -47,7 +47,7 @@ julia> df = dataset("boot", "neuro") Our first instinct might be to drop all observations, but this leaves us too few rows to work with: ```julia -julia> Impute.drop(df) +julia> Impute.filter(df; dims=:rows) 4×6 DataFrames.DataFrame │ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ │ │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000..857ff1f --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,446 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = 
"0.5.0" + +[[AxisKeys]] +deps = ["AbstractFFTs", "IntervalSets", "InvertedIndices", "LazyStack", "LinearAlgebra", "NamedDims", "OffsetArrays", "Statistics", "Tables"] +git-tree-sha1 = "f3a35fff6784dc24c17fd6351b6b41c4580bcd5a" +uuid = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" +version = "0.1.6" + +[[BSON]] +git-tree-sha1 = "dd36d7cf3d185eeaaf64db902c15174b22f5dafb" +uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +version = "0.2.6" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[CSV]] +deps = ["CategoricalArrays", "DataFrames", "Dates", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode"] +git-tree-sha1 = "a390152e6850405a48ca51bd7ca33d11a21d6230" +uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +version = "0.7.7" + +[[CategoricalArrays]] +deps = ["DataAPI", "Future", "JSON", "Missings", "Printf", "Statistics", "StructTypes", "Unicode"] +git-tree-sha1 = "2ac27f59196a68070e132b25713f9a5bbc5fa0d2" +uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" +version = "0.8.3" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.9.1" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.11.2" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "cf03b37436c6bc162e7c8943001568b4cad4bee3" +uuid = 
"34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.19.0" + +[[Contour]] +deps = ["StaticArrays"] +git-tree-sha1 = "d05a3a25b762720d40246d5bedf518c9c2614ef5" +uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" +version = "0.5.5" + +[[DataAPI]] +git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.3.0" + +[[DataDeps]] +deps = ["HTTP", "Reexport", "SHA"] +git-tree-sha1 = "f2be642d7a94e7f0cabcd2106fee4c6715d452d1" +uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +version = "0.7.2" + +[[DataFrames]] +deps = ["CategoricalArrays", "Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "Missings", "PooledArrays", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "a7c1c9a6e47a92321bbc9d500dab9b04cc4a6a39" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "0.21.7" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "88d48e133e6d3dd68183309877eac74393daa7eb" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.20" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics"] +git-tree-sha1 = "a5b88815e6984e9f3256b6ca0dc63109b16a506f" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.9.2" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.3" + +[[Documenter]] +deps = ["Base64", 
"DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Unicode"] +git-tree-sha1 = "580155ffaeb175f37dc0bd31ed6c127663efbc60" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.22.6" + +[[EllipsisNotation]] +git-tree-sha1 = "65dad386e877850e6fce4fc77f60fe75a468ce9d" +uuid = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" +version = "0.4.0" + +[[FFMPEG]] +deps = ["BinaryProvider", "Libdl"] +git-tree-sha1 = "9143266ba77d3313a4cf61d8333a1970e8c5d8b6" +uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" +version = "0.2.4" + +[[FixedPointNumbers]] +git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.6.1" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GR]] +deps = ["Base64", "DelimitedFiles", "LinearAlgebra", "Printf", "Random", "Serialization", "Sockets", "Test"] +git-tree-sha1 = "c690c2ab22ac9ee323d9966deae61a089362b25c" +uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" +version = "0.44.0" + +[[GeometryTypes]] +deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "StaticArrays"] +git-tree-sha1 = "78f0ce9d01993b637a8f28d84537d75dc0ce8eef" +uuid = "4d00f742-c7ba-57c2-abde-4428a4b178cb" +version = "0.7.10" + +[[Grisu]] +git-tree-sha1 = "03d381f65183cb2d0af8b3425fde97263ce9a995" +uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" +version = "1.0.0" + +[[HTTP]] +deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"] +git-tree-sha1 = "c7ec02c4c6a039a98a15f955462cd7aea5df4508" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "0.8.19" + +[[Impute]] +deps = ["BSON", "CSV", "DataDeps", "Distances", "IterTools", "LinearAlgebra", "Missings", "NamedDims", "NearestNeighbors", "Random", "Statistics", "StatsBase", "TableOperations", "Tables"] +path = "/Users/rory/repos/invenia/Impute.jl" +uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575" +version = "0.6.0" + +[[IniFile]] +deps = ["Test"] +git-tree-sha1 = 
"098e4d2c533924c921f9f9847274f2ad89e018b8" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IntervalSets]] +deps = ["Dates", "EllipsisNotation", "Statistics"] +git-tree-sha1 = "3b1cef135bc532b3c3401b309e1b8a2a2ba26af5" +uuid = "8197267c-284f-5f27-9208-e0e47529a953" +version = "0.5.1" + +[[InvertedIndices]] +deps = ["Test"] +git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.0.0" + +[[IterTools]] +git-tree-sha1 = "05110a2ab1fc5f932622ffea2a003221f4782c18" +uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +version = "1.3.0" + +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + +[[JSON]] +deps = ["Dates", "Mmap", "Unicode"] +git-tree-sha1 = "565947e5338efe62a7db0aa8e5de782c623b04cd" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.20.1" + +[[LazyStack]] +deps = ["LinearAlgebra", "NamedDims", "OffsetArrays", "Test", "ZygoteRules"] +git-tree-sha1 = "a8bf67afad3f1ee59d367267adb7c44ccac7fdee" +uuid = "1fad7336-0346-5a1a-a56f-a06ba010965b" +version = "0.0.7" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "f7d2e3f654af75f01ec49be82c231c382214223a" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.5" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] +git-tree-sha1 = "426a6978b03a97ceb7ead77775a1da066343ec6e" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = 
"1.0.2" + +[[MbedTLS_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "c0b1286883cac4e2b617539de41111e0776d02e8" +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.16.8+0" + +[[Measures]] +git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f" +uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" +version = "0.3.1" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "ed61674a0864832495ffe0a7e889c0da76b0f4c8" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NaNMath]] +git-tree-sha1 = "c84c576296d0e2fbb3fc134d3e09086b3ea617cd" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.4" + +[[NamedDims]] +deps = ["LinearAlgebra", "Pkg", "Requires", "Statistics"] +git-tree-sha1 = "263f7305bfa5b8b69cd3239ec6a4c037ff96b3df" +uuid = "356022a1-0364-5f58-8944-0da4b18d706f" +version = "0.2.28" + +[[NearestNeighbors]] +deps = ["Distances", "StaticArrays"] +git-tree-sha1 = "93107e3cdada73d63245ed8170dcae680f0c8fd8" +uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +version = "0.4.6" + +[[OffsetArrays]] +git-tree-sha1 = "3fdfca8a532507d65f39ff0ad34fe81097a55337" +uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +version = "1.3.0" + +[[OrderedCollections]] +git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.3.1" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "6fa4202675c05ba0f8268a6ddf07606350eda3ce" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.0.11" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PlotThemes]] +deps = ["PlotUtils", "Requires", "Statistics"] +git-tree-sha1 = "d2f3a41081a72815f5c59eacdc8046237a7cbe12" +uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" +version = "0.4.0" + +[[PlotUtils]] +deps = ["Colors", "Dates", "Printf", "Random", "Reexport"] +git-tree-sha1 = 
"51e742162c97d35f714f9611619db6975e19384b" +uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" +version = "0.6.5" + +[[Plots]] +deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryTypes", "JSON", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "Reexport", "Requires", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"] +git-tree-sha1 = "11c75a31269c1c64790e7cb910346f64cd4440c1" +uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +version = "0.27.1" + +[[PooledArrays]] +deps = ["DataAPI"] +git-tree-sha1 = "b1333d4eced1826e15adbdf01a4ecaccca9d353c" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "0.5.3" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RecipesBase]] +git-tree-sha1 = "7bdce29bc9b2f5660a6e5e64d64d91ec941f6aa2" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "0.7.0" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["Test"] +git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "0.5.2" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "6ccde405cf0759eba835eb613130723cb8f10ff9" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.2.16" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Showoff]] +deps = ["Dates", "Grisu"] +git-tree-sha1 = "ee010d8f103468309b8afac4abb9be2e18ff1182" +uuid = 
"992d4aef-0814-514b-bc4d-f2e9a6c4116f" +version = "0.3.2" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "016d1e1a00fabc556473b07161da3d39726ded35" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.4" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.2" + +[[StructTypes]] +deps = ["Dates", "UUIDs"] +git-tree-sha1 = "1ed04f622a39d2e5a6747c3a70be040c00333933" +uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +version = "1.1.0" + +[[TableOperations]] +deps = ["Tables", "Test"] +git-tree-sha1 = "208630a14884abd110a8f8008b0882f0d0f5632c" +uuid = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" +version = "0.2.1" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.0" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] +git-tree-sha1 = "24a584cf65e2cfabdadc21694fb69d2e74c82b44" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.1.0" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = 
"cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[ZygoteRules]] +deps = ["MacroTools"] +git-tree-sha1 = "b3b4882cc9accf6731a08cc39543fbc6b669dca8" +uuid = "700de1a5-db45-46bc-99cf-38207098b444" +version = "0.2.0" diff --git a/docs/Project.toml b/docs/Project.toml index 65ccf08..cdf33d2 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,10 +1,14 @@ [deps] +AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575" -RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -DataFrames = ">= 0.16" +DataFrames = ">= 0.21" Documenter = "~0.22" -RDatasets = ">= 0.6.2" diff --git a/docs/make.jl b/docs/make.jl index c634dcb..fcf9ef7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,18 +2,26 @@ using Documenter, Impute makedocs( modules=[Impute], - format=:html, + format=Documenter.HTML(assets=["assets/invenia.css"]), pages=[ "Home" => "index.md", - "Impute" => "api/impute.md", - "Context" => "api/context.md", - "Imputors" => "api/imputors.md", - "Utilities" => "api/utils.md", + "Walkthroughs" => [ + "Spatiotemporal" => "walkthroughs/spatiotemporal.md", + "SVD" => "walkthroughs/svd.md", + ], + "API" => [ + "Impute" => "api/impute.md", + "Assertions" => "api/assertions.md", + "Filter" => "api/filter.md", + "Imputors" => "api/imputors.md", + "Chain" => "api/chain.md", + "Functional" => "api/functional.md", + "Utilities" => "api/utils.md", + ], ], repo="https://github.com/invenia/Impute.jl/blob/{commit}{path}#L{line}", sitename="Impute.jl", authors="Invenia Technical Computing Corporation", - 
assets=["assets/invenia.css"], ) deploydocs( diff --git a/docs/src/api/assertions.md b/docs/src/api/assertions.md new file mode 100644 index 0000000..ab1af7e --- /dev/null +++ b/docs/src/api/assertions.md @@ -0,0 +1,14 @@ +# Assertions + +```@autodocs +Modules = [Impute] +Pages = ["assertions.jl"] +Order = [:module, :constant, :type, :function] +``` + +## Threshold +```@autodocs +Modules = [Impute] +Pages = ["threshold.jl"] +Order = [:module, :constant, :type, :function] +``` diff --git a/docs/src/api/context.md b/docs/src/api/chain.md similarity index 74% rename from docs/src/api/context.md rename to docs/src/api/chain.md index 70de958..a7b3bce 100644 --- a/docs/src/api/context.md +++ b/docs/src/api/chain.md @@ -1,5 +1,6 @@ +# Chain ```@autodocs Modules = [Impute] -Pages = ["context.jl"] +Pages = ["chain.jl"] Order = [:module, :constant, :type, :function] ``` diff --git a/docs/src/api/filter.md b/docs/src/api/filter.md new file mode 100644 index 0000000..33a614c --- /dev/null +++ b/docs/src/api/filter.md @@ -0,0 +1,6 @@ +# Filter +```@autodocs +Modules = [Impute] +Pages = ["filter.jl"] +Order = [:module, :constant, :type, :function] +``` diff --git a/docs/src/api/functional.md b/docs/src/api/functional.md new file mode 100644 index 0000000..21f7225 --- /dev/null +++ b/docs/src/api/functional.md @@ -0,0 +1,83 @@ +# Functional + +To reduce verbosity, Impute.jl also provides a functional interface to its `Assertion`s, `Filter`s, `Imputor`s, etc. 
+ +Ex) + +```jldoctest +julia> using Impute: Interpolate, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]; + +julia> impute(M, Interpolate(); dims=:rows) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` + +Can also be written as +```jldoctest +julia> using Impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5]; + +julia> Impute.interp(M; dims=:rows) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` + +## Threshold +```@docs +Impute.threshold +``` + +## Filter +```@docs +Impute.filter +``` + +## Standardize +```@docs +Impute.standardize +``` + +## Replace +```@docs +Impute.replace +``` + +## Substitute +```@docs +Impute.substitute +``` + +## Simple Random Sample (SRS) +```@docs +Impute.srs +``` + +## Interpolate +```@docs +Impute.interp +``` + +## Last Observation Carried Forward (LOCF) +```@docs +Impute.locf +``` + +## Next Observation Carried Backward (NOCB) +```@docs +Impute.nocb +``` + +## K-Nearest Neighbors (KNN) +```@docs +Impute.knn +``` + +## SVD +```@docs +Impute.svd +``` diff --git a/docs/src/api/imputors.md b/docs/src/api/imputors.md index 3d9c926..72502f5 100644 --- a/docs/src/api/imputors.md +++ b/docs/src/api/imputors.md @@ -1,44 +1,70 @@ +# Imputors + ```@autodocs Modules = [Impute] Pages = ["imputors.jl"] Order = [:module, :constant, :type, :function] ``` -# Drop +## Standardize +```@autodocs +Modules = [Impute] +Pages = ["standardize.jl"] +Order = [:module, :constant, :type, :function] +``` + +## Replace ```@autodocs Modules = [Impute] -Pages = ["drop.jl"] +Pages = ["replace.jl"] Order = [:module, :constant, :type, :function] ``` -# Fill + +## Substitute ```@autodocs Modules = [Impute] -Pages = ["fill.jl"] +Pages = ["substitute.jl"] Order = [:module, :constant, :type, :function] ``` -# Interpolate + +## Simple Random Sample (SRS) +```@autodocs +Modules = [Impute] +Pages = ["srs.jl"] +Order = [:module, :constant, :type, 
:function] +``` + +## Interpolate ```@autodocs Modules = [Impute] Pages = ["interp.jl"] Order = [:module, :constant, :type, :function] ``` -# Last Observation Carried Forward (LOCF) + +## Last Observation Carried Forward (LOCF) ```@autodocs Modules = [Impute] Pages = ["locf.jl"] Order = [:module, :constant, :type, :function] ``` -# Next Observation Carried Backward (NOCB) +## Next Observation Carried Backward (NOCB) ```@autodocs Modules = [Impute] Pages = ["nocb.jl"] Order = [:module, :constant, :type, :function] ``` -# Chain +## K-Nearest Neighbors (KNN) +```@autodocs +Modules = [Impute] +Pages = ["knn.jl"] +Order = [:module, :constant, :type, :function] +``` + +## SVD ```@autodocs Modules = [Impute] -Pages = ["chain.jl"] +Pages = ["svd.jl"] Order = [:module, :constant, :type, :function] ``` diff --git a/docs/src/index.md b/docs/src/index.md index a2ddf2e..467c288 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,8 +1,8 @@ # Impute ```@setup quickstart -using DataFrames, RDatasets, Impute -df = dataset("boot", "neuro") +using DataFrames, Impute +df = Impute.dataset("test/table/neuro") |> DataFrame ``` [![stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://invenia.github.io/Impute.jl/stable/) @@ -23,20 +23,20 @@ julia> using Pkg; Pkg.add("Impute") Let's start by loading our dependencies: ```@repl -using DataFrames, RDatasets, Impute +using DataFrames, Impute ``` We'll also want some test data containing `missing`s to work with: ```@repl quickstart -df = dataset("boot", "neuro") +df = Impute.dataset("test/table/neuro") |> DataFrame ``` Our first instinct might be to drop all observations, but this leaves us too few rows to work with: ```@repl quickstart -Impute.drop(df) +Impute.filter(df; dims=:rows) ``` We could try imputing the values with linear interpolation, but that still leaves missing diff --git a/docs/src/walkthroughs/spatiotemporal.md b/docs/src/walkthroughs/spatiotemporal.md new file mode 100644 index 0000000..67f7e84 --- 
/dev/null +++ b/docs/src/walkthroughs/spatiotemporal.md @@ -0,0 +1,97 @@ +# Spatiotemporal Panel Datasets + +We often also need to impute missing data in spatiotemporal data. +For this example, we'll use daily temperature values from major cities around the world. + +TODO: Give a different workflow/example using a DataFrame. + +```@repl st-example +using AxisKeys, Impute, NamedDims, Plots, Statistics, StatsBase + +# So NamedDimsArray is the outer wrapper +AxisKeys.nameouter() = true + +# Construct a KeyedArray of our dataset as we want to track gaps (or missing rows) +# in the source CSV data. +data = wrapdims( + Impute.dataset("test/table/temperature"), + :AverageTemperature, + :dt, + :City; + default=missing, + sort=true, +) + +# Rename our dims +data = rename(data, :dt => :time, :City => :loc) +``` + +Okay, so let's take a look at how much temperature data is missing. +```@repl st-example +heatmap(ismissing.(data); color=:greys); +savefig("st-missing-plot.svg"); nothing # hide +``` +![](st-missing-plot.svg) + +So many cities are missing a lot of historical data. +A common operation is to remove locations with too many missing historical observations. +In our case, we also want to penalize observations closer to the present. + +Let's start by defining a set of exponential weights for our observations: +```@repl st-example +wv = eweights(1:length(data.time), 0.001) +plot(wv); +savefig("st-wv-plot.svg"); nothing # hide +``` +![](st-wv-plot.svg) + +Now we want to filter out locations (columns) according to those weights. +For now, we'll say that a location should be removed if the weighted ratio exceeds `0.1`. +```@repl st-example +data = Impute.filter(data; dims=:cols) do v + mratio = sum(wv[ismissing.(v)]) / sum(wv) + return mratio < 0.1 +end +``` + +Okay, so we removed almost 25% of the locations that didn't meet our missing data requirement. +However, most of our observations from the 1700's are still mostly missing. 
+Let's remove those rows that have more than 50% of the locations missing. +```@repl st-example +data = Impute.filter(data; dims=:rows) do v + mratio = count(ismissing, v) / length(v) + return mratio < 0.5 +end +``` + +Now let's take a look at what data remains. +```@repl st-example +heatmap(ismissing.(data); color=:greys); +savefig("st-missing-reduced-plot.svg"); nothing # hide +``` +![](st-missing-reduced-plot.svg) + + +Alright, we can work with the remaining missing values now. +Now we could try simply imputing the values columnwise for each city using something like `Impute.nocb` +```@repl st-example +heatmap(Impute.nocb(data; dims=:cols)); +savefig("st-nocb-plot.svg"); nothing # hide +``` +![](st-nocb-plot.svg) + +But, this looks rather crude and creates clear artifacts in the dataset. +Since we suspect that observations in similar locations would have had similar recordings +we could use `Impute.svd` or `Impute.knn` to find similarities across multiple locations. +NOTE: We need to call `svd!` on the raw data because `NamedDimsArray`s/`KeyedArray`s don't seem to support `LinearAlgebra.svd` yet. +```@repl st-example +Impute.svd!(parent(parent(data)); init=Impute.NOCB(), dims=:cols, tol=1e-2); +heatmap(data); +savefig("st-svd-plot.svg"); nothing # hide +``` +![](st-svd-plot.svg) + +TODO: Use KNN after fixing bug with that imputor + +This method appears to have removed the artifacts found with the basic NOCB method alone. +Now we have a complete dataset ready for downstream processing :) diff --git a/docs/src/walkthroughs/svd.md b/docs/src/walkthroughs/svd.md new file mode 100644 index 0000000..d92d3e2 --- /dev/null +++ b/docs/src/walkthroughs/svd.md @@ -0,0 +1,53 @@ +# SVD Imputation + +Often matrices and n-dimensional arrays with missing values can be imputed via a low rank approximation. +Impute.jl provides one such method using a singular value decomposition. +The general idea is to: + +1. 
Fill the missing values with some rough approximations (e.g., `mean`, `median`, `rand`) +2. Reconstruct this "completed" matrix with a low rank SVD approximation (i.e., `k` largest singular values) +3. Replace our initial estimates with the reconstructed values +4. Repeat steps 2-3 until convergence (update difference is below a tolerance) + +To demonstrate how this is useful let's load a reduced MNIST dataset. +We'll want both the completed dataset and another dataset with 25% of the values set to `-1.0` (indicating missingness). + +TODO: Update example with a more realistic dataset like some microarray data + +```@repl svd-example +using Distances, Impute, Plots, Statistics +mnist = Impute.dataset("test/matrix/mnist"); +completed, incomplete = mnist[0.0], mnist[0.25]; +``` + +Alright, before we get started let's have a look at what our incomplete data looks like: + +```@repl svd-example +heatmap(incomplete; color=:greys); +savefig("mnist-incomplete-plot.svg"); nothing # hide +``` +![](mnist-incomplete-plot.svg) + +Okay, so as we'd expect there's a reasonable bit of structure we can exploit. +So how does the svd method compare against other common, yet simpler, methods? + +```@repl svd-example +data = Impute.standardize(incomplete; values=-1.0) + +# NOTE: SVD performance is almost identical regardless of the `init` setting. 
+imputors = [ + "0.5" => Impute.Replace(; values=0.5), + "mean" => Impute.Substitute(; robust=false), + "median" => Impute.Substitute(), + "svd" => Impute.SVD(; tol=1e-2), +] + +results = map(last.(imputors)) do imp + r = Impute.impute(data, imp; dims=:) + return nrmsd(completed, r) +end + +bar(first.(imputors), results); +savefig("svd-results-plot.svg"); nothing # hide +``` +![](svd-results-plot.svg) diff --git a/src/Impute.jl b/src/Impute.jl index 2d6e158..8ec638d 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -1,8 +1,12 @@ module Impute +using BSON +using CSV +using DataDeps using Distances using IterTools using Missings +using NamedDims using NearestNeighbors using Random using Statistics @@ -14,314 +18,15 @@ using Base.Iterators using LinearAlgebra using LinearAlgebra: Diagonal -import Base.Iterators: drop - -""" - ImputeError{T} <: Exception - -Is thrown by `impute` methods when the limit of imputable values has been exceeded. - -# Fields -* msg::T - the message to print. -""" -struct ImputeError{T} <: Exception - msg::T -end - -Base.showerror(io::IO, err::ImputeError) = println(io, "ImputeError: $(err.msg)") - -include("context.jl") +include("utils.jl") +include("assertions.jl") include("imputors.jl") - -#= -These default methods are required because @auto_hash_equals doesn't -play nice with Base.@kwdef -=# -function Base.hash(imp::T, h::UInt) where T <: Union{Imputor, AbstractContext} - h = hash(Symbol(T), h) - - for f in fieldnames(T) - h = hash(getfield(imp, f), h) - end - - return h -end - -function Base.:(==)(a::T, b::T) where T <: Union{Imputor, AbstractContext} - result = true - - for f in fieldnames(T) - if !isequal(getfield(a, f), getfield(b, f)) - result = false - break - end - end - - return result -end - -const global imputation_methods = ( - drop = DropObs, - dropobs = DropObs, - dropvars = DropVars, - interp = Interpolate, - interpolate = Interpolate, - fill = Fill, - locf = LOCF, - nocb = NOCB, - srs = SRS, - svd = SVD, - knn = KNN, -) - 
+include("filter.jl") +include("chain.jl") include("deprecated.jl") +include("functional.jl") +include("data.jl") -for (f, v) in pairs(imputation_methods) - typename = nameof(v) - f! = Symbol(f, :!) - - @eval begin - $f(data; kwargs...) = _impute(data, $typename, kwargs...) - $f!(data; kwargs...) = _impute!(data, $typename, kwargs...) - $f(; kwargs...) = data -> _impute(data, $typename, kwargs...) - $f!(; kwargs...) = data -> _impute!(data, $typename, kwargs...) - end -end - -@doc """ - Impute.dropobs(data; dims=1, context=Context()) - -Removes missing observations from the `AbstractArray` or `Tables.table` provided. -See [DropObs](@ref) for details. - -# Example -``` -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.dropobs(df; dims=2, context=Context(; limit=1.0)) -3×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼─────────┼─────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ 5.0 │ 5.5 │ -``` -""" dropobs - -@doc """ - Impute.dropvars(data; dims=1, context=Context()) - -Finds variables with too many missing values in a `AbstractMatrix` or `Tables.table` and -removes them from the input data. See [DropVars](@ref) for details. 
- -# Example -```jldoctest -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.dropvars(df; context=Context(; limit=0.2)) -5×1 DataFrames.DataFrame -│ Row │ b │ -│ │ Float64 │ -├─────┼──────────┤ -│ 1 │ 1.1 │ -│ 2 │ 2.2 │ -│ 3 │ 3.3 │ -│ 4 │ missing │ -│ 5 │ 5.5 │ -``` -""" dropvars - -@doc """ - Impute.interp(data; dims=1, context=Context()) - -Performs linear interpolation between the nearest values in an vector. -See [Interpolate](@ref) for details. - -# Example -```jldoctest -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.interp(df; context=Context(; limit=1.0)) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ 3.0 │ 3.3 │ -│ 4 │ 4.0 │ 4.4 │ -│ 5 │ 5.0 │ 5.5 │ -``` -""" interp - -@doc """ - Impute.fill(data; value=mean, dims=1, context=Context()) - -Fills in the missing data with a specific value. See [Fill](@ref) for details. 
- -# Example -```jldoctest -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.fill(df; value=-1.0, context=Context(; limit=1.0)) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ -1.0 │ 3.3 │ -│ 4 │ -1.0 │ -1.0 │ -│ 5 │ 5.0 │ 5.5 │ -``` -""" fill - -@doc """ - Impute.locf(data; dims=1, context=Context()) - -Iterates forwards through the `data` and fills missing data with the last existing -observation. See [LOCF](@ref) for details. - -# Example -```jldoctest -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.locf(df; context=Context(; limit=1.0)) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ 2.0 │ 3.3 │ -│ 4 │ 2.0 │ 3.3 │ -│ 5 │ 5.0 │ 5.5 │ -``` -""" locf - -@doc """ - Impute.nocb(data; dims=1, context=Context()) - -Iterates backwards through the `data` and fills missing data with the next existing -observation. See [LOCF](@ref) for details. 
- -# Example -```jldoctest -julia> using DataFrames; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.nocb(df; context=Context(; limit=1.0)) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ 5.0 │ 3.3 │ -│ 4 │ 5.0 │ 5.5 │ -│ 5 │ 5.0 │ 5.5 │ -``` -""" nocb - -@doc """ - Impute.srs(data; rng=Random.GLOBAL_RNG, context=Context()) - -Simple Random Sampling (SRS) imputation is a method for imputing both continuous and -categorical variables. Furthermore, it completes imputation while preserving the -distributional properties of the variables (e.g., mean, standard deviation). - -# Example -```jldoctest -julia> using DataFrames; using Random; using Impute: Impute, Context - -julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) -5×2 DataFrames.DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ missing │ 3.3 │ -│ 4 │ missing │ missing │ -│ 5 │ 5.0 │ 5.5 │ - -julia> Impute.srs(df; rng=MersenneTwister(1234), context=Context(; limit=1.0)) -5×2 DataFrame -│ Row │ a │ b │ -│ │ Float64 │ Float64 │ -├─────┼──────────┼──────────┤ -│ 1 │ 1.0 │ 1.1 │ -│ 2 │ 2.0 │ 2.2 │ -│ 3 │ 1.0 │ 3.3 │ -│ 4 │ 5.0 │ 3.3 │ -│ 5 │ 5.0 │ 5.5 │ -``` -""" srs +__init__() = register_datadep() end # module diff --git a/src/assertions.jl b/src/assertions.jl new file mode 100644 index 0000000..9526686 --- /dev/null +++ b/src/assertions.jl @@ -0,0 +1,105 @@ +""" + Assertion + +An Assertion stores settings for checking the validity of a `AbstractArray` or `Tables.table` containing missing 
values. +New assertions are expected to subtype `Impute.Assertion` and, at minimum, +implement the `_assert(data::AbstractArray{Union{T, Missing}}, a::MyAssertion)` method. +""" +abstract type Assertion end + +""" + assert(data::AbstractArray, a::Assertion; dims=:) + +If the assertion `a` fails then an error (e.g., a `ThresholdError`) is thrown, otherwise the `data` +provided is returned without mutation. See [`Assertion`](@ref) for the minimum internal +`_assert` call requirements. + +# Arguments +* `data::AbstractArray`: the data to check along dimensions `dims` +* `a::Assertion`: the assertion to apply + +# Keywords +* `dims`: The dimension to apply the `_assert` along (default is `:`) + +# Returns +* the input `data` if no error is thrown. + +# Throws +* An error when the test fails + +```jldoctest +julia> using Test; using Impute: Threshold, ThresholdError, assert + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> @test_throws ThresholdError assert(M, Threshold()) +Test Passed + Thrown: ThresholdError +``` +""" +function assert(data::AbstractArray, a::Assertion; dims=:, kwargs...) + dims === Colon() && return _assert(data, a; kwargs...) + d = Impute.dim(data, dims) + + for d in eachslice(data; dims=d) + _assert(d, a; kwargs...) + end + return data +end + +""" + assert(table, a::Assertion; cols=nothing) + +Applies the assertion `a` to the `table` 1 column at a time; if this is not the desired +behaviour custom `assert` methods should overload this method. See [`Assertion`](@ref) for +the minimum internal `_assert` call requirements. + +# Arguments +* `table`: the data to check +* `a`: the assertion to apply + +# Keyword Arguments +* `cols`: The columns to check (default is to check all columns) + +# Returns +* the input `data` if no error is thrown.
+ +# Throws +* An error when any column doesn't pass the test + +# Example +```jldoctest +julia> using DataFrames, Test; using Impute: Threshold, ThresholdError, assert + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> @test_throws ThresholdError assert(df, Threshold()) +Test Passed + Thrown: ThresholdError +``` +""" +function assert(table, a::Assertion; cols=nothing, kwargs...) + istable(table) || throw(MethodError(assert, (table, a))) + columntable = Tables.columns(table) + + cnames = cols === nothing ? propertynames(columntable) : cols + for cname in cnames + _assert(getproperty(columntable, cname), a; kwargs...) + end + + return table +end + +include("assertions/threshold.jl") diff --git a/src/assertions/threshold.jl b/src/assertions/threshold.jl new file mode 100644 index 0000000..2218655 --- /dev/null +++ b/src/assertions/threshold.jl @@ -0,0 +1,58 @@ +""" + ThresholdError <: Exception + +Is thrown when a Threshold limit is exceeded. + +# Fields +* limit::Float64 - the threshold limit. +* value::Float64 - the missing data ratio identified +""" +struct ThresholdError <: Exception + limit::Float64 + value::Float64 +end + +function Base.showerror(io::IO, err::ThresholdError) + println( + io, + "ThresholdError: Ratio of missing values exceeded $(err.limit) ($(err.value))", + ) +end + +""" + Threshold(; ratio=0.1, weights=nothing) + +Assert that the ratio of missing values in the provided dataset does not exceed the specified ratio. +If a weights array is provided then the ratio will be calculated as the +`sum(weights[ismissing.(data)]) / sum(weights)` + +# Keyword Arguments +* `ratio::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
+* `weights::AbstractWeights`: A set of statistical weights to use when evaluating the importance + of each observation. If present a weighted ratio of missing values will be calculated. +""" +struct Threshold <: Assertion + ratio::Float64 + weights::Union{AbstractWeights, Nothing} +end + +Threshold(; ratio=0.1, weights=nothing) = Threshold(ratio, weights) + +function _assert(data::AbstractArray{Union{T, Missing}}, t::Threshold) where T + mratio = if t.weights === nothing + count(ismissing, data) / length(data) + else + if size(data) != size(t.weights) + throw(DimensionMismatch(string( + "Input has dimensions $(size(data)), but thresholds weights ", + "has dimensions $(size(t.weights))" + ))) + end + + sum(t.weights[ismissing.(data)]) / sum(t.weights) + end + + mratio > t.ratio && throw(ThresholdError(t.ratio, mratio)) + + return data +end diff --git a/src/chain.jl b/src/chain.jl new file mode 100644 index 0000000..6bde4ab --- /dev/null +++ b/src/chain.jl @@ -0,0 +1,82 @@ +const Transform = Union{Assertion, Filter, Imputor} + +""" + Chain{T<:Tuple{Vararg{Transform}}} <: Function + +Runs multiple `Assertion`s, `Filter`s or `Imputor`s on the same data in the order they're +provided. + +# Fields +* `transforms::Tuple{Vararg{Union{Assertion, Filter, Imputor}}}` +""" +struct Chain{T<:Tuple{Vararg{Transform}}} <: Function + transforms::T +end + +Chain(transforms::Vector{<:Transform}) = Chain(Tuple(transforms)) + +""" + Chain(transforms::Union{Assertion, Filter, Imputor}...) -> Chain + +Creates a Chain using the transforms provided (ordering matters). +""" +Chain(transforms::Transform...)
= Chain(tuple(transforms...)) + +""" +Compose new chains with the composition operator + +# Example + +```jldoctest +julia> using Impute: Impute, Interpolate, NOCB, LOCF + +julia> M = [missing 2.0 missing missing 5.0; 1.1 2.2 missing 4.4 missing] +2×5 Array{Union{Missing, Float64},2}: + missing 2.0 missing missing 5.0 + 1.1 2.2 missing 4.4 missing + +julia> C = Interpolate() ∘ NOCB() ∘ LOCF(); + +julia> C(M; dims=:rows) +2×5 Array{Union{Missing, Float64},2}: + 2.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 4.4 +``` +""" +Base.:(∘)(a::Transform, b::Transform) = Chain(a, b) +Base.:(∘)(C::Chain, b::Transform) = Chain(C.transforms..., b) + +""" + (C::Chain)(data; kwargs...) + +Runs the callable chain `C` on the supplied `data`. + +# Arguments +* `data`: our data to impute + +# Keyword Arguments +* `kwargs`: Keyword arguments that should be applied to each transform (ex `dims=:cols`) + +# Returns +* our imputed data +""" +function (C::Chain)(data; kwargs...) + # Since some operations like filtering can't consistently mutate the data we make a copy + # and don't support a mutating form. + X = trycopy(data) + + for t in C.transforms + if isa(t, Assertion) + # Assertions just return the input + assert(X, t; kwargs...) + elseif isa(t, Filter) + # Filtering doesn't always work in-place + X = apply(X, t; kwargs...) + else + # An in-place impute! method should always exist + X = impute!(X, t; kwargs...) + end + end + + return X +end diff --git a/src/context.jl b/src/context.jl deleted file mode 100644 index 29e41af..0000000 --- a/src/context.jl +++ /dev/null @@ -1,230 +0,0 @@ -""" - AbstractContext - -An imputation context records summary information about missing data for an imputation algorithm.
-All `AbstractContext`s are callable with a function, which allows us to write code like: - -```julia -context() do c - # My imputation code using a clean context -end -``` - -This do-block will pass a fresh context to your code and apply the `on_complete` function on -the resulting data and context state. By default, `on_complete` will throw an -[ImputeError](@ref) if we have too many missing values. -""" -abstract type AbstractContext end - -# We implement a version of copy for all contexts which reconstructs the context from the -# raw fields. -Base.copy(ctx::T) where {T <: AbstractContext} = T(fieldvalues(ctx)...) - -""" - ismissing!(ctx::AbstractContext, x) -> Bool - -Uses `ctx.is_missing` to determine if x is missing. If x is a `NamedTuple` or an `AbstractArray` -then `ismissing!` will return true if `ctx.is_missing` returns true for any element. -The ctx.count is increased whenever whenever we return true and if `ctx.count / ctx.num` -exceeds our `ctx.limit` we throw an `ImputeError` - -# Arguments -* `ctx::Context`: the contextual information about missing information. -* `x`: the value to check (may be an single values, abstract array or row) -""" -function ismissing!(ctx::AbstractContext, x) - was_missing = if isa(x, NamedTuple) - any(ctx.is_missing, Tuple(x)) - elseif isa(x, AbstractArray) - any(ctx.is_missing, x) - else - ctx.is_missing(x) - end - - missing_update!(ctx, was_missing) - - return was_missing -end - -""" - findfirst(ctx::AbstractContext, data::AbstractVector) -> Int - -Returns the first non-missing index in `data`. 
- -# Arguments -* `ctx::AbstractContext`: the context to pass into `ismissing!` -* `data::AbstractVector`: the data array to search - -# Returns -* `Int`: the first index in `data` that isn't missing -""" -function Base.findfirst(ctx::AbstractContext, data::AbstractVector) - return findfirst(x -> !ismissing!(ctx, x), data) -end - -""" - findlast(ctx::AbstractContext, data::AbstractVector) -> Int - -Returns the last non-missing index in `data`. - -# Arguments -* `ctx::AbstractContext`: the context to pass into `ismissing!` -* `data::AbstractVector`: the data array to search - -# Returns -* `Int`: the last index in `data` that isn't missing -""" -function Base.findlast(ctx::AbstractContext, data::AbstractVector) - return findlast(x -> !ismissing!(ctx, x), data) -end - -""" - findnext(ctx::AbstractContext, data::AbstractVector) -> Int - -Returns the next non-missing index in `data`. - -# Arguments -* `ctx::AbstractContext`: the context to pass into `ismissing!` -* `data::AbstractVector`: the data array to search - -# Returns -* `Int`: the next index in `data` that isn't missing -""" -function Base.findnext(ctx::AbstractContext, data::AbstractVector, idx::Int) - return findnext(x -> !ismissing!(ctx, x), data, idx) -end - -mutable struct Context <: AbstractContext - num::Int - count::Int - limit::Float64 - is_missing::Function - on_complete::Function -end - -""" - Context - -Records base information about the missing data and assume all observations are equally -weighted. - -# Keyword Arguments -* `n::Int`: number of observations -* `count::Int`: number of missing values found -* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). 
-* `is_missing::Function`: must return a Bool indicating if the value counts as missing -* `on_complete::Function`: a function to run when imputation is complete -""" -function Context(; - limit::Float64=1.0, - is_missing::Function=ismissing, - on_complete::Function=complete -) - return Context(0, 0, limit, is_missing, on_complete) -end - -function Base.empty(ctx::Context) - _ctx = copy(ctx) - _ctx.num = 0 - _ctx.count = 0 - - return _ctx -end - -function missing_update!(ctx::Context, was_missing) - ctx.num += 1 - - if was_missing - ctx.count += 1 - end -end - -function complete(ctx::Context, data) - missing_ratio = ctx.count / ctx.num - - if missing_ratio > ctx.limit - throw(ImputeError( - "More than $(ctx.limit * 100)% of values were missing ($missing_ratio)." - )) - end - - return data -end - - -mutable struct WeightedContext <: AbstractContext - num::Int - s::Float64 - limit::Float64 - is_missing::Function - on_complete::Function - wv::AbstractWeights -end - -""" - WeightedContext(wv; limit=1.0, is_missing=ismissing, on_complete=complete) - -Records information about the missing data relative to a set of weights. -This context type can be useful if some missing observation are more important than others -(e.g., more recent observations in time series datasets) - -# Arguments -* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance - of each observation. Will be accumulated during imputation. - -# Keyword Arguments -* `num::Int`: number of observations -* `s::Float64`: sum of the weights of missing values -* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). 
-* `is_missing::Function`: returns a Bool if the value counts as missing -* `on_complete::Function`: a function to run when imputation is complete -""" -function WeightedContext( - wv::AbstractWeights; - limit::Real=1.0, - is_missing::Function=ismissing, - on_complete::Function=complete -) - return WeightedContext(0, 0.0, limit, is_missing, on_complete, wv) -end - -function Base.empty(ctx::WeightedContext) - _ctx = copy(ctx) - _ctx.num = 0 - _ctx.s = 0.0 - - return _ctx -end - -function missing_update!(ctx::WeightedContext, was_missing) - ctx.num += 1 - - if was_missing - ctx.s += ctx.wv[ctx.num] - end -end - -function complete(ctx::WeightedContext, data) - missing_ratio = ctx.s / sum(ctx.wv) - - if missing_ratio > ctx.limit - throw(ImputeError( - "More than $(ctx.limit * 100)% of weighted values were missing ($missing_ratio)." - )) - end - - return data -end - -#= -Define our callable methods for each context. Once we drop 1.0 we should be able to just -define this on the `AbstractContext`. 
-=# -for T in (Context, WeightedContext) - @eval begin - function (ctx::$T)(f::Function) - _ctx = empty(ctx) - return ctx.on_complete(_ctx, f(_ctx)) - end - end -end diff --git a/src/data.jl b/src/data.jl new file mode 100644 index 0000000..543d81a --- /dev/null +++ b/src/data.jl @@ -0,0 +1,41 @@ +function register_datadep() + register( + DataDep( + "impute-v1.0.0", + "Datasets for testing and demonstrating Impute.jl", + "https://invenia-public-datasets.s3.amazonaws.com/Impute/v1.0.0/datasets.tar.gz", + "938b3705752eb73141476a2abc7a36cfdaba9ec45f99f0796f44e0870e006e1c", + post_fetch_method=unpack, + ) + ) +end + +function datasets() + dep = datadep"impute-v1.0.0/data/" + + # Only select paths containing a data.x file + selected = Iterators.filter(walkdir(dep)) do (root, dirs, files) + any(f -> first(splitext(f)) == "data", files) + end + + # Return just the root path with the data dep path part removed + return [first(t)[length(dep)+2:end] for t in selected] +end + +function dataset(name) + dep = @datadep_str joinpath("impute-v1.0.0/data", name) + files = readdir(dep) + idx = findfirst(f -> first(splitext(f)) == "data", files) + idx === nothing && throw(ArgumentError("No data file found for $name.")) + fullpath = joinpath(dep, files[idx]) + ext = splitext(fullpath)[end] + + # This is necessary because CSV isn't registered in FileIO + if ext == ".csv" + return CSV.File(fullpath) + elseif ext == ".bson" + return BSON.load(fullpath) + else + throw(ArgumentError("Unsupported file type $ext.")) + end +end diff --git a/src/deprecated.jl b/src/deprecated.jl index e69de29..d467d51 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -0,0 +1,229 @@ +# Introduced in 0.6 +# NOTE: Deprecated Imputor docstrings use julia-repl rather than jldoctest since depwarn +# output isn't consistent across installs. +""" + Fill(; value=mean) + +Fills in the missing data with a specific value.
+The current implementation is univariate, so each variable in a table or matrix will +be handled independently. + +!!! Use Impute.Replace for constants or Impute.Substitute for median/mode substitution. + +# Keyword Arguments +* `value::Any`: A scalar or a function that returns a scalar if + passed the data with missing data removed (e.g., `mean`) + +# Example +```julia-repl +julia> using Impute: Fill, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Fill(); dims=:rows) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.66667 2.66667 5.0 + 1.1 2.2 3.3 3.025 5.5 +``` +""" +struct Fill{T} <: Imputor + value::T + + function Fill(value::T) where T + Base.depwarn( + "Impute.Fill is deprecated in favour of Impute.Replace for constants and " * + "Impute.Substitute for calculating summary statistics over non-missing data.", + :Fill + ) + return new{T}(value) + end +end + +# TODO: Switch to using Base.@kwdef on 1.1 +Fill(; value=mean) = Fill(value) + +function _impute!(data::AbstractArray{Union{T, Missing}}, imp::Fill) where T + fill_val = if isa(imp.value, Function) + available = skipmissing(data) + + if isempty(available) + @debug "Cannot apply fill function $(imp.value) as all values are missing" + return data + else + imp.value(available) + end + else + imp.value + end + + for i in eachindex(data) + if ismissing(data[i]) + data[i] = fill_val + end + end + + return data +end + +# A couple utility methods to avoid messing up var and obs dimensions +# NOTE: We aren't deprecating these as they were always internal functions that weren't +# intended for public use. +obsdim(dims::Int) = dims +vardim(dims::Int) = dims == 1 ? 2 : 1 # dims is obsdims, so we want the other one.
+ +function obswise(data::AbstractMatrix; dims=1) + return (selectdim(data, obsdim(dims), i) for i in axes(data, obsdim(dims))) +end + +function varwise(data::AbstractMatrix; dims=2) + return (selectdim(data, vardim(dims), i) for i in axes(data, vardim(dims))) +end + +function filterobs(f::Function, data::AbstractMatrix; dims=1) + mask = [f(x) for x in obswise(data; dims=dims)] + return dims == 1 ? data[mask, :] : data[:, mask] +end + +function filtervars(f::Function, data::AbstractMatrix; dims=2) + mask = [f(x) for x in varwise(data; dims=dims)] + return dims == 1 ? data[:, mask] : data[mask, :] +end + +""" + DropObs() + +Removes missing observations from the `AbstractArray` or `Tables.table` +provided. + +!!! Use `Impute.filter` instead + +# Example +```julia-repl +julia> using Impute: DropObs, impute + +julia> M = [1.0 1.1; 2.0 2.2; missing 3.3; missing missing; 5.0 5.5] +5×2 Array{Union{Missing, Float64},2}: + 1.0 1.1 + 2.0 2.2 + missing 3.3 + missing missing + 5.0 5.5 + +julia> impute(M, DropObs()) +3×2 Array{Union{Missing, Float64},2}: + 1.0 1.1 + 2.0 2.2 + 5.0 5.5 +``` +""" +struct DropObs <: Imputor + function DropObs() + Base.depwarn( + "Impute.DropObs is deprecated in favour of the more general Impute.Filter.", + :DropObs + ) + return new() + end +end + +# Special case impute! for vectors because we know filter! 
will work +impute!(data::Vector, imp::DropObs) = Base.filter!(!ismissing, data) + +function impute!(data::Vector{<:NamedTuple}, imp::DropObs) + return Base.filter!(r -> all(!ismissing, propertyvalues(r)), data) +end + +impute(data::AbstractVector, imp::DropObs) = Base.filter(!ismissing, data) + +function impute(data::Vector{<:NamedTuple}, imp::DropObs) + return Base.filter(r -> all(!ismissing, propertyvalues(r)), data) +end + +function impute(data::AbstractMatrix{Union{T, Missing}}, imp::DropObs; dims=1) where T + return filterobs(obs -> all(!ismissing, obs), data; dims=dims) +end + +function impute(table, imp::DropObs) + istable(table) || throw(MethodError(impute, (table, imp))) + rows = Tables.rows(table) + + # Unfortunately, we'll need to construct a new table + # since Tables.rows is just an iterator + filtered = Iterators.filter(rows) do r + all(!ismissing, propertyvalues(r)) + end + + table = materializer(table)(filtered) + return table +end + + +""" + DropVars() + + +Finds variables with too many missing values in a `AbstractMatrix` or +`Tables.table` and removes them from the input data. + +!!! 
Use `Impute.filter` instead + +# Examples +```julia-repl +julia> using Impute: DropVars, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, DropVars()) +2×3 Array{Union{Missing, Float64},2}: + 1.0 2.0 5.0 + 1.1 2.2 5.5 +``` +""" +struct DropVars <: Imputor + function DropVars() + Base.depwarn( + "Impute.DropVars is deprecated in favour of the more general Impute.Filter.", + :DropVars + ) + return new() + end +end + +function impute!(data::Vector{<:NamedTuple}, imp::DropVars) + return materializer(data)(impute(Tables.columns(data), imp)) +end + +function impute(data::AbstractMatrix{Union{T, Missing}}, imp::DropVars; dims=1) where T + return filtervars(data; dims=dims) do vars + all(!ismissing, vars) + end +end + +function impute(table, imp::DropVars) + istable(table) || throw(MethodError(impute, (table, imp))) + cols = Tables.columns(table) + + cnames = Iterators.filter(propertynames(cols)) do cname + all(!ismissing, getproperty(cols, cname)) + end + + selected = TableOperations.select(table, cnames...) + table = materializer(table)(selected) + return table +end + +function impute!(data::AbstractMatrix{Union{T, Missing}}, imp::Union{DropObs, DropVars}) where T + data = impute(data, imp) + return data +end + +impute!(data, imp::Union{DropObs, DropVars}) = impute(data, imp) + +@deprecate impute(data, C::Chain) C(data) false +@deprecate impute!(data, C::Chain) C(data) false diff --git a/src/filter.jl b/src/filter.jl new file mode 100644 index 0000000..ba7558b --- /dev/null +++ b/src/filter.jl @@ -0,0 +1,63 @@ +""" + Filter([f]) + +Uses a function `f` to identify values, rows, columns or slices of data that should be +removed during an `apply` call. The default function `f` will remove `missing`s, or any +rows, columns or slices containing `missing`s.
+""" +struct Filter{F<:Function} + func::F +end + +Filter() = Filter(_keep) + +_keep(x) = !ismissing(x) +_keep(x::Union{Tuple, AbstractArray, IterTools.PropertyValues}) = all(_keep, x) + +apply!(data::Vector, f::Filter) = Base.filter!(f.func, data) +function apply!(data::Vector{<:NamedTuple}, f::Filter; dims=:rows) + d = dim(data, dims) + d == 1 || throw(ArgumentError("Rowtables only support in-place filtering rowwise.")) + return Base.filter!(r -> f.func(propertyvalues(r)), data) +end + +apply(data::Vector, f::Filter) = Base.filter(f.func, data) +function apply(data::Vector{<:NamedTuple}, f::Filter; dims=:rows) + d = dim(data, dims) + if d == 1 + return Base.filter(r -> f.func(propertyvalues(r)), data) + else + return materializer(data)(apply(Tables.columns(data), f; dims=dims)) + end +end + +function apply(data::AbstractArray{Union{T, Missing}}, f::Filter; dims) where T + d = dim(data, dims) + mask = map(f.func, eachslice(data; dims=d)) + # use selectdim to reduce along dimension d using our mask, but call collect + # because we don't want to return a view + return collect(selectdim(data, d, mask)) +end + +apply(data::AbstractArray, f::Filter) = disallowmissing(data) + +function apply(table, f::Filter; dims) + istable(table) || throw(MethodError(apply, (table, f))) + + d = dim(table, dims) + filtered = if d == 1 + Iterators.filter(Tables.rows(table)) do r + f.func(propertyvalues(r)) + end + else + cols = Tables.columns(table) + + cnames = Iterators.filter(propertynames(cols)) do cname + f.func(getproperty(cols, cname)) + end + + TableOperations.select(table, cnames...) + end + + return materializer(table)(filtered) +end diff --git a/src/functional.jl b/src/functional.jl new file mode 100644 index 0000000..fc12bd2 --- /dev/null +++ b/src/functional.jl @@ -0,0 +1,568 @@ +# Generate a functional interface from the Assertion and Imputor types. +""" + _splitkwargs(::Type{T}, kwargs...) 
where T -> (imp, rem) + +Takes a type with kwargs and returns the constructed type and the +unused kwargs which should be passed to the `impute!` call. + +NOTE: This is only intended to be used internally +""" +function _splitkwargs(::Type{T}, kwargs...) where T + rem = Dict(kwargs...) + kwdef = empty(rem) + + for f in fieldnames(T) + if haskey(rem, f) + kwdef[f] = pop!(rem, f) + end + end + + return (T(; kwdef...), rem) +end + +# Specialcase kwargs constructor for substitute. +# TODO: Add an imputor method that types should overwrite when necessary or have it fallback to `fieldnames` +function _splitkwargs(::Type{Substitute}, kwargs...) + rem = Dict(kwargs...) + kwdef = empty(rem) + + for f in (:statistic, :robust, :weights) + if haskey(rem, f) + kwdef[f] = pop!(rem, f) + end + end + + return (Substitute(; kwdef...), rem) +end + +const global assertion_methods = ( + threshold = Threshold, +) + +const global imputation_methods = ( + dropobs = DropObs, + dropvars = DropVars, + interp = Interpolate, + interpolate = Interpolate, + fill = Fill, + locf = LOCF, + nocb = NOCB, + replace = Replace, + srs = SRS, + standardize = Standardize, + substitute = Substitute, + svd = SVD, + knn = KNN, +) + +for (func, type) in pairs(assertion_methods) + typename = nameof(type) + @eval begin + function $func(data; kwargs...) + a, rem = _splitkwargs($typename, kwargs...) + return assert(data, a; rem...) + end + end +end + +for (func, type) in pairs(imputation_methods) + typename = nameof(type) + func! = Symbol(func, :!) + + @eval begin + function $func(data; kwargs...) + imp, rem = _splitkwargs($typename, kwargs...) + return impute(data, imp; rem...) + end + function $func!(data; kwargs...) + imp, rem = _splitkwargs($typename, kwargs...) + return impute!(data, imp; rem...) + end + @deprecate $func(; kwargs...) data -> $func(data; kwargs...) false + @deprecate $func!(; kwargs...) data -> $func!(data; kwargs...) false + end +end + +# Provide a specific functional API for Impute.Filter. 
+filter(data; kwargs...) = apply(data, Filter(); kwargs...) +filter!(data; kwargs...) = apply!(data, Filter(); kwargs...) +filter(f::Function, data; kwargs...) = apply(data, Filter(f); kwargs...) +filter!(f::Function, data; kwargs...) = apply!(data, Filter(f); kwargs...) + +@doc """ + Impute.threshold(data; ratio=0.1, weights=nothing, kwargs...) + +Assert that proportion of missing values in the `data` do not exceed the `ratio`. + +# Examples +```julia-repl +julia> using DataFrames, Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.threshold(df) +ERROR: ThresholdError: Ratio of missing values exceeded 0.1 (0.4) +Stacktrace: +... + +julia> Impute.threshold(df; ratio=0.8) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" +threshold + +@doc """ + Impute.dropobs(data; dims=1) + +Removes missing observations from the `AbstractArray` or `Tables.table` provided. +See [DropObs](@ref) for details. + +!!! Use `Impute.filter(data; dims=1)` instead. 
+ +# Example +```julia-repl +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropobs(df; dims=2) +3×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼─────────┼─────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 5.5 │ +``` +""" dropobs + +@doc """ + Impute.dropvars(data; dims=1) + +Finds variables with missing values in an `AbstractMatrix` or `Tables.table` and +removes them from the input data. See [DropVars](@ref) for details. + +!!! Use `Impute.filter(data; dims=2)` instead. + +# Example +```julia-repl +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropvars(df) +0×0 DataFrame +``` +""" dropvars + +@doc """ + Impute.filter([f,] data; dims) + +Filters values, rows, columns or slices of data that should be removed. +The default function `f` will remove `missing`s, or any rows, columns or slices +containing `missing`s. + +# Examples +```jldoctest +julia> using DataFrames; using Impute: Impute + + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64?
│ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.filter(df; dims=:cols) +0×0 DataFrame + +julia> Impute.filter(df; dims=:rows) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼─────────┼─────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 5.5 │ +``` +""" filter + +@doc """ + Impute.interp(data; dims=1) + +Performs linear interpolation between the nearest values in a vector. +See [Interpolate](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.interp(df) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 3.0 │ 3.3 │ +│ 4 │ 4.0 │ 4.4 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" interp + +@doc """ + Impute.fill(data; value=mean, dims=1) + +Fills in the missing data with a specific value. See [Fill](@ref) for details. + +!!! Use `Impute.replace` for constants or `Impute.substitute` for median/mode substitution. + +# Example +```julia-repl +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.fill(df; value=-1.0) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64?
│ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ -1.0 │ 3.3 │ +│ 4 │ -1.0 │ -1.0 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" fill + +@doc """ + Impute.locf(data; dims=1) + +Iterates forwards through the `data` and fills missing data with the last existing +observation. See [LOCF](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.locf(df) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 2.0 │ 3.3 │ +│ 4 │ 2.0 │ 3.3 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" locf + +@doc """ + Impute.nocb(data; dims=1) + +Iterates backwards through the `data` and fills missing data with the next existing +observation. See [NOCB](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.nocb(df) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 3.3 │ +│ 4 │ 5.0 │ 5.5 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" nocb + +@doc """ + Impute.srs(data; rng=Random.GLOBAL_RNG) + +Simple Random Sampling (SRS) imputation is a method for imputing both continuous and +categorical variables. Furthermore, it completes imputation while preserving the +distributional properties of the variables (e.g., mean, standard deviation).
+ +# Example +```julia-repl +julia> using DataFrames; using Random; using Impute: Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.srs(df; rng=MersenneTwister(1234)) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64? │ Float64? │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 1.0 │ 3.3 │ +│ 4 │ 2.0 │ 3.3 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" srs + +@doc """ + Impute.standardize(data; values) + +Standardize (or replace) various missing data representations with `missing`. + +# Keyword Arguments +* `values::Tuple`: A tuple of values that should be considered `missing` + +# Example +```jldoctest +julia> using DataFrames, Impute + +julia> df = DataFrame( + :a => [1.1, 2.2, NaN, NaN, 5.5], + :b => [1, 2, 3, -9999, 5], + :c => ["v", "w", "x", "y", "NULL"], + ) +5×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Float64 │ Int64 │ String │ +├─────┼─────────┼───────┼────────┤ +│ 1 │ 1.1 │ 1 │ v │ +│ 2 │ 2.2 │ 2 │ w │ +│ 3 │ NaN │ 3 │ x │ +│ 4 │ NaN │ -9999 │ y │ +│ 5 │ 5.5 │ 5 │ NULL │ + +julia> Impute.standardize(df; values=(NaN, -9999, "NULL")) +5×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Float64? │ Int64? │ String? │ +├─────┼──────────┼─────────┼─────────┤ +│ 1 │ 1.1 │ 1 │ v │ +│ 2 │ 2.2 │ 2 │ w │ +│ 3 │ missing │ 3 │ x │ +│ 4 │ missing │ missing │ y │ +│ 5 │ 5.5 │ 5 │ missing │ +``` +""" standardize + +@doc """ + Impute.substitute(data; statistic=nothing) + Impute.substitute(data; robust=true, weights=nothing) + +Substitute missing values with a summary statistic over the non-missing values. + +# Keyword Arguments +* `statistic`: A summary statistic function to be applied to the non-missing values. + This function should return a value of the same type as the input data `eltype`.
+ If this function isn't passed in then the `defaultstats` function is used to make a + best guess. +* `robust`: Whether to use `median` or `mean` for continuous datasets in `defaultstats` +* `weights`: A set of statistical weights to apply to the `mean` or `median` in `defaultstats`. + +See [Substitute](@ref) for details on substitution rules defined in `defaultstats`. + +# Example +```jldoctest +julia> using DataFrames, Impute + +julia> df = DataFrame( + :a => [8.9, 2.2, missing, missing, 1.3, 6.2, 3.7, 4.8], + :b => [2, 6, 3, missing, 7, 1, 9, missing], + :c => [true, false, true, true, false, missing, false, true], + ) +8×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Float64? │ Int64? │ Bool? │ +├─────┼──────────┼─────────┼─────────┤ +│ 1 │ 8.9 │ 2 │ 1 │ +│ 2 │ 2.2 │ 6 │ 0 │ +│ 3 │ missing │ 3 │ 1 │ +│ 4 │ missing │ missing │ 1 │ +│ 5 │ 1.3 │ 7 │ 0 │ +│ 6 │ 6.2 │ 1 │ missing │ +│ 7 │ 3.7 │ 9 │ 0 │ +│ 8 │ 4.8 │ missing │ 1 │ + +julia> Impute.substitute(df) +8×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Float64? │ Int64? │ Bool? │ +├─────┼──────────┼────────┼───────┤ +│ 1 │ 8.9 │ 2 │ 1 │ +│ 2 │ 2.2 │ 6 │ 0 │ +│ 3 │ 4.25 │ 3 │ 1 │ +│ 4 │ 4.25 │ 4 │ 1 │ +│ 5 │ 1.3 │ 7 │ 0 │ +│ 6 │ 6.2 │ 1 │ 1 │ +│ 7 │ 3.7 │ 9 │ 0 │ +│ 8 │ 4.8 │ 4 │ 1 │ +``` +""" substitute + +@doc """ + Impute.knn(data; k=1, threshold=0.5, dist=Euclidean()) + +Imputation using k-Nearest Neighbor algorithm. + +# Keyword Arguments +* `k::Int`: number of nearest neighbors +* `dist::MinkowskiMetric`: distance metric supported by `NearestNeighbors.jl` (Euclidean, Chebyshev, Minkowski and Cityblock) +* `threshold::AbstractFloat`: threshold for missing neighbors + +# Reference +* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
+ +# Example +```jldoctest +julia> using Impute, Missings + +julia> data = allowmissing(reshape(sin.(1:20), 5, 4)); data[[2, 3, 7, 9, 13, 19]] .= missing; data +5×4 Array{Union{Missing, Float64},2}: + 0.841471 -0.279415 -0.99999 -0.287903 + missing missing -0.536573 -0.961397 + missing 0.989358 missing -0.750987 + -0.756802 missing 0.990607 missing + -0.958924 -0.544021 0.650288 0.912945 + +julia> result = Impute.knn(data; dims=:cols) +5×4 Array{Union{Missing, Float64},2}: + 0.841471 -0.279415 -0.99999 -0.287903 + -0.756802 -0.279415 -0.536573 -0.961397 + -0.756802 0.989358 0.0568575 -0.750987 + -0.756802 -0.279415 0.990607 -0.519445 + -0.958924 -0.544021 0.650288 0.912945 +``` +""" knn + +@doc """ + Impute.svd(data; kwargs...) + +Imputes the missing values in a matrix using an expectation maximization (EM) algorithm +over low-rank SVD approximations. + +# Keyword Arguments +* `init::Imputor`: initialization method for missing values (default: Substitute()) +* `rank::Union{Int, Nothing}`: rank of the SVD approximation (default: nothing meaning start at 0 and increase) +* `tol::Float64`: convergence tolerance (default: 1e-10) +* `maxiter::Int`: Maximum number of iterations if convergence is not achieved (default: 100) +* `limits::Union{Tuple{Float64, Float64}, Nothing}`: Bound the possible approximation values (default: nothing) +* `verbose::Bool`: Whether to display convergence progress (default: true) + +# References +* Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525.
+ +# Example +```jldoctest +julia> using Impute, Missings + +julia> data = allowmissing(reshape(sin.(1:20), 5, 4)); data[[2, 3, 7, 9, 13, 19]] .= missing; data +5×4 Array{Union{Missing, Float64},2}: + 0.841471 -0.279415 -0.99999 -0.287903 + missing missing -0.536573 -0.961397 + missing 0.989358 missing -0.750987 + -0.756802 missing 0.990607 missing + -0.958924 -0.544021 0.650288 0.912945 + +julia> result = Impute.svd(data; dims=:cols) +5×4 Array{Union{Missing, Float64},2}: + 0.841471 -0.279415 -0.99999 -0.287903 + 0.220258 0.555829 -0.536573 -0.961397 + -0.372745 0.989358 0.533193 -0.750987 + -0.756802 0.253309 0.990607 0.32315 + -0.958924 -0.544021 0.650288 0.912945 +``` +""" svd diff --git a/src/imputors.jl b/src/imputors.jl index 6beb1e0..d2432ba 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -2,123 +2,181 @@ Imputor An imputor stores information about imputing values in `AbstractArray`s and `Tables.table`s. -New imputation methods are expected to sutype `Imputor` and, at minimum, -implement the `impute!(imp::, data::AbstractVector)` method. +New imputation methods are expected to subtype `Imputor` and, at minimum, +implement the `_impute!(data::AbstractArrays, imp::)` method. + +While fallback `impute` and `impute!` methods are provided to extend your `_impute!` methods to +n-dimensional arrays and tables, you can always override these methods to change the +behaviour as necessary. """ abstract type Imputor end -# A couple utility methods to avoid messing up var and obs dimensions -obsdim(dims) = dims -vardim(dims) = dims == 1 ? 
2 : 1 - -function obswise(data::AbstractMatrix; dims=1) - return (selectdim(data, obsdim(dims), i) for i in axes(data, obsdim(dims))) -end - -function varwise(data::AbstractMatrix; dims=2) - return (selectdim(data, vardim(dims), i) for i in axes(data, vardim(dims))) -end +#= +These default methods are required because @auto_hash_equals doesn't +play nice with Base.@kwdef +=# +function Base.hash(imp::T, h::UInt) where T <: Imputor + h = hash(Symbol(T), h) -function filterobs(f::Function, data::AbstractMatrix; dims=1) - mask = [f(x) for x in obswise(data; dims=dims)] - return dims == 1 ? data[mask, :] : data[:, mask] -end + for f in fieldnames(T) + h = hash(getfield(imp, f), h) + end -function filtervars(f::Function, data::AbstractMatrix; dims=2) - mask = [f(x) for x in varwise(data; dims=dims)] - return dims == 1 ? data[:, mask] : data[mask, :] + return h end -""" - splitkwargs(::Type{T}, kwargs...) where T <: Imputor -> (imp, rem) - -Takes an Imputor type with kwargs and returns the constructed imputor and the -unused kwargs which should be passed to the `impute!` call. - -NOTE: This is used by utility methods with construct and imputor and call impute in 1 call. -""" -function splitkwargs(::Type{T}, kwargs...) where T <: Imputor - rem = Dict(kwargs...) - kwdef = empty(rem) +function Base.:(==)(a::T, b::T) where T <: Imputor + result = true for f in fieldnames(T) - if haskey(rem, f) - kwdef[f] = rem[f] - delete!(rem, f) + if !isequal(getfield(a, f), getfield(b, f)) + result = false + break end end - return (T(; kwdef...), rem) -end - -# Some utility methods for constructing imputors and imputing data in 1 call. -# NOTE: This is only intended for internal use and is not part of the public API. -function _impute(data, t::Type{T}, kwargs...) where T <: Imputor - imp, rem = splitkwargs(t, kwargs...) - return impute(data, imp; rem...) -end - -function _impute!(data, t::Type{T}, kwargs...) where T <: Imputor - imp, rem = splitkwargs(t, kwargs...) 
- return impute!(data, imp; rem...) + return result end """ - impute(data, imp::Imputor; kwargs...) + impute(data::T, imp; kwargs...) -> T Returns a new copy of the `data` with the missing data imputed by the imputor `imp`. +For matrices and tables, data is imputed one variable/column at a time. +If this is not the desired behaviour then you should overload this method or specify a different `dims` value. -# Keywords -* `dims`: The dimension to impute along (e.g., observations dim) +# Arguments +* `data`: the data to be impute +* `imp::Imputor`: the Imputor method to use + +# Returns +* the input `data` with values imputed + +# Example +```jldoctest +julia> using Impute: Interpolate, impute + +julia> v = [1.0, 2.0, missing, missing, 5.0] +5-element Array{Union{Missing, Float64},1}: + 1.0 + 2.0 + missing + missing + 5.0 + +julia> impute(v, Interpolate()) +5-element Array{Union{Missing, Float64},1}: + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 +``` """ function impute(data, imp::Imputor; kwargs...) - # Call `deepcopy` because we can trust that it's available for all types. - return impute!(deepcopy(data), imp; kwargs...) + # NOTE: We don't use a return type declaration here because `trycopy` isn't guaranteed + # to return the same type passed in. For example, subarrays and subdataframes will + # return a regular array or dataframe. + return impute!(trycopy(data), imp; kwargs...) end -# Generic fallback for methods that have only defined _impute(v, imp; kwargs...) -impute!(data::AbstractVector, imp::Imputor; kwargs...) = _impute!(data, imp; kwargs...) - """ - impute!(data::AbstractMatrix, imp::Imputor; kwargs...) + impute!(data::A, imp; dims=:, kwargs...) -> A -Impute the data in a matrix by imputing the values one variable at a time; -if this is not the desired behaviour custom imputor methods should overload this method. +Impute the `missing` values in the array `data` using the imputor `imp`. +Optionally, you can specify the dimension to impute along. 
# Arguments -* `data::AbstractMatrix`: the data to impute +* `data::AbstractArray{Union{T, Missing}}`: the data to be impute along dimensions `dims` * `imp::Imputor`: the Imputor method to use -# Keywords -* `dims`: The dimension to impute along (e.g., observations dim) +# Keyword Arguments +* `dims=:`: The dimension to impute along. `:rows` and `:cols` are also supported for matrices. # Returns -* `AbstractMatrix`: the input `data` with values imputed +* `AbstractArray{Union{T, Missing}}`: the input `data` with values imputed + +# NOTES +1. Matrices have a deprecated `dims=2` special case as `dims=:` is a breaking change +2. Mutation isn't guaranteed for all array types, hence we return the result +3. `eachslice` is used internally which requires Julia 1.1 # Example ```jldoctest -julia> using Impute: Interpolate, Context, impute +julia> using Impute: Interpolate, impute! julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(M, Interpolate(; context=Context(; limit=1.0)); dims=2) +julia> impute!(M, Interpolate(); dims=1) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 + +julia> M 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 3.0 4.0 5.0 1.1 2.2 3.3 4.4 5.5 ``` """ -function impute!(data::AbstractMatrix, imp::Imputor; dims=1) - for var in varwise(data; dims=dims) - impute!(var, imp) +function impute!( + data::A, imp::Imputor; dims=:, kwargs... +)::A where A <: AbstractArray{Union{T, Missing}} where T + dims === Colon() && return _impute!(data, imp; kwargs...) + + for x in eachslice(data; dims=dims) + _impute!(x, imp; kwargs...) end + + return data +end + + +function impute!( + data::M, imp::Imputor; dims=nothing, kwargs... +)::M where M <: AbstractMatrix{Union{T, Missing}} where T + dims === Colon() && return _impute!(data, imp; kwargs...) 
+ # We're calling our `dim` function to throw a depwarn if `dims === nothing` + d = dim(data, dims) + + for x in eachslice(data; dims=d) + _impute!(x, imp; kwargs...) + end + return data end +impute!(data::AbstractMatrix{Missing}, imp::Imputor; kwargs...) = data + +""" + impute!(data::T, imp; kwargs...) -> T where T <: AbstractVector{<:NamedTuple} + +Special case rowtables which are arrays, but we want to fallback to the tables method. +""" +function impute!(data::T, imp::Imputor)::T where T <: AbstractVector{<:NamedTuple} + return materializer(data)(impute!(Tables.columns(data), imp)) +end + +""" + impute!(data::AbstractArray, imp) -> data + + +Just returns the `data` when the array doesn't contain `missing`s +""" +impute!(data::AbstractArray, imp::Imputor; kwargs...) = disallowmissing(data) + +""" + impute!(data::AbstractArray{Missing}, imp) -> data + +Just return the `data` when the array only contains `missing`s +""" +impute!(data::AbstractArray{Missing}, imp::Imputor; kwargs...) = data + + """ - impute!(table, imp::Imputor) + impute!(table, imp; cols=nothing) -> table Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. @@ -127,16 +185,20 @@ if this is not the desired behaviour custom imputor methods should overload this * `imp::Imputor`: the Imputor method to use * `table`: the data to impute +# Keyword Arguments +* `cols`: The columns to impute along (default is to impute all columns) + # Returns * the input `data` with values imputed # Example -``jldoctest -julia> using DataFrames; using Impute: Interpolate, Context, impute +```jldoctest +julia> using DataFrames; using Impute: Interpolate, impute + julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) 5×2 DataFrame │ Row │ a │ b │ -│ │ Float64 │ Float64 │ +│ │ Float64? │ Float64? 
│ ├─────┼──────────┼──────────┤ │ 1 │ 1.0 │ 1.1 │ │ 2 │ 2.0 │ 2.2 │ @@ -144,38 +206,46 @@ julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, │ 4 │ missing │ missing │ │ 5 │ 5.0 │ 5.5 │ -julia> impute(df, Interpolate(; context=Context(; limit=1.0))) +julia> impute(df, Interpolate()) 5×2 DataFrame │ Row │ a │ b │ -│ │ Float64 │ Float64 │ +│ │ Float64? │ Float64? │ ├─────┼──────────┼──────────┤ │ 1 │ 1.0 │ 1.1 │ │ 2 │ 2.0 │ 2.2 │ │ 3 │ 3.0 │ 3.3 │ │ 4 │ 4.0 │ 4.4 │ │ 5 │ 5.0 │ 5.5 │ +``` """ -function impute!(table, imp::Imputor) +function impute!(table::T, imp::Imputor; cols=nothing)::T where T + # TODO: We could probably handle iterators of tables here istable(table) || throw(MethodError(impute!, (table, imp))) # Extract a columns iterator that we should be able to use to mutate the data. # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data columntable = Tables.columns(table) - for cname in propertynames(columntable) + cnames = cols === nothing ? 
propertynames(columntable) : cols + for cname in cnames impute!(getproperty(columntable, cname), imp) end return table end -# Special case row tables -# NOTE: This may introduce ambiguities for specific imputors that have defined a -# `impute!(data, imp)`` method -function impute!(data::Vector{<:NamedTuple}, imp::Imputor) - return materializer(data)(impute!(Tables.columns(data), imp)) -end - -for file in ("drop.jl", "locf.jl", "nocb.jl", "interp.jl", "fill.jl", "chain.jl", "srs.jl", "svd.jl", "knn.jl") +files = [ + "interp.jl", + "knn.jl", + "locf.jl", + "nocb.jl", + "replace.jl", + "srs.jl", + "standardize.jl", + "substitute.jl", + "svd.jl", +] + +for file in files include(joinpath("imputors", file)) end diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl deleted file mode 100644 index be9e938..0000000 --- a/src/imputors/chain.jl +++ /dev/null @@ -1,59 +0,0 @@ -""" - Chain <: Imputor - -Runs multiple `Imputor`s on the same data in the order they're provided. - -# Fields -* `imputors::Array{Imputor}` -""" -struct Chain <: Imputor - imputors::Vector{Imputor} -end - -""" - Chain(imputors::Imputor...) -> Chain - -Creates a Chain using the `Imputor`s provided (ordering matters). -""" -Chain(imputors::Imputor...) 
= Chain(collect(imputors)) - -""" -Compose new `Imputor` chains with the composition operator - -# Example - -```jldoctest -julia> using Impute: impute, Interpolate, NOCB, LOCF, Context - -julia> ctx = Context(; limit=1.0) -Context(0, 0, 1.0, ismissing, Impute.complete) - -julia> imp = Interpolate(; context=ctx) ∘ NOCB(; context=ctx) ∘ LOCF(; context=ctx) -Impute.Chain(Impute.Imputor[Interpolate(2, Context(0, 0, 1.0, ismissing, complete)), NOCB(2, Context(0, 0, 1.0, ismissing, complete)), LOCF(2, Context(0, 0, 1.0, ismissing, complete))]) -``` -""" -Base.:(∘)(a::Imputor, b::Imputor) = Chain([a, b]) -function Base.:(∘)(a::Chain, b::Imputor) - push!(a.imputors, b) - return a -end - -""" - impute!(data, imp::Chain) - -Runs the `Imputor`s on the supplied data. - -# Arguments -* `imp::Chain`: the chain to run -* `data`: our data to impute - -# Returns -* our imputed data -""" -function impute!(data, imp::Chain) - for imputor in imp.imputors - data = impute!(data, imputor) - end - - return data -end diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl deleted file mode 100644 index 999ee6b..0000000 --- a/src/imputors/drop.jl +++ /dev/null @@ -1,131 +0,0 @@ -""" - DropObs(; context=Context) - -Removes missing observations from the `AbstractArray` or `Tables.table` provided. - -# Keyword Arguments -* `context::AbstractContext=Context()`: A context which keeps track of missing data - summary information - -# Example -```jldoctest -julia> using Impute: DropObs, Context, impute - -julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] -2×5 Array{Union{Missing, Float64},2}: - 1.0 2.0 missing missing 5.0 - 1.1 2.2 3.3 missing 5.5 - -julia> impute(M, DropObs(; context=Context(; limit=1.0)); dims=2) -2×3 Array{Union{Missing, Float64},2}: - 1.0 2.0 5.0 - 1.1 2.2 5.5 -``` -""" -struct DropObs <: Imputor - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -DropObs(; context=Context()) = DropObs(context) - -# Special case impute! 
for vectors because we know filter! will work -function impute!(data::Vector, imp::DropObs) - imp.context(c -> filter!(x -> !ismissing!(c, x), data)) -end - -function impute!(data::Vector{<:NamedTuple}, imp::DropObs) - return materializer(data)(impute(Tables.columns(data), imp)) -end - -function impute(data::AbstractVector, imp::DropObs) - imp.context(c -> filter(x -> !ismissing!(c, x), data)) -end - -function impute(data::AbstractMatrix, imp::DropObs; dims=1) - imp.context() do c - return filterobs(data; dims=dims) do obs - !ismissing!(c, obs) - end - end -end - -function impute(table, imp::DropObs) - imp.context() do c - @assert istable(table) - rows = Tables.rows(table) - - # Unfortunately, we'll need to construct a new table - # since Tables.rows is just an iterator - filtered = Iterators.filter(rows) do r - !any(x -> ismissing!(c, x), propertyvalues(r)) - end - - table = materializer(table)(filtered) - return table - end -end - - -""" - DropVars(; context=Context()) - - -Finds variables with too many missing values in a `AbstractMatrix` or `Tables.table` and -removes them from the input data. 
- -# Keyword Arguments -* `context::AbstractContext`: A context which keeps track of missing data - summary information - -# Examples -```jldoctest -julia> using Impute: DropVars, Context, impute - -julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] -2×5 Array{Union{Missing, Float64},2}: - 1.0 2.0 missing missing 5.0 - 1.1 2.2 3.3 missing 5.5 - -julia> impute(M, DropVars(; context=Context(; limit=0.2)); dims=2) -1×5 Array{Union{Missing, Float64},2}: - 1.1 2.2 3.3 missing 5.5 -``` -""" -struct DropVars <: Imputor - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -DropVars(; context=Context()) = DropVars(context) - -function impute!(data::Vector{<:NamedTuple}, imp::DropVars) - return materializer(data)(impute(Tables.columns(data), imp)) -end - -function impute(data::AbstractMatrix, imp::DropVars; dims=1) - imp.context() do c - return filtervars(data; dims=dims) do vars - !ismissing!(c, vars) - end - end -end - -function impute(table, imp::DropVars) - istable(table) || throw(MethodError(impute!, (table, imp))) - cols = Tables.columns(table) - - imp.context() do c - cnames = Iterators.filter(propertynames(cols)) do cname - !ismissing!(c, getproperty(cols, cname)) - end - - selected = TableOperations.select(table, cnames...) - table = materializer(table)(selected) - return table - end -end - -# Add impute! methods to override the default behaviour in imputors.jl -impute!(data::AbstractMatrix, imp::Union{DropObs, DropVars}) = impute(data, imp) -impute!(data, imp::Union{DropObs, DropVars}) = impute(data, imp) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl deleted file mode 100644 index e41dac0..0000000 --- a/src/imputors/fill.jl +++ /dev/null @@ -1,60 +0,0 @@ -""" - Fill(; value=mean, context=Context()) - -Fills in the missing data with a specific value. -The current implementation is univariate, so each variable in a table or matrix will -be handled independently. 
- -# Keyword Arguments -* `value::Any`: A scalar or a function that returns a scalar if - passed the data with missing data removed (e.g, `mean`) -* `context::AbstractContext`: A context which keeps track of missing data - summary information - -# Example -```jldoctest -julia> using Impute: Fill, Context, impute - -julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] -2×5 Array{Union{Missing, Float64},2}: - 1.0 2.0 missing missing 5.0 - 1.1 2.2 3.3 missing 5.5 - -julia> impute(M, Fill(; context=Context(; limit=1.0)); dims=2) -2×5 Array{Union{Missing, Float64},2}: - 1.0 2.0 2.66667 2.66667 5.0 - 1.1 2.2 3.3 3.025 5.5 -``` -""" -struct Fill{T} <: Imputor - value::T - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -Fill(; value=mean, context=Context()) = Fill(value, context) - -function _impute!(data::AbstractVector, imp::Fill) - imp.context() do c - fill_val = if isa(imp.value, Function) - available = Impute.drop(data; context=c) - - if isempty(available) - @debug "Cannot apply fill function $(imp.value) as all values are missing" - return data - else - imp.value(available) - end - else - imp.value - end - - for i in eachindex(data) - if ismissing!(c, data[i]) - data[i] = fill_val - end - end - - return data - end -end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index 9cc7b7b..720a521 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -1,81 +1,60 @@ """ - Interpolate(; context=Context()) + Interpolate() Performs linear interpolation between the nearest values in an vector. The current implementation is univariate, so each variable in a table or matrix will be handled independently. -WARNING: Missing values at the head or tail of the array cannot be interpolated if there +!!! Missing values at the head or tail of the array cannot be interpolated if there are no existing values on both sides. As a result, this method does not guarantee that all missing values will be imputed. 
-# Keyword Arguments -* `context::AbstractContext`: A context which keeps track of missing data - summary information - # Example ```jldoctest -julia> using Impute: Interpolate, Context, impute +julia> using Impute: Interpolate, impute julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(M, Interpolate(; context=Context(; limit=1.0)); dims=2) +julia> impute(M, Interpolate(); dims=:rows) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 3.0 4.0 5.0 1.1 2.2 3.3 4.4 5.5 ``` """ -struct Interpolate <: Imputor - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -Interpolate(; context=Context()) = Interpolate(context) - -function _impute!(data::AbstractVector{Missing}, imp::Interpolate) - @debug "Cannot interpolate points when all values are missing" - - # NOTE: We still do this so we can throw an ImputeError if the context has a limit set. - imp.context() do c - findfirst(c, data) - return data - end -end +struct Interpolate <: Imputor end function _impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T - imp.context() do c - i = findfirst(c, data) + 1 - i < lastindex(data) || @debug "Cannot interpolate points when all values are missing" - - while i < lastindex(data) - if ismissing!(c, data[i]) - prev_idx = i - 1 - next_idx = findnext(c, data, i + 1) + @assert !all(ismissing, data) + i = findfirst(!ismissing, data) + 1 - if next_idx !== nothing - gap_sz = (next_idx - prev_idx) - 1 + while i < lastindex(data) + if ismissing(data[i]) + prev_idx = i - 1 + next_idx = findnext(!ismissing, data, i + 1) - diff = data[next_idx] - data[prev_idx] - incr = diff / T(gap_sz + 1) - val = data[prev_idx] + incr + if next_idx !== nothing + gap_sz = (next_idx - prev_idx) - 1 - # Iteratively fill in the values - for j in i:(next_idx - 1) - data[j] = val - val += incr - end + diff = data[next_idx] - data[prev_idx] + incr = diff / 
T(gap_sz + 1) + val = data[prev_idx] + incr - i = next_idx - else - break + # Iteratively fill in the values + for j in i:(next_idx - 1) + data[j] = val + val += incr end + + i = next_idx + else + break end - i += 1 end - - return data + i += 1 end + + return data end diff --git a/src/imputors/knn.jl b/src/imputors/knn.jl index 85b1015..6392dfb 100644 --- a/src/imputors/knn.jl +++ b/src/imputors/knn.jl @@ -1,5 +1,5 @@ """ - KNN <: Imputor + KNN(; kwargs...) Imputation using k-Nearest Neighbor algorithm. @@ -7,69 +7,79 @@ Imputation using k-Nearest Neighbor algorithm. * `k::Int`: number of nearest neighbors * `dist::MinkowskiMetric`: distance metric suppports by `NearestNeighbors.jl` (Euclidean, Chebyshev, Minkowski and Cityblock) * `threshold::AbsstractFloat`: thershold for missing neighbors -* `on_complete::Function`: a function to run when imputation is complete # Reference * Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525. 
""" -# TODO : Support Categorical Distance (NearestNeighbors.jl support needed) struct KNN{M} <: Imputor where M <: NearestNeighbors.MinkowskiMetric k::Int threshold::AbstractFloat dist::M - context::AbstractContext end -function KNN(; k=1, threshold=0.5, dist=Euclidean(), context=Context()) +# TODO : Support Categorical Distance (NearestNeighbors.jl support needed) +function KNN(; k=1, threshold=0.5, dist=Euclidean()) k < 1 && throw(ArgumentError("The number of nearset neighbors should be greater than 0")) !(0 < threshold < 1) && throw(ArgumentError("Missing neighbors threshold should be within 0 to 1")) # to exclude missing value itself - KNN(k + 1, threshold, dist, context) + KNN(k + 1, threshold, dist) end -function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::KNN) where T<:Real - imp.context() do ctx - # Get mask array first (order of ) - mmask = ismissing.(transpose(data)) - - # fill missing value as mean value - impute!(data, Fill(; value=mean, context=ctx)) - - # then, transpose to D x N for KDTree - transposed = transpose(disallowmissing(data)) - - kdtree = KDTree(transposed, imp.dist) - idxs, dists = NearestNeighbors.knn(kdtree, transposed, imp.k, true) - - idxes = CartesianIndices(transposed) - fallback_threshold = imp.k * imp.threshold - - for I in CartesianIndices(transposed) - if mmask[I] == 1 - w = 1.0 ./ dists[I[2]] - ws = sum(w[2:end]) - missing_neighbors = ismissing.(transposed[:, idxs[I[2]]][:, 2:end]) - - # exclude missing value itself because distance would be zero - if isnan(ws) || isinf(ws) || iszero(ws) - # if distance is zero or not a number, keep mean imputation - transposed[I] = transposed[I] - elseif count(!iszero, mapslices(sum, missing_neighbors, dims=1)) > - fallback_threshold - # If too many neighbors are also missing, fallback to mean imputation - # get column and check how many neighbors are also missing - transposed[I] = transposed[I] - else - # Inverse distance weighting - wt = w .* transposed[I[1], idxs[I[2]]] - 
transposed[I] = sum(wt[2:end]) / ws - end +function impute!(data::AbstractMatrix{Union{T, Missing}}, imp::KNN; dims=nothing) where T<:Real + d = dim(data, dims) + + # KDTree expects data of the form dims x n + X = d == 1 ? data : transpose(data) + + # Get mask array first + mmask = ismissing.(X) + + # fill missing value as mean value + impute!(X, Substitute(); dims=1) + + # Disallow `missings` for NearestNeighbors + X = disallowmissing(X) + + kdtree = KDTree(X, imp.dist) + idxs, dists = NearestNeighbors.knn(kdtree, X, imp.k, true) + + idxes = CartesianIndices(X) + fallback_threshold = imp.k * imp.threshold + + for I in CartesianIndices(X) + if mmask[I] == 1 + w = 1.0 ./ dists[I[2]] + ws = sum(w[2:end]) + # Shouldn't ismissing.(X[...][...]) be replaced with mmask[...][...]? + # If so then I think the test might need updating cause the "Data match" section + # seems to fallback on the mean imputation consistently + neighbors = mapslices( + iszero ∘ sum, + ismissing.(X[:, idxs[I[2]]][:, 2:end]); + dims=1 + ) + + # exclude missing value itself because distance would be zero + # If too many neighbors are also missing, fallback to mean imputation + # get column and check how many neighbors are also missing + if isfinite(ws) && !iszero(ws) && count(neighbors) > fallback_threshold + # Inverse distance weighting + wt = w .* X[I[1], idxs[I[2]]] + X[I] = sum(wt[2:end]) / ws end end - - # for type stability - allowmissing(transposed') end + + # for type stability + return allowmissing(d == 1 ? X : X') end + +impute!(data::AbstractMatrix{Missing}, imp::KNN; kwargs...) = data + +function impute(data::AbstractMatrix{Union{T, Missing}}, imp::KNN; kwargs...) where T<:Real + return impute!(trycopy(data), imp; kwargs...) +end + +impute(data::AbstractMatrix{Missing}, imp::KNN; kwargs...) 
= trycopy(data) diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index 3b541f1..05a0e45 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,5 +1,5 @@ """ - LOCF(; context=Context()) + LOCF()) Last observation carried forward (LOCF) iterates forwards through the `data` and fills missing data with the last existing observation. The current implementation is univariate, @@ -8,51 +8,36 @@ so each variable in a table or matrix will be handled independently. See also: - [NOCB](@ref): Next Observation Carried Backward -WARNING: missing elements at the head of the array may not be imputed if there is no +!!! Missing elements at the head of the array may not be imputed if there is no existing observation to carry forward. As a result, this method does not guarantee that all missing values will be imputed. -# Keyword Arguments -* `context::AbstractContext`: A context which keeps track of missing data - summary information - # Example ```jldoctest -julia> using Impute: LOCF, Context, impute +julia> using Impute: LOCF, impute julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(M, LOCF(; context=Context(; limit=1.0)); dims=2) +julia> impute(M, LOCF(); dims=:rows) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 2.0 2.0 5.0 1.1 2.2 3.3 3.3 5.5 ``` """ -struct LOCF <: Imputor - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -LOCF(; context=Context()) = LOCF(context) +struct LOCF <: Imputor end -function _impute!(data::AbstractVector, imp::LOCF) - imp.context() do c - start_idx = findfirst(c, data) - if start_idx === nothing - @debug "Cannot carry forward points when all values are missing" - return data - end +function _impute!(data::AbstractVector{Union{T, Missing}}, imp::LOCF) where T + @assert !all(ismissing, data) + start_idx = findfirst(!ismissing, data) + 1 - start_idx += 1 - for i in 
start_idx:lastindex(data) - if ismissing!(c, data[i]) - data[i] = data[i-1] - end + for i in start_idx:lastindex(data) + if ismissing(data[i]) + data[i] = data[i-1] end - - return data end + + return data end diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 0c59107..132a33c 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -1,5 +1,5 @@ """ - NOCB(; context=Context()) + NOCB() Next observation carried backward (NOCB) iterates backwards through the `data` and fills missing data with the next existing observation. @@ -7,51 +7,38 @@ missing data with the next existing observation. See also: - [LOCF](@ref): Last Observation Carried Forward -WARNING: missing elements at the tail of the array may not be imputed if there is no +!!! Missing elements at the tail of the array may not be imputed if there is no existing observation to carry backward. As a result, this method does not guarantee that all missing values will be imputed. # Keyword Arguments -* `context::AbstractContext`: A context which keeps track of missing data - summary information # Example ```jldoctest -julia> using Impute: NOCB, Context, impute +julia> using Impute: NOCB, impute julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(M, NOCB(; context=Context(; limit=1.0)); dims=2) +julia> impute(M, NOCB(); dims=:rows) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 5.0 5.0 5.0 1.1 2.2 3.3 5.5 5.5 ``` """ -struct NOCB <: Imputor - context::AbstractContext -end - -# TODO: Switch to using Base.@kwdef on 1.1 -NOCB(; context=Context()) = NOCB(context) +struct NOCB <: Imputor end -function _impute!(data::AbstractVector, imp::NOCB) - imp.context() do c - end_idx = findlast(c, data) - if end_idx === nothing - @debug "Cannot carry backward points when all values are missing" - return data - end +function _impute!(data::AbstractVector{Union{T, Missing}}, imp::NOCB) where T + 
@assert !all(ismissing, data) + end_idx = findlast(!ismissing, data) - 1 - end_idx -= 1 - for i in end_idx:-1:firstindex(data) - if ismissing!(c, data[i]) - data[i] = data[i+1] - end + for i in end_idx:-1:firstindex(data) + if ismissing(data[i]) + data[i] = data[i+1] end - - return data end + + return data end diff --git a/src/imputors/replace.jl b/src/imputors/replace.jl new file mode 100644 index 0000000..816f24f --- /dev/null +++ b/src/imputors/replace.jl @@ -0,0 +1,37 @@ +""" + Replace(; value) + +Replace `missing`s with one of the specified constant values, depending on the input type. +If multiple values of the same type are provided then the first one will be used. +If the input data is of a different type then the no replacement will be performed. + +# Keyword Arguments +* `values::Tuple`: A scalar or tuple of different values that should be used to replace + missings. Typically, one value per type you're considering imputing for. + +# Example +```jldoctest +julia> using Impute: Replace, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Replace(; values=0.0); dims=2) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 0.0 0.0 5.0 + 1.1 2.2 3.3 0.0 5.5 +``` +""" +struct Replace <: Imputor + values::Tuple +end + +Replace(; values) = isa(values, Tuple) ? 
Replace(values) : Replace(tuple(values)) + +function _impute!(data::AbstractArray{Union{T, Missing}}, imp::Replace) where T + i = findfirst(x -> isa(x, T), imp.values) + i === nothing && return data + return Base.replace!(data, missing => imp.values[i]) +end diff --git a/src/imputors/srs.jl b/src/imputors/srs.jl index 0b87ff4..bd69e9f 100644 --- a/src/imputors/srs.jl +++ b/src/imputors/srs.jl @@ -1,11 +1,12 @@ -struct SRS <: Imputor - rng::AbstractRNG - context::AbstractContext +struct SRS{R<:AbstractRNG} <: Imputor + rng::R end +# Docstring below uses julia-repl cause the rng may give different result on different +# versions of julia. """ - SRS(; rng=Random.GLOBAL_RNG, context=Context()) + SRS(; rng=Random.GLOBAL_RNG) Simple Random Sampling (SRS) imputation is a method for imputing both continuous and categorical variables. Furthermore, it completes imputation while preserving the distributional @@ -20,37 +21,33 @@ for both categorical and continuous data. # Keyword Arguments * `rng::AbstractRNG`: A random number generator to use for observation selection -* `context::AbstractContext`: A context which keeps track of missing data - summary information # Example -```jldoctest -julia> using Random; using Impute: SRS, Context, impute +```julia-repl +julia> using Random; using Impute: SRS, impute julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(M, SRS(; rng=MersenneTwister(1234), context=Context(; limit=1.0)); dims=2) +julia> impute(M, SRS(; rng=MersenneTwister(1234)); dims=:rows) 2×5 Array{Union{Missing, Float64},2}: - 1.0 2.0 1.0 5.0 5.0 + 1.0 2.0 1.0 2.0 5.0 1.1 2.2 3.3 3.3 5.5 ``` """ -SRS(; rng=Random.GLOBAL_RNG, context=Context()) = SRS(rng, context) - -function impute!(data::AbstractVector, imp::SRS) - imp.context() do c - obs_values = Impute.dropobs(data; context=imp.context) - if !isempty(obs_values) - for i in eachindex(data) - if 
ismissing!(c, data[i]) - data[i] = rand(imp.rng, obs_values) - end +SRS(; rng=Random.GLOBAL_RNG) = SRS(rng) + +function _impute!(data::AbstractArray{Union{T, Missing}}, imp::SRS) where T + obs_values = collect(skipmissing(data)) + if !isempty(obs_values) + for i in eachindex(data) + if ismissing(data[i]) + data[i] = rand(imp.rng, obs_values) end end - - return data end + + return data end diff --git a/src/imputors/standardize.jl b/src/imputors/standardize.jl new file mode 100644 index 0000000..b573100 --- /dev/null +++ b/src/imputors/standardize.jl @@ -0,0 +1,80 @@ +""" + Standardize(; values) + +Standardize (or replace) various missing data values with `missing`. +This is useful for downstream imputation methods that assume missing data is represented by +a `missing`. + +!!! In-place methods are only applicable for datasets which already `allowmissing`. + +# Keyword Arguments +* `value::Tuple`: A tuple of values that should be considered `missing` + +# Example +```jldoctest +julia> using Impute: Standardize, impute + +julia> M = [1.0 2.0 -9999.0 NaN 5.0; 1.1 2.2 3.3 0.0 5.5] +2×5 Array{Float64,2}: + 1.0 2.0 -9999.0 NaN 5.0 + 1.1 2.2 3.3 0.0 5.5 + +julia> impute(M, Standardize(; values=(NaN, -9999.0, 0.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 +``` +""" +struct Standardize{T<:Tuple} <: Imputor + values::T +end + +function Standardize(; values) + T = isa(values, Tuple) ? values : tuple(values) + return Standardize{typeof(T)}(T) +end + +# Primary definition just calls `replace!` +function _impute!(data::AbstractArray{Union{T, Missing}}, imp::Standardize) where T + # Reduce the possible set of values to those that could actually be found in the data + # Useful, if we declare a `Replace` imputor that should be applied to multiple datasets. + Base.replace!(data, (v => missing for v in imp.values if v isa T)...) 
+end + +# Most of the time the in-place methods won't work because we need to change the +# eltype with allowmissing +impute(data::AbstractArray, imp::Standardize) = _impute!(allowmissing(data), imp) + +# Custom implementation of a non-mutating impute for tables +function impute(table, imp::Standardize) + istable(table) || throw(MethodError(impute, (table, imp))) + + ctable = Tables.columns(table) + + cnames = Tuple(propertynames(ctable)) + cdata = Tuple(impute(getproperty(ctable, cname), imp) for cname in cnames) + # Reconstruct as a ColumnTable + result = NamedTuple{cnames}(cdata) + + # If our input was a ColumnTable just return the result. We can also do the same for + if isa(table, Tables.ColumnTable) + return result + else + return materializer(table)(result) + end +end + +# Specialcase for rowtable +function impute(data::T, imp::Standardize) where T <: AbstractVector{<:NamedTuple} + # We use columntable here so that we don't call `materialize` more often than needed. + return materializer(data)(impute(Tables.columntable(data), imp)) +end + +# Awkward imputor overrides necessary because we intercepted the higher level +# `impute` calls +_impute!(data::AbstractArray{Missing}, imp::Standardize) = data + +# Skip custom dims stuff cause it isn't necessary here. +function impute!(data::AbstractMatrix{Union{T, Missing}}, imp::Standardize) where {T} + return _impute!(data, imp) +end diff --git a/src/imputors/substitute.jl b/src/imputors/substitute.jl new file mode 100644 index 0000000..f45c580 --- /dev/null +++ b/src/imputors/substitute.jl @@ -0,0 +1,89 @@ +""" + Substitute(; statistic=nothing) + Substitute(; robust=true, weights=nothing) + +Substitute missing values with a summary statistic over the non-missing values. + +# Keyword Arguments +* `statistic`: A summary statistic function to be applied to the non-missing values. + This function should return a value of the same type as the input data `eltype`. 
+ If this function isn't passed in then the `defaultstats` function is used to make a + best guess. +* `robust`: Whether to use `median` or `mean` for continuous datasets in `defaultstats` +* `weights`: A set of statistical weights to apply to the `mean` or `median` in `defaultstats`. + +# Default Rules +Our default substitution rules defined in `defaultstats` are as follows: + +* `mode` applies to non-`Real`s, `Bool`s, and `Integers` with few unique values. +* `median` is used for all other `Real` values that aren't restricted by the above rules. + Optionally, `mean` can be used if `robust=false`. If statistical `weights` are passed in + then a weighted `mean`/`median` will be calculated. + +# Example +```jldoctest +julia> using Statistics; using Impute: Substitute, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Substitute(; statistic=mean ∘ skipmissing); dims=:rows) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.66667 2.66667 5.0 + 1.1 2.2 3.3 3.025 5.5 +``` +""" +struct Substitute <: Imputor + statistic::Function +end + +function Substitute(; + statistic::Union{Function, Nothing}=nothing, + robust=true, + weights=nothing +) + statistic !== nothing && return Substitute(statistic) + + return Substitute() do data + if weights === nothing + items = collect(skipmissing(data)) + defaultstats(items, robust) + else + mask = .!ismissing.(data) + items = disallowmissing(data[mask]) + wv = weights[mask] + defaultstats(items, robust, wv) + end + end +end + +function _impute!(data::AbstractArray{Union{T, Missing}}, imp::Substitute) where T + x = imp.statistic(data) + return Base.replace!(data, missing => x) +end + +# Auxiliary functions defining our default substitution rules + +# If we're operating over Bools then we're probably better off using mode +defaultstats(data::AbstractArray{<:Bool}, robust::Bool, args...) 
= mode(data) + +# If we're operating over integers with relatively few unique values then we're +# likely dealing with either counts or a categorical coding, so mode is probably +# safer +function defaultstats(data::AbstractArray{T}, robust::Bool, args...) where T <: Integer + threshold = 0.25 * length(data) + nunique = length(unique(data)) + nunique < threshold && return mode(data) + result = robust ? median(data, args...) : mean(data, args...) + return round(T, result) +end + +# For most real valued data we should use median +function defaultstats(data::AbstractArray{<:Real}, robust::Bool, args...) + return robust ? median(data, args...) : mean(data, args...) +end + +# Fallback to mode as many types won't support mean or median anyways +defaultstats(data::AbstractArray, args...) = mode(data) diff --git a/src/imputors/svd.jl b/src/imputors/svd.jl index e434ef4..5056c2e 100644 --- a/src/imputors/svd.jl +++ b/src/imputors/svd.jl @@ -1,44 +1,41 @@ """ - SVD <: Imputor + SVD(; kwargs...) Imputes the missing values in a matrix using an expectation maximization (EM) algorithm over low-rank SVD approximations. # Keyword Arguments -* `init::Imputor`: initialization method for missing values (default: Fill()) +* `init::Imputor`: initialization method for missing values (default: Substitute()) * `rank::Union{Int, Nothing}`: rank of the SVD approximation (default: nothing meaning start and 0 and increase) * `tol::Float64`: convergence tolerance (default: 1e-10) * `maxiter::Int`: Maximum number of iterations if convergence is not achieved (default: 100) * `limits::Unoin{Tuple{Float64, Float64}, Nothing}`: Bound the possible approximation values (default: nothing) * `verbose::Bool`: Whether to display convergence progress (default: true) -* `context::Context`: Missing data context settings (default: Context()) # References * Troyanskaya, Olga, et al. "Missing value estimation methods for DNA microarrays." Bioinformatics 17.6 (2001): 520-525. 
""" struct SVD <: Imputor - init::Fill + init::Imputor rank::Union{Int, Nothing} tol::Float64 maxiter::Int limits::Union{Tuple{Float64, Float64}, Nothing} verbose::Bool - context::AbstractContext end function SVD(; - init=Fill(), rank=nothing, tol=1e-10, maxiter=100, limits=nothing, verbose=true, context=Context() + init=Substitute(), rank=nothing, tol=1e-10, maxiter=100, limits=nothing, verbose=true ) - SVD(init, rank, tol, maxiter, limits, verbose, context) + SVD(init, rank, tol, maxiter, limits, verbose) end -function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::SVD) where T<:Real +function impute!(data::AbstractMatrix{Union{T, Missing}}, imp::SVD; dims=nothing) where T<:Real n, p = size(data) k = imp.rank === nothing ? 0 : min(imp.rank, p-1) S = zeros(T, min(n, p)) X = zeros(T, n, p) - ctx = imp.context # Get our before and after views of our missing and non-missing data mmask = ismissing.(data) omask = .!mmask @@ -49,11 +46,11 @@ function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::SVD) where T<:R oX = X[omask] # Fill in the original data - impute!(data, imp.init) + impute!(data, imp.init; dims=dims) C = sum(abs2, mdata - mX) / sum(abs2, mdata) err = mean(abs.(odata - oX)) - @info("Before: Diff=$(sum(mdata - mX)), MAE=$err, convergence=$C, normsq=$(sum(abs2, mdata)), $(mX[1])") + @debug("Before", Diff=sum(mdata - mX), MAE=err, convergence=C, normsq=sum(abs2, mdata), mX[1]) for i in 1:imp.maxiter if imp.rank === nothing @@ -82,16 +79,22 @@ function impute!(data::AbstractMatrix{<:Union{T, Missing}}, imp::SVD) where T<:R # Print the error between reconstruction and observed inputs if imp.verbose err = mean(abs.(odata - oX)) - @info("Iteration $i: Diff=$(sum(mdata - mX)), MAE=$err, MSS=$(sum(abs2, mdata)), convergence=$C") + @debug("Iteration", i, Diff=sum(mdata - mX), MAE=err, MSS=sum(abs2, mdata), convergence=C) end # Update missing values data[mmask] .= X[mmask] - if isfinite(C) && C < imp.tol - break - end + isfinite(C) && C < imp.tol && 
break end return data end + +impute!(data::AbstractMatrix{Missing}, imp::SVD; kwargs...) = data + +function impute(data::AbstractMatrix{Union{T, Missing}}, imp::SVD; kwargs...) where T<:Real + return impute!(trycopy(data), imp; kwargs...) +end + +impute(data::AbstractMatrix{Missing}, imp::SVD; kwargs...) = trycopy(data) diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 0000000..f8c1575 --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,28 @@ +function trycopy(data) + # Not all objects support `copy`, but we should use it to improve + # performance if possible. + try + copy(data) + catch + deepcopy(data) + end +end + +function dim(data, d) + # Special case d === nothing as this currently signifies the default colwise + # operations that are being deprecated. + if d === nothing + Base.depwarn( + "Imputing on matrices will require specifying `dims=2` or `dims=:cols` in a " * + "future release, to maintain the current behaviour.", + :dim + ) + return 2 + # Special case tables and matrices using the `:rows` and `:cols` dims values + elseif d in (:rows, :cols) && (istable(data) || isa(data, AbstractMatrix)) + return NamedDims.dim((:rows, :cols), d) + # Fallback to whatever NameDims gives us + else + return NamedDims.dim(NamedDims.dimnames(data), d) + end +end diff --git a/test/assertions.jl b/test/assertions.jl new file mode 100644 index 0000000..3e4ca74 --- /dev/null +++ b/test/assertions.jl @@ -0,0 +1,76 @@ +@testset "Assertions" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + mask = map(!ismissing, a) + + # We call collect to not have a wrapper type that references the same data. 
+ m = collect(reshape(a, 5, 4)) + + aa = AxisArray( + deepcopy(m), + Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), + Axis{:id}(1:4) + ) + + table = DataFrame( + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), + ) + + table.sin[[2, 3, 7, 12, 19]] .= missing + + @testset "Base" begin + t = Threshold(; ratio=0.1) + @test_throws ThresholdError assert(a, t) + @test_throws ThresholdError assert(m, t) + @test_throws ThresholdError assert(aa, t) + @test_throws ThresholdError assert(table, t) + + # Test showerror + msg = try + assert(a, t) + catch e + sprint(showerror, e) + end + + @test msg == "ThresholdError: Ratio of missing values exceeded 0.1 (0.15)\n" + + t = Threshold(; ratio=0.8) + # Use isequal because we expect the results to contain missings + @test isequal(assert(a, t), a) + @test isequal(assert(m, t), m) + @test isequal(assert(aa, t), aa) + @test isequal(assert(table, t), table) + end + + @testset "Weighted" begin + # If we use an exponentially weighted context then we won't pass the limit + # because missing earlier observations is less important than later ones. + t = Threshold(; ratio=0.8, weights=eweights(20, 0.3)) + @test isequal(assert(a, t), a) + @test isequal(assert(table, t), table) + + @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), m) + @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), aa) + + # If we reverse the weights such that earlier observations are more important + # then our previous limit of 0.2 won't be enough to succeed. 
+ t = Threshold(; ratio=0.1, weights=reverse!(eweights(20, 0.3))) + @test_throws ThresholdError assert(a, t) + @test_throws ThresholdError assert(table, t) + + t = Threshold(; ratio=0.1, weights=reverse!(eweights(5, 0.3))) + @test_throws ThresholdError assert(m, t; dims=:cols) + @test_throws ThresholdError assert(aa, t; dims=:cols) + + @test_throws DimensionMismatch assert(a[1:10], t) + @test_throws DimensionMismatch assert(m[1:3, :], t; dims=:cols) + end + + @testset "functional" begin + @test_throws ThresholdError Impute.threshold(a; ratio=0.1) + # Use isequal because we expect the results to contain missings + @test isequal(Impute.threshold(a; ratio=0.8), a) + end +end diff --git a/test/chain.jl b/test/chain.jl new file mode 100644 index 0000000..e38eec3 --- /dev/null +++ b/test/chain.jl @@ -0,0 +1,124 @@ +@testset "Chaining and Piping" begin + # TODO: Add tests at each section to double check that orig hasn't been overwritten. + orig = Impute.dataset("test/table/neuro") |> DataFrame + + @testset "DataFrame" begin + # Less effecient, but a chain should produce the same results as manual + # piping the functional outputs. + result = Impute.interp(orig) |> Impute.locf! |> Impute.nocb! 
+ + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, Matrix(result)) + + # We can also use the Chain type with explicit Imputor types + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + result2 = C(orig) + @test result == result2 + + @testset "GroupedDataFrame" begin + T = NamedTuple{(:hod, :obj, :val), Tuple{Int, Int, Union{Float64, Missing}}} + + df = map(Iterators.product(1:24, 1:8, 0:19)) do t + hod, obj, x = t + # Deterministically return some `missing`s per hod/obj pair + return if x in (0, 5, 12, 19) + T((hod, obj, missing)) + else + T((hod, obj, sin(hod) * cos(x) + obj)) + end + end |> DataFrame + + gdf1 = groupby(deepcopy(df), [:hod, :obj]) + gdf2 = groupby(df, [:hod, :obj]) + + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + + result = mapreduce(C, vcat, gdf1) + # Check that the result isn't the same as the source dataframe + @test df != result + # Check that the size is still the same since we didn't drop any rows + @test size(result) == size(df) + # Check that there are no remaining missing values + @test all(!ismissing, Tables.matrix(result)) + # Double check that our source dataframe still contains missings + @test any(ismissing, Tables.matrix(df)) + end + end + + @testset "Column Table" begin + result = Tables.columntable(orig) |> + Impute.interp! |> + Impute.locf! |> + Impute.nocb! |> + Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "Row Table" begin + result = Tables.rowtable(orig) |> + Impute.interp! |> + Impute.locf! |> + Impute.nocb! 
|> + Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "Matrix" begin + data = Matrix(orig) + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + result = C(data; dims=:cols) + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "AxisArray" begin + data = AxisArray( + Matrix(orig), + Axis{:row}(1:size(orig, 1)), + Axis{:V}(names(orig)), + ) + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + result = C(data; dims=:cols) + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "KeyedArray" begin + data = KeyedArray(Matrix(orig); row=1:size(orig, 1), V=names(orig)) + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + result = C(data; dims=:cols) + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "Multi-type" begin + data = Impute.dataset("test/table/neuro") |> Tables.matrix + @test any(ismissing, data) + # Filter out colunns with more than 400 missing values, Fill with 0, and check that + # everything was replaced + C = Chain( + Impute.Filter(c -> count(ismissing, c) < 400), + Impute.Replace(; values=0.0), + Impute.Threshold(), + ) + + result = C(data; dims=:cols) + @test size(result, 1) == size(data, 1) + # We should have filtered out 1 column + @test size(result, 2) < size(data, 2) + @test all(!ismissing, result) + end +end diff --git a/test/data.jl b/test/data.jl new file mode 100644 index 0000000..c4fe220 --- /dev/null +++ b/test/data.jl @@ -0,0 +1,12 @@ +@testset "data" begin + datasets = Impute.datasets() + + @testset "Impute.dataset($name)" for name in datasets + result = Impute.dataset(name) + if occursin("matrix", name) + @test isa(result, AbstractDict) + elseif 
occursin("table", name) + @test isa(result, CSV.File) + end + end +end diff --git a/test/deprecated.jl b/test/deprecated.jl index e69de29..3b2a21d 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -0,0 +1,306 @@ +@testset "deprecated" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + mask = map(!ismissing, a) + + # We call collect to not have a wrapper type that references the same data. + m = collect(reshape(a, 5, 4)) + + aa = AxisArray( + deepcopy(m), + Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), + Axis{:id}(1:4) + ) + + table = DataFrame( + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), + ) + + table.sin[[2, 3, 7, 12, 19]] .= missing + + @testset "Default colwise" begin + msg = string( + "Imputing on matrices will require specifying `dims=2` or `dims=:cols` in a ", + "future release, to maintain the current behaviour." + ) + @test_logs (:warn, msg) Impute.interp(m) + @test_logs (:warn, msg) Impute.locf(m) + @test_logs (:warn, msg) Impute.nocb(m) + @test_logs (:warn, msg) Impute.srs(m) + end + + @testset "Fill" begin + @testset "Value" begin + fill_val = -1.0 + result = @test_deprecated impute(a, Fill(; value=fill_val)) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == @test_deprecated Impute.fill(a; value=fill_val) + end + + @testset "Mean" begin + result = @test_deprecated impute(a, Fill(; value=mean)) + expected = copy(a) + expected[[2, 3, 7]] .= mean(a[mask]) + + @test result == expected + @test result == @test_deprecated Impute.fill(a; value=mean) + + a2 = copy(a) + @test_deprecated Impute.fill!(a2) + @test a2 == result + end + + @testset "Matrix" begin + data = Matrix(Impute.dataset("test/table/neuro") |> DataFrame) + + result = @test_deprecated impute(data, Fill(; value=0.0); dims=:cols) + @test size(result) == size(data) + @test result == @test_deprecated Impute.fill(data; 
value=0.0, dims=:cols) + + data2 = copy(data) + @test_deprecated Impute.fill!(data2; value=0.0, dims=:cols) + @test data2 == result + end + end + + @testset "Drop" begin + @testset "Equality" begin + @testset "$T" for T in (DropObs, DropVars) + @test @test_deprecated(T()) == @test_deprecated(T()) + end + end + + @testset "DropObs" begin + @testset "Vector" begin + result = @test_deprecated impute(a, DropObs()) + expected = deleteat!(deepcopy(a), [2, 3, 7]) + + @test result == expected + @test result == @test_deprecated Impute.dropobs(a) + + a2 = deepcopy(a) + @test_deprecated Impute.dropobs!(a2) + @test a2 == expected + end + + @testset "Matrix" begin + result = @test_deprecated impute(m, DropObs()) + expected = m[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropobs(m))) + @test isequal( + collect(result'), + @test_deprecated(Impute.dropobs(collect(m'); dims=2)) + ) + + m_ = @test_deprecated Impute.dropobs!(m) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(m, expected) + @test isequal(m_, expected) + end + + @testset "Tables" begin + @testset "DataFrame" begin + df = deepcopy(table) + result = impute(df, @test_deprecated DropObs()) + expected = dropmissing(df) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropobs(df))) + + df_ = @test_deprecated Impute.dropobs!(df) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(df, expected) + @test isequal(df_, expected) + end + + @testset "Column Table" begin + coltab = Tables.columntable(table) + + result = @test_deprecated impute(coltab, DropObs()) + expected = Tables.columntable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropobs(coltab))) + + coltab_ = @test_deprecated Impute.dropobs!(coltab) + # The mutating test is broken because we need to 
making a copy of + # the original table + @test_broken isequal(coltab, expected) + @test isequal(coltab_, expected) + end + + @testset "Row Table" begin + rowtab = Tables.rowtable(table) + result = @test_deprecated impute(rowtab, DropObs()) + expected = Tables.rowtable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropobs(rowtab))) + + rowtab_ = @test_deprecated Impute.dropobs!(rowtab) + # The mutating test is broken because we need to making a copy of + # the original table + # @test_broken isequal(rowtab, expected) + @test isequal(rowtab_, expected) + end + end + + @testset "AxisArray" begin + result = @test_deprecated impute(aa, DropObs()) + expected = aa[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropobs(aa))) + + aa_ = @test_deprecated Impute.dropobs!(aa) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(aa, expected) + @test isequal(aa_, expected) + end + end + + @testset "DropVars" begin + @testset "Vector" begin + @test_deprecated @test_throws MethodError Impute.dropvars(a) + end + + @testset "Matrix" begin + result = @test_deprecated impute(m, DropVars()) + expected = copy(m)[:, 3:4] + + @test isequal(result, expected) + @test isequal(result, @test_deprecated(Impute.dropvars(m))) + @test isequal( + collect(result'), + @test_deprecated(Impute.dropvars(collect(m'); dims=2)) + ) + + m_ = @test_deprecated Impute.dropvars!(m) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(m, expected) + @test isequal(m_, expected) + end + + @testset "Tables" begin + @testset "DataFrame" begin + df = deepcopy(table) + result = @test_deprecated impute(df, DropVars()) + expected = select(df, :cos) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated Impute.dropvars(df)) + + @test_deprecated Impute.dropvars!(df) + # 
The mutating test is broken because we need to make a copy of + # the original table + @test_broken isequal(df, expected) + end + + @testset "Column Table" begin + coltab = Tables.columntable(table) + + result = @test_deprecated impute(coltab, DropVars()) + expected = Tables.columntable(TableOperations.select(coltab, :cos)) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated Impute.dropvars(coltab)) + + @test_deprecated Impute.dropvars!(coltab) + # The mutating test is broken because we need to make a copy of + # the original table + @test_broken isequal(coltab, expected) + end + + @testset "Row Table" begin + rowtab = Tables.rowtable(table) + result = @test_deprecated impute(rowtab, DropVars()) + expected = Tables.rowtable(TableOperations.select(rowtab, :cos)) + + @test isequal(result, expected) + @test isequal(result, @test_deprecated Impute.dropvars(rowtab)) + + @test_deprecated Impute.dropvars!(rowtab) + # The mutating test is broken because we need to make a copy of + # the original table + @test_broken isequal(rowtab, expected) + end + end + @testset "AxisArray" begin + result = @test_deprecated impute(aa, DropVars()) + expected = copy(aa)[:, 3:4] + + @test isequal(result, expected) + @test isequal(result, @test_deprecated Impute.dropvars(aa)) + + aa_ = @test_deprecated Impute.dropvars!(aa) + # The mutating test is broken because we need to make a copy of + # the original matrix + @test_broken isequal(aa, expected) + @test isequal(aa_, expected) + end + end + end + + @testset "Chain" begin + orig = Impute.dataset("test/table/neuro") |> DataFrame + + # Less efficient, but a chain should produce the same results as manually + # piping the functional outputs. + result = Impute.interp(orig) |> Impute.locf! |> Impute.nocb!
+ + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, Matrix(result)) + + # We can also use the Chain type with explicit Imputor types + result2 = @test_deprecated impute( + orig, + Impute.Chain( + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB() + ), + ) + + # Test creating a Chain via Imputor composition + C = Impute.Interpolate() ∘ Impute.LOCF() ∘ Impute.NOCB() + result3 = @test_deprecated impute(orig, C) + @test result == result2 + @test result == result3 + end + + @testset "utils" begin + M = [1.0 2.0 3.0 4.0 5.0; 1.1 2.2 3.3 4.4 5.5] + + @testset "obswise" begin + @test map(sum, Impute.obswise(M; dims=2)) == [2.1, 4.2, 6.3, 8.4, 10.5] + @test map(sum, Impute.obswise(M; dims=1)) == [15, 16.5] + end + + @testset "varwise" begin + @test map(sum, Impute.varwise(M; dims=2)) == [15, 16.5] + @test map(sum, Impute.varwise(M; dims=1)) == [2.1, 4.2, 6.3, 8.4, 10.5] + end + + @testset "filterobs" begin + @test Impute.filterobs(x -> sum(x) > 5.0, M; dims=2) == M[:, 3:5] + @test Impute.filterobs(x -> sum(x) > 15.0, M; dims=1) == M[[false, true], :] + end + + @testset "filtervars" begin + @test Impute.filtervars(x -> sum(x) > 15.0, M; dims=2) == M[[false, true], :] + @test Impute.filtervars(x -> sum(x) > 5.0, M; dims=1) == M[:, 3:5] + end + end +end diff --git a/test/filter.jl b/test/filter.jl new file mode 100644 index 0000000..a546129 --- /dev/null +++ b/test/filter.jl @@ -0,0 +1,172 @@ +@testset "Filter" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + mask = map(!ismissing, a) + + # We call collect to not have a wrapper type that references the same data. 
+ m = collect(reshape(a, 5, 4)) + + aa = AxisArray( + deepcopy(m), + Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), + Axis{:id}(1:4) + ) + + table = DataFrame( + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), + ) + + table.sin[[2, 3, 7, 12, 19]] .= missing + + @test Filter() == Filter() + + @testset "Vector" begin + result = apply(a, Filter()) + expected = deleteat!(deepcopy(a), [2, 3, 7]) + + @test result == expected + @test result == Impute.filter(a) + + a2 = deepcopy(a) + Impute.filter!(a2) + @test a2 == expected + end + + @testset "Matrix" begin + @test_throws UndefKeywordError apply(m, Filter()) + @test_throws UndefKeywordError Impute.filter(m) + @test_throws MethodError Impute.filter!(m) + + @testset "rows" begin + result = apply(m, Filter(); dims=:rows) + expected = m[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, (Impute.filter(m; dims=:rows))) + @test isequal(collect(result'), Impute.filter(collect(m'); dims=:cols)) + end + + @testset "cols" begin + result = apply(m, Filter(); dims=:cols) + expected = copy(m)[:, 3:4] + + @test isequal(result, expected) + @test isequal(result, Impute.filter(m; dims=:cols)) + @test isequal(collect(result'), Impute.filter(collect(m'); dims=:rows)) + end + end + + @testset "Tables" begin + @testset "DataFrame" begin + df = deepcopy(table) + + @test_throws UndefKeywordError apply(df, Filter()) + @test_throws UndefKeywordError Impute.filter(df) + @test_throws MethodError Impute.filter!(df) + + @testset "rows" begin + result = apply(df, Filter(); dims=:rows) + expected = dropmissing(df) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(df; dims=:rows)) + end + + @testset "cols" begin + result = apply(df, Filter(); dims=:cols) + expected = select(df, :cos) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(df; dims=:cols)) + end + end + + @testset "Column Table" begin + coltab = 
Tables.columntable(table) + + @test_throws UndefKeywordError apply(coltab, Filter()) + @test_throws UndefKeywordError Impute.filter(coltab) + @test_throws MethodError Impute.filter!(coltab) + + @testset "rows" begin + result = apply(coltab, Filter(); dims=:rows) + expected = Tables.columntable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(coltab; dims=:rows)) + end + + @testset "cols" begin + result = apply(coltab, Filter(); dims=:cols) + expected = Tables.columntable(TableOperations.select(coltab, :cos)) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(coltab; dims=:cols)) + end + end + + @testset "Row Table" begin + @testset "rows" begin + rowtab = Tables.rowtable(table) + + result = apply(rowtab, Filter(); dims=:rows) + expected = Tables.rowtable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(rowtab; dims=:rows)) + @test isequal(result, Impute.filter(rowtab)) + + rowtab_ = Impute.filter!(rowtab) + @test isequal(rowtab, expected) + @test isequal(rowtab_, expected) + end + + @testset "cols" begin + rowtab = Tables.rowtable(table) + + @test_throws ArgumentError Impute.filter!(rowtab; dims=:cols) + result = apply(rowtab, Filter(); dims=:cols) + expected = Tables.rowtable(TableOperations.select(rowtab, :cos)) + + @test isequal(result, expected) + @test isequal(result, Impute.filter(rowtab; dims=:cols)) + end + + end + end + + @testset "AxisArray" begin + @test_throws UndefKeywordError apply(aa, Filter()) + @test_throws UndefKeywordError Impute.filter(aa) + @test_throws MethodError Impute.filter!(aa) + + @testset "rows" begin + result = apply(aa, Filter(); dims=:rows) + expected = m[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, Impute.filter(aa; dims=:rows)) + @test isequal(collect(result'), Impute.filter(collect(aa'); dims=:cols)) + end + + @testset "cols" begin + result = apply(aa, Filter(); dims=:cols) + expected = 
copy(aa)[:, 3:4] + + @test isequal(result, expected) + @test isequal(result, Impute.filter(aa; dims=:cols)) + @test isequal(collect(result'), Impute.filter(collect(aa'); dims=:rows)) + end + end + + @testset "functional" begin + expected = deleteat!(deepcopy(a), [2, 3, 7]) + @test Impute.filter(!ismissing, a) == expected + + b = deepcopy(a) + @test Impute.filter!(!ismissing, b) == expected + @test b == expected + end +end diff --git a/test/imputors/interp.jl b/test/imputors/interp.jl new file mode 100644 index 0000000..9feed28 --- /dev/null +++ b/test/imputors/interp.jl @@ -0,0 +1,88 @@ +@testset "Interpolate" begin + @testset "Default" begin + tester = ImputorTester(Interpolate) + + test_hashing(tester) + test_equality(tester) + test_vector(tester) + test_matrix(tester) + # test_cube(tester) + test_dataframe(tester) + test_groupby(tester) + test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) + test_columntable(tester) + test_rowtable(tester) + + @testset "Cube" begin + a = allowmissing(1.0:1.0:60.0) + a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing + C = collect(reshape(a, 5, 4, 3)) + + # Cube tests are expected to fail + @test_throws MethodError impute(C, tester.imp(; tester.kwargs...); dims=3) + end + end + + @testset "Floats" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + + result = impute(a, Interpolate()) + @test result == collect(1.0:1.0:20) + @test result == Impute.interp(a) + + # Test in-place method + a2 = copy(a) + Impute.interp!(a2) + @test a2 == result + + # Test interpolation between identical points + b = ones(Union{Float64, Missing}, 20) + b[[2, 3, 7]] .= missing + @test Impute.interp(b) == ones(Union{Float64, Missing}, 20) + + # Test interpolation at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = Impute.interp(b) + @test ismissing(result[1]) + @test ismissing(result[20]) + end + + @testset "Ints" begin + # Defining our missing 
datasets + a = allowmissing(1:1:20) + a[[2, 3, 7]] .= missing + + result = impute(a, Interpolate()) + @test result == collect(1:1:20) + @test result == Impute.interp(a) + + # Test in-place method + a2 = copy(a) + Impute.interp!(a2) + @test a2 == result + + # Test interpolation between identical points + b = ones(Union{Float64, Missing}, 20) + b[[2, 3, 7]] .= missing + @test Impute.interp(b) == ones(Union{Float64, Missing}, 20) + + # Test interpolation at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = Impute.interp(b) + @test ismissing(result[1]) + @test ismissing(result[20]) + + # Test inexact error + # https://github.com/invenia/Impute.jl/issues/71 + c = [1, missing, 2, 3] + @test_throws InexactError Impute.interp(c) + end + + # TODO Test error cases on non-numeric types +end diff --git a/test/imputors/knn.jl b/test/imputors/knn.jl new file mode 100644 index 0000000..6275db8 --- /dev/null +++ b/test/imputors/knn.jl @@ -0,0 +1,114 @@ +@testset "KNN" begin + @testset "Default" begin + tester = ImputorTester(KNN) + test_hashing(tester) + test_equality(tester) + test_matrix(tester) + test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) + end + @testset "Iris" begin + # Reference + # P. Schimitt, et. 
al + # A comparison of six methods for missing data imputation + iris = Impute.dataset("test/table/iris") |> DataFrame + iris2 = filter(row -> row[:Species] == "versicolor" || row[:Species] == "virginica", iris) + data = Array(iris2[:, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]]) + num_tests = 100 + + @testset "Iris - 0.15" begin + X = add_missings(data, 0.15) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; k=2); dims=:cols) + mean_imputed = impute(copy(X), Substitute(); dims=:cols) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + # test type stability + @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2); dims=:cols)) + @test typeof(X) == typeof(impute(copy(X), Substitute(); dims=:cols)) + end + + @testset "Iris - 0.25" begin + X = add_missings(data, 0.25) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; k=2); dims=:cols) + mean_imputed = impute(copy(X), Substitute(); dims=:cols) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + # test type stability + @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2); dims=:cols)) + @test typeof(X) == typeof(impute(copy(X), Substitute(); dims=:cols)) + end + + @testset "Iris - 0.35" begin + X = add_missings(data, 0.35) + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; k=2); dims=:cols) + mean_imputed = impute(copy(X), Substitute(); dims=:cols) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + # test type stability + @test typeof(X) == 
typeof(impute(copy(X), Impute.KNN(; k=2); dims=:cols)) + @test typeof(X) == typeof(impute(copy(X), Substitute(); dims=:cols)) + end + end + + # Test a case where we expect kNN to perform well (e.g., many variables, ) + @testset "Data match" begin + data = mapreduce(hcat, 1:1000) do i + seeds = [sin(i), cos(i), tan(i), atan(i)] + mapreduce(vcat, combinations(seeds)) do args + [ + +(args...), + *(args...), + +(args...) * 100, + +(abs.(args)...), + (+(args...) * 10) ^ 2, + (+(abs.(args)...) * 10) ^ 2, + log(+(abs.(args)...) * 100), + +(args...) * 100 + rand(-10:0.1:10), + ] + end + end + + X = add_missings(data') + num_tests = 100 + + knn_nrmsd, mean_nrmsd = 0.0, 0.0 + + for i = 1:num_tests + knn_imputed = impute(copy(X), Impute.KNN(; k=4); dims=:cols) + mean_imputed = impute(copy(X), Substitute(); dims=:cols) + + knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i + mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i + end + + @test knn_nrmsd < mean_nrmsd + # test type stability + @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=4); dims=:cols)) + @test typeof(X) == typeof(impute(copy(X), Substitute(); dims=:cols)) + end +end diff --git a/test/imputors/locf.jl b/test/imputors/locf.jl new file mode 100644 index 0000000..a3a94dc --- /dev/null +++ b/test/imputors/locf.jl @@ -0,0 +1,78 @@ +@testset "LOCF" begin + @testset "Default" begin + tester = ImputorTester(LOCF) + + test_hashing(tester) + test_equality(tester) + test_vector(tester) + test_matrix(tester) + # test_cube(tester) + test_dataframe(tester) + test_groupby(tester) + test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) + test_columntable(tester) + test_rowtable(tester) + + @testset "Cube" begin + a = allowmissing(1.0:1.0:60.0) + a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing + C = collect(reshape(a, 5, 4, 3)) + + # Cube tests are expected to fail + @test_throws MethodError impute(C, tester.imp(; tester.kwargs...); dims=3) + end + end + + 
@testset "Floats" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + + result = impute(a, LOCF()) + expected = copy(a) + expected[2] = 1.0 + expected[3] = 1.0 + expected[7] = 6.0 + + @test result == expected + @test result == Impute.locf(a) + + a2 = copy(a) + Impute.locf!(a2) + @test a2 == result + + # Test LOCF at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = Impute.locf(b) + @test ismissing(result[1]) + @test result[20] == 1.0 + end + + @testset "Ints" begin + a = allowmissing(1:1:20) + a[[2, 3, 7]] .= missing + + result = impute(a, LOCF()) + expected = copy(a) + expected[2] = 1 + expected[3] = 1 + expected[7] = 6 + + @test result == expected + end + + @testset "Strings" begin + a = allowmissing([randstring(4) for i in 1:20]) + a[[2, 3, 7]] .= missing + + result = impute(a, LOCF()) + expected = copy(a) + expected[2] = expected[1] + expected[3] = expected[1] + expected[7] = expected[6] + + @test result == expected + end +end diff --git a/test/imputors/nocb.jl b/test/imputors/nocb.jl new file mode 100644 index 0000000..6e1ebe9 --- /dev/null +++ b/test/imputors/nocb.jl @@ -0,0 +1,78 @@ +@testset "NOCB" begin + @testset "Default" begin + tester = ImputorTester(NOCB) + + test_hashing(tester) + test_equality(tester) + test_vector(tester) + test_matrix(tester) + # test_cube(tester) + test_dataframe(tester) + test_groupby(tester) + test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) + test_columntable(tester) + test_rowtable(tester) + + @testset "Cube" begin + a = allowmissing(1.0:1.0:60.0) + a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing + C = collect(reshape(a, 5, 4, 3)) + + # Cube tests are expected to fail + @test_throws MethodError impute(C, tester.imp(; tester.kwargs...); dims=3) + end + end + + @testset "Floats" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + + result = impute(a, NOCB()) + expected = copy(a) + expected[2] = 4.0 + expected[3] = 4.0 + 
expected[7] = 8.0 + + @test result == expected + @test result == Impute.nocb(a) + + a2 = copy(a) + Impute.nocb!(a2) + @test a2 == result + + # Test NOCB at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = Impute.nocb(b) + @test result[1] == 1.0 + @test ismissing(result[20]) + end + + @testset "Ints" begin + a = allowmissing(1:1:20) + a[[2, 3, 7]] .= missing + + result = impute(a, NOCB()) + expected = copy(a) + expected[2] = 4 + expected[3] = 4 + expected[7] = 8 + + @test result == expected + end + + @testset "Strings" begin + a = allowmissing([randstring(4) for i in 1:20]) + a[[2, 3, 7]] .= missing + + result = impute(a, NOCB()) + expected = copy(a) + expected[2] = expected[4] + expected[3] = expected[4] + expected[7] = expected[8] + + @test result == expected + end +end diff --git a/test/imputors/replace.jl b/test/imputors/replace.jl new file mode 100644 index 0000000..54d543e --- /dev/null +++ b/test/imputors/replace.jl @@ -0,0 +1,75 @@ +@testset "Replace" begin + @testset "Default" begin + # Tester that replaces with 0.0 + tester = ImputorTester(Replace; values=0.0) + + # Defining our own equality because an empty constructor isn't supported + @testset "Equality" begin + @test tester.imp(; tester.kwargs...) == tester.imp(; tester.kwargs...)
+ end + + test_vector(tester) + test_matrix(tester) + test_dataframe(tester) + # groupby test also fail because it tries to call an empty constructor + # test_groupby(tester) + test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) + test_columntable(tester) + test_rowtable(tester) + end + + @testset "Multiple values over tables" begin + imp = Replace(; values=(DateTime(0), -9999, NaN, "")) + df_table = DataFrame( + :time => [missing, DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + :loc => [12, -5, missing], + :val => [1.5, missing, 3.0], + :desc => ["foo", "bar", missing], + ) + df_expected = DataFrame( + :time => [DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + :loc => [12, -5, -9999], + :val => [1.5, NaN, 3.0], + :desc => ["foo", "bar", ""], + ) + + @testset "DataFrame" begin + table = copy(df_table) + expected = copy(df_expected) + + result = impute(table, imp) + @test isequal(result, expected) + + table2 = deepcopy(table) + impute!(table2, imp) + @test isequal(table2, expected) + end + + @testset "Column Table" begin + table = Tables.columntable(df_table) + expected = Tables.columntable(df_expected) + + result = impute(table, imp) + @test isequal(result, expected) + + table2 = deepcopy(table) + impute!(table2, imp) + @test isequal(table2, expected) + end + + @testset "Row Table" begin + table = Tables.rowtable(df_table) + expected = Tables.rowtable(df_expected) + + result = impute(table, imp) + @test isequal(result, expected) + + table2 = deepcopy(table) + impute!(table2, imp) + @test !isequal(table2, expected) + @test isequal(table2, table) + end + end +end diff --git a/test/imputors/srs.jl b/test/imputors/srs.jl new file mode 100644 index 0000000..20d5bde --- /dev/null +++ b/test/imputors/srs.jl @@ -0,0 +1,60 @@ +@testset "SRS" begin + @testset "Default" begin + tester = ImputorTester(SRS) + test_hashing(tester) + test_equality(tester) + test_vector(tester) + test_matrix(tester) + test_dataframe(tester) + # 
Behaviour is inconsistent for testing because of `rand` calls + # test_groupby(tester) + # test_axisarray(tester) + # test_nameddimsarray(tester) + # test_keyedarray(tester) + test_columntable(tester) + test_rowtable(tester) + end + + @testset "Floats" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + result = impute(a, SRS(; rng=SequentialRNG())) + expected = copy(a) + expected[2] = 4.0 + expected[3] = 5.0 + expected[7] = 6.0 + + @test result == expected + + @test result == Impute.srs(a; rng=SequentialRNG()) + + a2 = copy(a) + + Impute.srs!(a2; rng=SequentialRNG()) + @test a2 == result + end + + @testset "Ints" begin + a = allowmissing(1:1:20) + a[[2, 3, 7]] .= missing + result = impute(a, SRS(; rng=SequentialRNG())) + expected = copy(a) + expected[2] = 4 + expected[3] = 5 + expected[7] = 6 + + @test result == expected + end + + @testset "Strings" begin + a = allowmissing([randstring(4) for i in 1:20]) + a[[2, 3, 7]] .= missing + result = impute(a, SRS(; rng=SequentialRNG())) + expected = copy(a) + expected[2] = expected[4] + expected[3] = expected[5] + expected[7] = expected[6] + + @test result == expected + end +end diff --git a/test/imputors/standardize.jl b/test/imputors/standardize.jl new file mode 100644 index 0000000..c2907d7 --- /dev/null +++ b/test/imputors/standardize.jl @@ -0,0 +1,206 @@ +# The standardize imputor is sufficiently different in its input and behaviour +# that we don't bother using the ImputorTester here. +@testset "Standardize" begin + # List a couple known missing data values people might use. + values = (NaN, 0.0, Nothing, "", 9999, -99, 0, DateTime(0)) + imp = Standardize(; values=values) + + @testset "Vector" begin + @testset "disallowmissing" begin + a = collect(1.0:1.0:20.0) + a[[2, 3, 7]] .= [NaN, 0.0, NaN] + + result = impute(a, imp) + @test eltype(result) == Union{Float64, Missing} + @test all(ismissing, result[[2, 3, 7]]) + + # In-place operation don't work when the source array doesn't allow missings. 
+ b = copy(a) + result2 = impute!(b, imp) + @test eltype(result2) == Float64 + @test isequal(result2[[2, 3, 7]], [NaN, 0.0, NaN]) + end + + @testset "allowmissing" begin + a = allowmissing(collect(1.0:1.0:20.0)) + a[[2, 3, 7]] .= [NaN, 0.0, NaN] + + result = impute(a, imp) + @test eltype(result) == Union{Float64, Missing} + @test all(ismissing, result[[2, 3, 7]]) + + # In-place operations work when the source array already allows missings. + b = copy(a) + result2 = impute!(b, imp) + @test eltype(result2) == Union{Float64, Missing} + @test all(ismissing, result2[[2, 3, 7]]) + end + + @testset "All missing" begin + # Test having only missing data + c = fill(missing, 10) + @test isequal(impute(c, imp), c) + end + end + + @testset "Matrix" begin + @testset "disallowmissing" begin + a = collect(1.0:1.0:20.0) + a[[2, 3, 7]] .= [NaN, 0.0, NaN] + m = collect(reshape(a, 5, 4)) + + result = impute(m, imp) + @test eltype(result) == Union{Float64, Missing} + @test all(ismissing, result[[2, 3, 7]]) + + # In-place operations don't work when the source array doesn't allow missings. + n = copy(m) + result2 = impute!(n, imp) + @test eltype(result2) == Float64 + @test isequal(result2[[2, 3, 7]], [NaN, 0.0, NaN]) + end + + @testset "allowmissing" begin + a = allowmissing(collect(1.0:1.0:20.0)) + a[[2, 3, 7]] .= [NaN, 0.0, NaN] + m = collect(reshape(a, 5, 4)) + + result = impute(m, imp) + @test eltype(result) == Union{Float64, Missing} + @test all(ismissing, result[[2, 3, 7]]) + + # In-place operations work when the source array already allows missings.
+ n = copy(m) + result2 = impute!(n, imp) + @test eltype(result2) == Union{Float64, Missing} + @test all(ismissing, result2[[2, 3, 7]]) + end + + @testset "All missing" begin + # Test having only missing data + c = fill(missing, 5, 4) + @test isequal(impute(c, imp), c) + end + end + @testset "Tables" begin + @testset "DataFrame" begin + table = DataFrame( + :time => [DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + :loc => [12, -5, 9999], + :age => [4, -99, 18], + :val => [1.5, NaN, 3.0], + :desc => ["foo", "bar", ""], + ) + mtable = DataFrame( + :time => allowmissing([DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)]), + :loc => allowmissing([12, -5, 9999]), + :age => allowmissing([4, -99, 18]), + :val => allowmissing([1.5, NaN, 3.0]), + :desc => allowmissing(["foo", "bar", ""]), + ) + expected = DataFrame( + :time => [missing, DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + :loc => [12, -5, missing], + :age => [4, missing, 18], + :val => [1.5, missing, 3.0], + :desc => ["foo", "bar", missing], + ) + + @testset "disallowmissing" begin + result = impute(table, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(table), imp) + @test !isequal(result2, expected) + end + + @testset "allowmissing" begin + result = impute(mtable, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(mtable), imp) + @test isequal(result2, expected) + end + end + + @testset "Column Table" begin + table = ( + time = [DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + loc = [12, -5, 9999], + age = [4, -99, 18], + val = [1.5, NaN, 3.0], + desc = ["foo", "bar", ""], + ) + mtable = ( + time = allowmissing([DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)]), + loc = allowmissing([12, -5, 9999]), + age = allowmissing([4, -99, 18]), + val = allowmissing([1.5, NaN, 3.0]), + desc = allowmissing(["foo", "bar", ""]), + ) + expected = ( + time = [missing, DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + loc = [12, 
-5, missing], + age = [4, missing, 18], + val = [1.5, missing, 3.0], + desc = ["foo", "bar", missing], + ) + + @testset "disallowmissing" begin + result = impute(table, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(table), imp) + @test !isequal(result2, expected) + end + + @testset "allowmissing" begin + result = impute(mtable, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(mtable), imp) + @test isequal(result2, expected) + end + end + + @testset "Row Table" begin + table = Tables.rowtable(( + time = [DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + loc = [12, -5, 9999], + age = [4, -99, 18], + val = [1.5, NaN, 3.0], + desc = ["foo", "bar", ""], + )) + mtable = Tables.rowtable(( + time = allowmissing([DateTime(0), DateTime(2020, 02, 02), DateTime(2121, 12, 12)]), + loc = allowmissing([12, -5, 9999]), + age = allowmissing([4, -99, 18]), + val = allowmissing([1.5, NaN, 3.0]), + desc = allowmissing(["foo", "bar", ""]), + )) + expected = Tables.rowtable(( + time = [missing, DateTime(2020, 02, 02), DateTime(2121, 12, 12)], + loc = [12, -5, missing], + age = [4, missing, 18], + val = [1.5, missing, 3.0], + desc = ["foo", "bar", missing], + )) + + @testset "disallowmissing" begin + result = impute(table, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(table), imp) + @test !isequal(result2, expected) + end + + @testset "allowmissing" begin + result = impute(mtable, imp) + @test isequal(result, expected) + + result2 = impute!(deepcopy(mtable), imp) + @test isequal(result2, expected) + end + end + end +end diff --git a/test/imputors/substitute.jl b/test/imputors/substitute.jl new file mode 100644 index 0000000..4ae0eb7 --- /dev/null +++ b/test/imputors/substitute.jl @@ -0,0 +1,124 @@ +@testset "Substitute" begin + @testset "Default" begin + test_all(ImputorTester(Substitute)) + end + + @testset "defaultstats" begin + @testset "robust" begin + # Defining our missing datasets + a = 
allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + fill_val = median(skipmissing(a)) + + result = impute(a, Substitute()) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a) + end + + @testset "weighted" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + wv = eweights(20, 0.3) + a[[2, 3, 7]] .= missing + mask = .!ismissing.(a) + + fill_val = mean(a[mask], wv[mask]) + + result = impute(a, Substitute(; robust=false, weights=wv)) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a; robust=false, weights=wv) + end + + @testset "counts" begin + a = allowmissing([1, 12, 4, 6, 2, 5, 9, 19, 24, 35, 44, 99]) + a[[2, 3, 7]] .= missing + + # We should default to taking the median because otherwise `mode` will + # just return `1` + fill_val = round(Int, median(skipmissing(a))) + + result = impute(a, Substitute()) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a) + end + + @testset "ratings" begin + # Slightly imbalanced ratings + a = allowmissing(vcat(repeat(1:5, 5), [1, 1, 5])) + a[[2, 3, 7]] .= missing + + # We likely want to the mode because we only have a few unique values. + fill_val = mode(skipmissing(a)) + @test fill_val == 1 + + result = impute(a, Substitute()) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a) + end + + @testset "bools" begin + a = allowmissing(vcat(falses(14), trues(6))) + a[[2, 3, 7]] .= missing + + # For the same reason as for ratings we should probably just use the mode. + # Though most of the time they'll give the same answer once rounded. 
+ fill_val = mode(skipmissing(a)) + @test fill_val == false + + result = impute(a, Substitute()) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a) + end + + @testset "non-real" begin + a = allowmissing(DateTime(2000, 1, 1):Day(1):DateTime(2000, 1, 20)) + a[[2, 3, 7]] .= missing + + # Median of `DateTime`s doesn't apply, so we fall back to `mode` + fill_val = mode(skipmissing(a)) + + # In this case that's just going to take the first observation it finds + @test fill_val == DateTime(2000, 1, 1) + + result = impute(a, Substitute()) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + @test result == Impute.substitute(a) + end + end + + @testset "custom statistic" begin + # Defining our missing datasets + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + + # We'll do mean - 1 std for some reason :) + μ, σ = mean_and_std(skipmissing(a)) + fill_val = μ - σ + + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + result = Impute.substitute( + a; + statistic=data -> -(mean_and_std(skipmissing(data))...) + ) + @test result == expected + end +end diff --git a/test/imputors/svd.jl b/test/imputors/svd.jl new file mode 100644 index 0000000..0f29102 --- /dev/null +++ b/test/imputors/svd.jl @@ -0,0 +1,121 @@ +@testset "SVD" begin + @testset "Default" begin + tester = ImputorTester(SVD) + test_hashing(tester) + test_equality(tester) + + # test_matrix(tester) + # Default transpose test uses `isequal`, but the SVD imputor will have a + # small floating point error.
+ @testset "Matrix" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + m = collect(reshape(a, 5, 4)) + + result = impute(m, tester.imp(; tester.kwargs...); dims=:cols) + + @testset "Base" begin + # Test that we have fewer missing values + @test count(ismissing, result) < count(ismissing, m) + @test isa(result, Matrix) + @test eltype(result) <: eltype(m) + + # Test that functional form behaves the same way + @test result == tester.f(m; dims=:cols, tester.kwargs...) + end + + @testset "In-place" begin + # Test that the in-place function return the new results and logs whether it + # successfully did it in-place + m2 = deepcopy(m) + m2_ = tester.f!(m2; dims=:cols, tester.kwargs...) + @test m2_ == result + if m2 != result + @warn "$(tester.f!) did not mutate input data of type Matrix" + end + end + + @testset "Transpose" begin + m_ = collect(m') + result_ = collect(result') + @test isapprox(tester.f(m_; dims=:rows, tester.kwargs...), result_) + @test isapprox(tester.f!(m_; dims=:rows, tester.kwargs...), result_) + end + + @testset "No missing" begin + # Test having no missing data + b = collect(reshape(allowmissing(1.0:1.0:20.0), 5, 4)) + @test impute(b, tester.imp(; tester.kwargs...); dims=:cols) == b + end + + @testset "All missing" begin + # Test having only missing data + c = missings(5, 2) + @test isequal(impute(c, tester.imp(; tester.kwargs...); dims=:cols), c) + c_ = tester.f!(deepcopy(c); dims=:cols) + @test isequal(c_, c) + end + end + # Internal `svd` call isn't supported by these type, but maybe they should be? + # test_axisarray(tester) + # test_nameddimsarray(tester) + # test_keyedarray(tester) + end + + # Test a case where we expect SVD to perform well (e.g., many variables, ) + @testset "Data match" begin + data = mapreduce(hcat, 1:1000) do i + seeds = [sin(i), cos(i), tan(i), atan(i)] + mapreduce(vcat, combinations(seeds)) do args + [ + +(args...), + *(args...), + +(args...) * 100, + +(abs.(args)...), + (+(args...) 
* 10) ^ 2, + (+(abs.(args)...) * 10) ^ 2, + log(+(abs.(args)...) * 100), + +(args...) * 100 + rand(-10:0.1:10), + ] + end + end + + # println(svd(data').S) + X = add_missings(data') + + svd_imputed = Impute.svd(X; dims=:cols) + mean_imputed = Impute.fill(copy(X); dims=:cols) + + # With sufficient correlation between the variables and enough observation we + # expect the svd imputation to perform severl times better than mean imputation. + @test nrmsd(svd_imputed, data') < nrmsd(mean_imputed, data') * 0.5 + end + + # Test a case where we know SVD imputation won't perform well + # (e.g., only a few variables, only ) + @testset "Data mismatch - too few variables" begin + data = Tables.matrix(Impute.dataset("test/table/electricity")) + X = add_missings(data) + + svd_imputed = Impute.svd(X; dims=:cols) + mean_imputed = Impute.fill(copy(X); dims=:cols) + + # If we don't have enough variables then SVD imputation will probably perform + # about as well as mean imputation. + @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9 + end + + @testset "Data mismatch - poor low rank approximations" begin + M = rand(100, 200) + data = M * M' + X = add_missings(data) + + svd_imputed = Impute.svd(X; dims=:cols) + mean_imputed = Impute.fill(copy(X); dims=:cols) + + # If most of the variance in the original data can't be explained by a small + # subset of the eigen values in the svd decomposition then our low rank approximations + # won't perform very well. 
+ @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9 + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 4fbc049..cdcbb0c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,11 +1,12 @@ using AxisArrays using AxisKeys +using CSV using Combinatorics using DataFrames using Dates using Distances +using Documenter using LinearAlgebra -using RDatasets using Random using Statistics using StatsBase @@ -17,734 +18,51 @@ using Impute using Impute: Impute, Imputor, + Chain, DropObs, DropVars, Interpolate, Fill, + KNN, LOCF, NOCB, + Replace, SRS, - Context, - WeightedContext, - ImputeError, + Standardize, + Substitute, + SVD, + Filter, + Threshold, + ThresholdError, + apply, + assert, impute, impute!, interp, - chain + run, + threshold -function add_missings(X, ratio=0.1) - result = Matrix{Union{Float64, Missing}}(X) - - for i in 1:floor(Int, length(X) * ratio) - result[rand(1:length(X))] = missing - end - - return result -end - -function add_missings_single(X, ratio=0.1) - result = Matrix{Union{Float64, Missing}}(X) - - randcols = 1:floor(Int, size(X, 2) * ratio) - for col in randcols - result[rand(1:size(X, 1)), col] = missing - end - - return result -end - -# A sequential RNG for consistent testing across julia versions -mutable struct SequentialRNG <: AbstractRNG - idx::Int -end -SequentialRNG(; start_idx=1) = SequentialRNG(start_idx) - -function Base.rand(srng::SequentialRNG, x::Vector) - srng.idx = srng.idx < length(x) ? srng.idx + 1 : 1 - return x[srng.idx] -end @testset "Impute" begin - # Defining our missing datasets - a = allowmissing(1.0:1.0:20.0) - a[[2, 3, 7]] .= missing - mask = map(!ismissing, a) - ctx = Context(; limit=0.2) - - # We call collect to not have a wrapper type that references the same data. 
- m = collect(reshape(a, 5, 4)) - - aa = AxisArray( - deepcopy(m), - Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), - Axis{:id}(1:4) - ) - - table = DataFrame( - :sin => allowmissing(sin.(1.0:1.0:20.0)), - :cos => allowmissing(sin.(1.0:1.0:20.0)), - ) - - table.sin[[2, 3, 7, 12, 19]] .= missing - - @testset "Equality" begin - @testset "$T" for T in (DropObs, DropVars, Interpolate, Fill, LOCF, NOCB, SRS) - @test T() == T() - end - end - - @testset "Drop" begin - @testset "DropObs" begin - @testset "Vector" begin - result = impute(a, DropObs(; context=ctx)) - expected = deleteat!(deepcopy(a), [2, 3, 7]) - - @test result == expected - @test result == Impute.dropobs(a; context=ctx) - - a2 = deepcopy(a) - Impute.dropobs!(a2; context=ctx) - @test a2 == expected - end - - @testset "Matrix" begin - # Because we're removing 2 of our 5 rows we need to change the limit. - ctx = Context(; limit=0.4) - result = impute(m, DropObs(; context=ctx)) - expected = m[[1, 4, 5], :] - - @test isequal(result, expected) - @test isequal(result, Impute.dropobs(m; context=ctx)) - @test isequal(collect(result'), Impute.dropobs(collect(m'); dims=2, context=ctx)) - - m_ = Impute.dropobs!(m; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original matrix - @test_broken isequal(m, expected) - @test isequal(m_, expected) - end - - @testset "Tables" begin - ctx = Context(; limit=0.4) - @testset "DataFrame" begin - df = deepcopy(table) - result = impute(df, DropObs(; context=ctx)) - expected = dropmissing(df) - - @test isequal(result, expected) - @test isequal(result, Impute.dropobs(df; context=ctx)) - - df_ = Impute.dropobs!(df; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(df, expected) - @test isequal(df_, expected) - end - - @testset "Column Table" begin - coltab = Tables.columntable(table) - - result = impute(coltab, DropObs(; context=ctx)) - expected = 
Tables.columntable(dropmissing(table)) - - @test isequal(result, expected) - @test isequal(result, Impute.dropobs(coltab; context=ctx)) - - coltab_ = Impute.dropobs!(coltab; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(coltab, expected) - @test isequal(coltab_, expected) - end - - @testset "Row Table" begin - rowtab = Tables.rowtable(table) - result = impute(rowtab, DropObs(; context=ctx)) - expected = Tables.rowtable(dropmissing(table)) - - @test isequal(result, expected) - @test isequal(result, Impute.dropobs(rowtab; context=ctx)) - - rowtab_ = Impute.dropobs!(rowtab; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - # @test_broken isequal(rowtab, expected) - @test isequal(rowtab_, expected) - end - end - - @testset "AxisArray" begin - # Because we're removing 2 of our 5 rows we need to change the limit. - ctx = Context(; limit=0.4) - result = impute(aa, DropObs(; context=ctx)) - expected = aa[[1, 4, 5], :] - - @test isequal(result, expected) - @test isequal(result, Impute.dropobs(aa; context=ctx)) - - aa_ = Impute.dropobs!(aa; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original matrix - @test_broken isequal(aa, expected) - @test isequal(aa_, expected) - end - end - - @testset "DropVars" begin - @testset "Vector" begin - @test_throws MethodError Impute.dropvars(a) - end - - @testset "Matrix" begin - ctx = Context(; limit=0.5) - result = impute(m, DropVars(; context=ctx)) - expected = copy(m)[:, 3:4] - - @test isequal(result, expected) - @test isequal(result, Impute.dropvars(m; context=ctx)) - @test isequal(collect(result'), Impute.dropvars(collect(m'); dims=2, context=ctx)) - - m_ = Impute.dropvars!(m; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original matrix - @test_broken isequal(m, expected) - @test isequal(m_, expected) - end - - 
@testset "Tables" begin - @testset "DataFrame" begin - df = deepcopy(table) - result = impute(df, DropVars(; context=ctx)) - expected = select(df, :cos) - - @test isequal(result, expected) - @test isequal(result, Impute.dropvars(df; context=ctx)) - - Impute.dropvars!(df; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(df, expected) - end - - @testset "Column Table" begin - coltab = Tables.columntable(table) - - result = impute(coltab, DropVars(; context=ctx)) - expected = Tables.columntable(TableOperations.select(coltab, :cos)) - - @test isequal(result, expected) - @test isequal(result, Impute.dropvars(coltab; context=ctx)) - - Impute.dropvars!(coltab; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(coltab, expected) - end - - @testset "Row Table" begin - rowtab = Tables.rowtable(table) - result = impute(rowtab, DropVars(; context=ctx)) - expected = Tables.rowtable(TableOperations.select(rowtab, :cos)) - - @test isequal(result, expected) - @test isequal(result, Impute.dropvars(rowtab; context=ctx)) - - Impute.dropvars!(rowtab; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(rowtab, expected) - end - end - @testset "AxisArray" begin - ctx = Context(; limit=0.5) - result = impute(aa, DropVars(; context=ctx)) - expected = copy(aa)[:, 3:4] - - @test isequal(result, expected) - @test isequal(result, Impute.dropvars(aa; context=ctx)) - - aa_ = Impute.dropvars!(aa; context=ctx) - # The mutating test is broken because we need to making a copy of - # the original matrix - @test_broken isequal(aa, expected) - @test isequal(aa_, expected) - end - end - end - - @testset "Interpolate" begin - result = impute(a, Interpolate(; context=ctx)) - @test result == collect(1.0:1.0:20) - @test result == interp(a; context=ctx) - - # Test in-place method - 
a2 = copy(a) - Impute.interp!(a2; context=ctx) - @test a2 == result - - # Test interpolation between identical points - b = ones(Union{Float64, Missing}, 20) - b[[2, 3, 7]] .= missing - @test interp(b; context=ctx) == ones(Union{Float64, Missing}, 20) - - # Test interpolation at endpoints - b = ones(Union{Float64, Missing}, 20) - b[[1, 3, 20]] .= missing - result = interp(b; context=ctx) - @test ismissing(result[1]) - @test ismissing(result[20]) - end - - @testset "Fill" begin - @testset "Value" begin - fill_val = -1.0 - result = impute(a, Fill(; value=fill_val, context=ctx)) - expected = copy(a) - expected[[2, 3, 7]] .= fill_val - - @test result == expected - @test result == Impute.fill(a; value=fill_val, context=ctx) - end - - @testset "Mean" begin - result = impute(a, Fill(; value=mean, context=ctx)) - expected = copy(a) - expected[[2, 3, 7]] .= mean(a[mask]) - - @test result == expected - @test result == Impute.fill(a; value=mean, context=ctx) - - a2 = copy(a) - Impute.fill!(a2; context=ctx) - @test a2 == result - end - - @testset "Matrix" begin - ctx = Context(; limit=1.0) - expected = Matrix(Impute.dropobs(dataset("boot", "neuro"); context=ctx)) - data = Matrix(dataset("boot", "neuro")) - - result = impute(data, Fill(; value=0.0, context=ctx)) - @test size(result) == size(data) - @test result == Impute.fill(data; value=0.0, context=ctx) - - data2 = copy(data) - Impute.fill!(data2; value=0.0, context=ctx) - @test data2 == result - end - end - - @testset "LOCF" begin - result = impute(a, LOCF(; context=ctx)) - expected = copy(a) - expected[2] = 1.0 - expected[3] = 1.0 - expected[7] = 6.0 - - @test result == expected - @test result == Impute.locf(a; context=ctx) - - a2 = copy(a) - Impute.locf!(a2; context=ctx) - @test a2 == result - end - - @testset "NOCB" begin - result = impute(a, NOCB(; context=ctx)) - expected = copy(a) - expected[2] = 4.0 - expected[3] = 4.0 - expected[7] = 8.0 - - @test result == expected - @test result == Impute.nocb(a; context=ctx) - - 
a2 = copy(a) - Impute.nocb!(a2; context=ctx) - @test a2 == result - end - - @testset "SRS" begin - result = impute(a, SRS(; rng=SequentialRNG(), context=ctx)) - expected = copy(a) - expected[2] = 4.0 - expected[3] = 5.0 - expected[7] = 6.0 - - @test result == expected - - @test result == Impute.srs(a; rng=SequentialRNG(), context=ctx) - - a2 = copy(a) - - Impute.srs!(a2; rng=SequentialRNG(), context=ctx) - @test a2 == result - end - - @testset "Not enough data" begin - ctx = Context(; limit=0.1) - @test_throws ImputeError impute(a, DropObs(; context=ctx)) - @test_throws ImputeError Impute.dropobs(a; context=ctx) - end - - @testset "Chain" begin - orig = dataset("boot", "neuro") - ctx = Context(; limit=1.0) - - @testset "DataFrame" begin - result = Impute.interp(orig; context=ctx) |> Impute.locf!() |> Impute.nocb!() - - @test size(result) == size(orig) - # Confirm that we don't have any more missing values - @test all(!ismissing, Matrix(result)) - - # We can also use the Chain type with explicit Imputor types - result2 = impute( - orig, - Impute.Chain( - Impute.Interpolate(; context=ctx), - Impute.LOCF(), - Impute.NOCB() - ), - ) - - # Test creating a Chain via Imputor composition - imp = Impute.Interpolate(; context=ctx) ∘ Impute.LOCF() ∘ Impute.NOCB() - result3 = impute(orig, imp) - @test result == result2 - @test result == result3 - - @testset "GroupedDataFrame" begin - T = NamedTuple{(:hod, :obj, :val), Tuple{Int, Int, Union{Float64, Missing}}} - - df = map(Iterators.product(1:24, 1:8, 0:19)) do t - hod, obj, x = t - # Deterministically return some `missing`s per hod/obj pair - return if x in (0, 5, 12, 19) - T((hod, obj, missing)) - else - T((hod, obj, sin(hod) * cos(x) + obj)) - end - end |> DataFrame - - gdf1 = groupby(deepcopy(df), [:hod, :obj]) - gdf2 = groupby(df, [:hod, :obj]) - - f1 = Impute.interp(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() - f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() - - result = mapreduce(f1, vcat, gdf1) 
- # Check that the result isn't the same as the source dataframe - @test df != result - # Check that the size is still the same since we didn't drop any rows - @test size(result) == size(df) - # Check that there are no remaining missing values - @test all(!ismissing, Tables.matrix(result)) - # Double check that our source dataframe still contains missings - @test any(ismissing, Tables.matrix(df)) - - # Test that we can also mutate the dataframe directly - map(f2, gdf2) - # Now we can check that we've replaced all the missing values in df - @test all(!ismissing, Tables.matrix(df)) - end - end - - @testset "Column Table" begin - result = Tables.columntable(orig) |> - Impute.interp!(; context=ctx) |> - Impute.locf!() |> - Impute.nocb!() |> - Tables.matrix - - @test size(result) == size(orig) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - - @testset "Row Table" begin - result = Tables.rowtable(orig) |> - Impute.interp!(; context=ctx) |> - Impute.locf!() |> - Impute.nocb!() |> - Tables.matrix - - @test size(result) == size(orig) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - - @testset "Matrix" begin - data = Matrix(orig) - result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() - - @test size(result) == size(data) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - - @testset "AxisArray" begin - data = AxisArray( - Matrix(orig), - Axis{:row}(1:size(orig, 1)), - Axis{:V}(names(orig)), - ) - result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() - - @test size(result) == size(data) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - - @testset "KeyedArray" begin - data = KeyedArray(Matrix(orig); row=1:size(orig, 1), V=names(orig)) - result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() - - @test size(result) == size(data) - 
# Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - end - - @testset "Alternate missing functions" begin - ctx1 = Context(; limit=1.0) - ctx2 = Context(; limit=1.0, is_missing=isnan) - data1 = dataset("boot", "neuro") # Missing values with `missing` - data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN` - - @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1) - - result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs() - result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs(; context=ctx2) - - @test result1 == result2 - end - - @testset "Contexts" begin - @testset "Base" begin - ctx = Context(; limit=0.1) - @test_throws ImputeError Impute.dropobs(a; context=ctx) - @test_throws ImputeError impute(a, DropObs(; context=ctx)) - end - - @testset "Weighted" begin - # If we use an exponentially weighted context then we won't pass the limit - # because missing earlier observations is less important than later ones. - ctx = WeightedContext(eweights(20, 0.3); limit=0.1) - @test isa(ctx, WeightedContext) - result = impute(a, DropObs(; context=ctx)) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) - @test result == expected - - # If we reverse the weights such that earlier observations are more important - # then our previous limit of 0.2 won't be enough to succeed. 
- ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) - @test_throws ImputeError impute(a, DropObs(; context=ctx)) - end - end - - @testset "Utils" begin - M = [1.0 2.0 3.0 4.0 5.0; 1.1 2.2 3.3 4.4 5.5] - - @testset "obswise" begin - @test map(sum, Impute.obswise(M; dims=2)) == [2.1, 4.2, 6.3, 8.4, 10.5] - @test map(sum, Impute.obswise(M; dims=1)) == [15, 16.5] - end - - @testset "varwise" begin - @test map(sum, Impute.varwise(M; dims=2)) == [15, 16.5] - @test map(sum, Impute.varwise(M; dims=1)) == [2.1, 4.2, 6.3, 8.4, 10.5] - end - - @testset "filterobs" begin - @test Impute.filterobs(x -> sum(x) > 5.0, M; dims=2) == M[:, 3:5] - @test Impute.filterobs(x -> sum(x) > 15.0, M; dims=1) == M[[false, true], :] - end - - @testset "filtervars" begin - @test Impute.filtervars(x -> sum(x) > 15.0, M; dims=2) == M[[false, true], :] - @test Impute.filtervars(x -> sum(x) > 5.0, M; dims=1) == M[:, 3:5] - end - end - - @testset "KNN" begin - @testset "Iris" begin - # Reference - # P. Schimitt, et. 
al - # A comparison of six methods for missing data imputation - iris = dataset("datasets", "iris") - iris2 = filter(row -> row[:Species] == "versicolor" || row[:Species] == "virginica", iris) - data = Array(iris2[:, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]]) - num_tests = 100 - - @testset "Iris - 0.15" begin - X = add_missings(data, 0.15) - - knn_nrmsd, mean_nrmsd = 0.0, 0.0 - - for i = 1:num_tests - knn_imputed = impute(copy(X), Impute.KNN(; k=2)) - mean_imputed = impute(copy(X), - Fill(; value=mean, context=Context(; limit=1.0))) - - knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i - mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i - end - - @test knn_nrmsd < mean_nrmsd - # test type stability - @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2))) - @test typeof(X) == typeof(impute(copy(X), Fill(; value=mean, - context=Context(; limit=1.0)))) - end - - @testset "Iris - 0.25" begin - X = add_missings(data, 0.25) - - knn_nrmsd, mean_nrmsd = 0.0, 0.0 - - for i = 1:num_tests - knn_imputed = impute(copy(X), Impute.KNN(; k=2)) - mean_imputed = impute(copy(X), - Fill(; value=mean, context=Context(; limit=1.0))) - - knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i - mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i - end - - @test knn_nrmsd < mean_nrmsd - # test type stability - @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2))) - @test typeof(X) == typeof(impute(copy(X), Fill(; value=mean, - context=Context(; limit=1.0)))) - end - - @testset "Iris - 0.35" begin - X = add_missings(data, 0.35) - - knn_nrmsd, mean_nrmsd = 0.0, 0.0 - - for i = 1:num_tests - knn_imputed = impute(copy(X), Impute.KNN(; k=2)) - mean_imputed = impute(copy(X), - Fill(; value=mean, context=Context(; limit=1.0))) - - knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data, knn_imputed)) / i - mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data, mean_imputed)) / i - end - - @test knn_nrmsd < mean_nrmsd - # test 
type stability - @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2))) - @test typeof(X) == typeof(impute(copy(X), Fill(; value=mean, - context=Context(; limit=1.0)))) - end - end - - # Test a case where we expect kNN to perform well (e.g., many variables, ) - @testset "Data match" begin - data = mapreduce(hcat, 1:1000) do i - seeds = [sin(i), cos(i), tan(i), atan(i)] - mapreduce(vcat, combinations(seeds)) do args - [ - +(args...), - *(args...), - +(args...) * 100, - +(abs.(args)...), - (+(args...) * 10) ^ 2, - (+(abs.(args)...) * 10) ^ 2, - log(+(abs.(args)...) * 100), - +(args...) * 100 + rand(-10:0.1:10), - ] - end - end - - X = add_missings(data') - num_tests = 100 - - knn_nrmsd, mean_nrmsd = 0.0, 0.0 - - for i = 1:num_tests - knn_imputed = impute(copy(X), Impute.KNN(; k=2)) - mean_imputed = impute(copy(X), - Fill(; value=mean, context=Context(; limit=1.0))) - - knn_nrmsd = ((i - 1) * knn_nrmsd + nrmsd(data', knn_imputed)) / i - mean_nrmsd = ((i - 1) * mean_nrmsd + nrmsd(data', mean_imputed)) / i - end - - @test knn_nrmsd < mean_nrmsd - # test type stability - @test typeof(X) == typeof(impute(copy(X), Impute.KNN(; k=2))) - @test typeof(X) == typeof(impute(copy(X), Fill(; value=mean, - context=Context(; limit=1.0)))) - end - end - - include("deprecated.jl") include("testutils.jl") - @testset "$T" for T in (DropObs, DropVars, Interpolate, Fill, LOCF, NOCB) - test_all(ImputorTester(T)) - end - - @testset "SVD" begin - # Test a case where we expect SVD to perform well (e.g., many variables, ) - @testset "Data match" begin - data = mapreduce(hcat, 1:1000) do i - seeds = [sin(i), cos(i), tan(i), atan(i)] - mapreduce(vcat, combinations(seeds)) do args - [ - +(args...), - *(args...), - +(args...) * 100, - +(abs.(args)...), - (+(args...) * 10) ^ 2, - (+(abs.(args)...) * 10) ^ 2, - log(+(abs.(args)...) * 100), - +(args...) 
* 100 + rand(-10:0.1:10), - ] - end - end - - # println(svd(data').S) - X = add_missings(data') - - svd_imputed = Impute.svd(X) - mean_imputed = Impute.fill(copy(X)) - - # With sufficient correlation between the variables and enough observation we - # expect the svd imputation to perform severl times better than mean imputation. - @test nrmsd(svd_imputed, data') < nrmsd(mean_imputed, data') * 0.5 - end - - # Test a case where we know SVD imputation won't perform well - # (e.g., only a few variables, only ) - @testset "Data mismatch - too few variables" begin - data = Matrix(dataset("Ecdat", "Electricity")) - X = add_missings(data) - - svd_imputed = Impute.svd(X) - mean_imputed = Impute.fill(copy(X)) - - # If we don't have enough variables then SVD imputation will probably perform - # about as well as mean imputation. - @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9 - end - - @testset "Data mismatch - poor low rank approximations" begin - M = rand(100, 200) - data = M * M' - X = add_missings(data) - - svd_imputed = Impute.svd(X) - mean_imputed = Impute.fill(copy(X)) - - # If most of the variance in the original data can't be explained by a small - # subset of the eigen values in the svd decomposition then our low rank approximations - # won't perform very well. 
- @test nrmsd(svd_imputed, data) > nrmsd(mean_imputed, data) * 0.9 - end - end + include("assertions.jl") + include("chain.jl") + include("data.jl") + include("deprecated.jl") + include("filter.jl") + include("imputors/interp.jl") + include("imputors/knn.jl") + include("imputors/locf.jl") + include("imputors/nocb.jl") + include("imputors/replace.jl") + include("imputors/srs.jl") + include("imputors/standardize.jl") + include("imputors/substitute.jl") + include("imputors/svd.jl") + include("utils.jl") + + # Start running doctests before we wrap up technical changes and work + # on more documentation + doctest(Impute) end diff --git a/test/testutils.jl b/test/testutils.jl index f0d8350..20e2ef6 100644 --- a/test/testutils.jl +++ b/test/testutils.jl @@ -1,4 +1,36 @@ +function add_missings(X, ratio=0.1) + result = Matrix{Union{Float64, Missing}}(X) + + for i in 1:floor(Int, length(X) * ratio) + result[rand(1:length(X))] = missing + end + + return result +end + +function add_missings_single(X, ratio=0.1) + result = Matrix{Union{Float64, Missing}}(X) + + randcols = 1:floor(Int, size(X, 2) * ratio) + for col in randcols + result[rand(1:size(X, 1)), col] = missing + end + + return result +end + +# A sequential RNG for consistent testing across julia versions +mutable struct SequentialRNG <: AbstractRNG + idx::Int +end +SequentialRNG(; start_idx=1) = SequentialRNG(start_idx) + +function Base.rand(srng::SequentialRNG, x::Vector) + srng.idx = srng.idx < length(x) ? srng.idx + 1 : 1 + return x[srng.idx] +end + struct ImputorTester{I<:Imputor} imp::Type{I} f::Function @@ -13,24 +45,31 @@ function ImputorTester(imp::Type{<:Imputor}; kwargs...) 
imp, getfield(Impute, Symbol(fname)), getfield(Impute, Symbol(fname * "!")), - merge( - NamedTuple{keys(kwargs)}(values(kwargs)), - (context = Context(; limit=1.0),), - ), + NamedTuple{keys(kwargs)}(values(kwargs)), ) end function test_all(tester::ImputorTester) + test_hashing(tester) test_equality(tester) test_vector(tester) test_matrix(tester) + test_cube(tester) test_dataframe(tester) test_groupby(tester) test_axisarray(tester) + test_nameddimsarray(tester) + test_keyedarray(tester) test_columntable(tester) test_rowtable(tester) end +function test_hashing(tester::ImputorTester) + @testset "Hashing" begin + @test hash(tester.imp()) == hash(tester.imp()) + end +end + function test_equality(tester::ImputorTester) @testset "Equality" begin @test tester.imp() == tester.imp() @@ -75,19 +114,7 @@ function test_vector(tester::ImputorTester) @testset "All missing" begin # Test having only missing data c = fill(missing, 10) - if tester.imp != Impute.DropObs - @test isequal(impute(c, tester.imp(; tester.kwargs...)), c) - else - @test impute(c, tester.imp(; tester.kwargs...)) == empty(c) - end - end - - @testset "Too many missing values" begin - # Test Context error condition - c = fill(missing, 10) - kwargs = merge(tester.kwargs, (context = Context(; limit=0.1),)) - @test_throws ImputeError impute(c, tester.imp(; kwargs...)) - @test_throws ImputeError tester.f(c; kwargs...) + @test isequal(impute(c, tester.imp(; tester.kwargs...)), c) end end end @@ -99,7 +126,7 @@ function test_matrix(tester::ImputorTester) a[[2, 3, 7]] .= missing m = collect(reshape(a, 5, 4)) - result = impute(m, tester.imp(; tester.kwargs...)) + result = impute(m, tester.imp(; tester.kwargs...); dims=:cols) @testset "Base" begin # Test that we have fewer missing values @@ -108,14 +135,14 @@ function test_matrix(tester::ImputorTester) @test eltype(result) <: eltype(m) # Test that functional form behaves the same way - @test result == tester.f(m; tester.kwargs...) 
+ @test result == tester.f(m; dims=:cols, tester.kwargs...) end @testset "In-place" begin # Test that the in-place function return the new results and logs whether it # successfully did it in-place m2 = deepcopy(m) - m2_ = tester.f!(m2; tester.kwargs...) + m2_ = tester.f!(m2; dims=:cols, tester.kwargs...) @test m2_ == result if m2 != result @warn "$(tester.f!) did not mutate input data of type Matrix" @@ -125,37 +152,70 @@ function test_matrix(tester::ImputorTester) @testset "Transpose" begin m_ = collect(m') result_ = collect(result') - @test isequal(tester.f(m_; dims=2, tester.kwargs...), result_) + @test isequal(tester.f(m_; dims=:rows, tester.kwargs...), result_) - if !(tester.imp in (DropVars, DropObs, SRS)) - @test isequal(tester.f!(m_; dims=2, tester.kwargs...), result_) + if tester.imp != SRS + @test isequal(tester.f!(m_; dims=:rows, tester.kwargs...), result_) end end @testset "No missing" begin # Test having no missing data b = collect(reshape(allowmissing(1.0:1.0:20.0), 5, 4)) - @test impute(b, tester.imp(; tester.kwargs...)) == b + @test impute(b, tester.imp(; tester.kwargs...); dims=:cols) == b end @testset "All missing" begin # Test having only missing data - c = fill(missing, 5, 2) - if tester.imp == DropObs - @test impute(c, tester.imp(; tester.kwargs...)) == Matrix{Missing}(missing, 0, 2) - elseif tester.imp == DropVars - @test impute(c, tester.imp(; tester.kwargs...)) == Matrix{Missing}(missing, 5, 0) - else - @test isequal(impute(c, tester.imp(; tester.kwargs...)), c) + c = missings(5, 2) + @test isequal(impute(c, tester.imp(; tester.kwargs...); dims=:cols), c) + c_ = impute!(deepcopy(c), tester.imp(; tester.kwargs...); dims=:cols) + @test isequal(c_, c) + end + end +end + +function test_cube(tester::ImputorTester) + @testset "Cube" begin + a = allowmissing(1.0:1.0:60.0) + a[[2, 7, 18, 23, 34, 41, 55, 59, 60]] .= missing + C = collect(reshape(a, 5, 4, 3)) + + result = impute(C, tester.imp(; tester.kwargs...); dims=3) + + @testset "Base" begin + # 
Test that we have fewer missing values + @test count(ismissing, result) < count(ismissing, C) + @test isa(result, Array{Union{Float64, Missing}, 3}) + @test eltype(result) <: eltype(C) + + # Test that functional form behaves the same way + @test result == tester.f(C; dims=3, tester.kwargs...) + end + + @testset "In-place" begin + # Test that the in-place function return the new results and logs whether it + # successfully did it in-place + C2 = deepcopy(C) + C2_ = tester.f!(C2; dims=3, tester.kwargs...) + @test C2_ == result + if C2 != result + @warn "$(tester.f!) did not mutate input data of type Matrix" end end - @testset "Too many missing values" begin - # Test Context error condition - c = fill(missing, 5, 2) - kwargs = merge(tester.kwargs, (context = Context(; limit=0.1),)) - @test_throws ImputeError impute(c, tester.imp(; kwargs...)) - @test_throws ImputeError tester.f(c; kwargs...) + @testset "No missing" begin + # Test having no missing data + B = collect(reshape(allowmissing(1.0:1.0:60.0), 5, 4, 3)) + @test impute(B, tester.imp(; tester.kwargs...); dims=3) == B + end + + @testset "All missing" begin + # Test having only missing data + M = missings(5, 4, 3) + @test isequal(impute(M, tester.imp(; tester.kwargs...); dims=3), M) + M_ = impute!(deepcopy(M), tester.imp(; tester.kwargs...); dims=3) + @test isequal(M_, M) end end end @@ -206,25 +266,7 @@ function test_dataframe(tester::ImputorTester) :sin => fill(missing, 10), :cos => fill(missing, 10), ) - if tester.imp == DropObs - @test impute(c, tester.imp(; tester.kwargs...)) == DataFrame() - elseif tester.imp == DropVars - # https://github.com/JuliaData/Tables.jl/issues/117 - @test impute(c, tester.imp(; tester.kwargs...)) == DataFrame() - else - @test isequal(impute(c, tester.imp(; tester.kwargs...)), c) - end - end - - @testset "Too many missing values" begin - # Test Context error condition - c = DataFrame( - :sin => fill(missing, 10), - :cos => fill(missing, 10), - ) - kwargs = merge(tester.kwargs, 
(context = Context(; limit=0.1),)) - @test_throws ImputeError impute(c, tester.imp(; kwargs...)) - @test_throws ImputeError tester.f(c; kwargs...) + @test isequal(impute(c, tester.imp(; tester.kwargs...)), c) end end end @@ -250,13 +292,7 @@ function test_groupby(tester::ImputorTester) result = mapreduce(tester.f, vcat, groupby(df, [:hod, :obj])) @test !isequal(df, result) - if tester.imp == DropObs - # If we've dropped some observations then we should get back - # all, but the 4 missing observations per 24 hods and 8 objs. - @test size(result) == (24 * 8 * 16, 3) - else - @test size(result) == size(df) - end + @test size(result) == size(df) # Test that we successfully imputed something. # We expect LOCF and NOCB to leave `missing`s at the start and end of each @@ -280,7 +316,7 @@ function test_axisarray(tester::ImputorTester) Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), Axis{:id}(1:4) ) - result = impute(aa, tester.imp(; tester.kwargs...)) + result = impute(aa, tester.imp(; tester.kwargs...); dims=:cols) @testset "Base" begin # Test that we have fewer missing values @@ -289,14 +325,14 @@ function test_axisarray(tester::ImputorTester) @test eltype(result) <: eltype(aa) # Test that functional form behaves the same way - @test result == tester.f(aa; tester.kwargs...) + @test result == tester.f(aa; tester.kwargs..., dims=:cols) end @testset "In-place" begin # Test that the in-place function return the new results and logs whether it # successfully did it in-place aa2 = deepcopy(aa) - aa2_ = tester.f!(aa2; tester.kwargs...) + aa2_ = tester.f!(aa2; tester.kwargs..., dims=:cols) @test aa2_ == result if aa2 != result @warn "$(tester.f!) 
did not mutate input data of type AxisArray" @@ -305,6 +341,78 @@ function test_axisarray(tester::ImputorTester) end end +function test_nameddimsarray(tester::ImputorTester) + @testset "NamedDimsArray" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + m = collect(reshape(a, 5, 4)) + nda = NamedDimsArray(deepcopy(m), (:time, :id)) + result = impute(nda, tester.imp(; tester.kwargs...); dims=:id) + + @testset "Base" begin + # Test that we have fewer missing values + @test count(ismissing, result) < count(ismissing, nda) + @test isa(result, NamedDimsArray) + @test eltype(result) <: eltype(nda) + + # Test that functional form behaves the same way + @test result == tester.f(nda; tester.kwargs..., dims=:id) + + # Test using cols still works + @test result == tester.f(nda; tester.kwargs..., dims=:cols) + end + + @testset "In-place" begin + # Test that the in-place function returns the new results and logs whether it + # successfully did it in-place + nda2 = deepcopy(nda) + nda2_ = tester.f!(nda2; tester.kwargs..., dims=:cols) + @test nda2_ == result + if nda2 != result + @info "$(tester.f!) 
did not mutate input data of type NamedDimsArray" + end + end + end +end + +function test_keyedarray(tester::ImputorTester) + @testset "KeyedArray" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + m = collect(reshape(a, 5, 4)) + ka = KeyedArray( + deepcopy(m); + time=DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9), + id=1:4, + ) + result = impute(ka, tester.imp(; tester.kwargs...); dims=:id) + + @testset "Base" begin + # Test that we have fewer missing values + @test count(ismissing, result) < count(ismissing, ka) + @test isa(result, KeyedArray) + @test eltype(result) <: eltype(ka) + + # Test that functional form behaves the same way + @test result == tester.f(ka; tester.kwargs..., dims=:id) + + # Test using cols still works + @test result == tester.f(ka; tester.kwargs..., dims=:cols) + end + + @testset "In-place" begin + # Test that the in-place function returns the new results and logs whether it + # successfully did it in-place + ka2 = deepcopy(ka) + ka2_ = tester.f!(ka2; tester.kwargs..., dims=:cols) + @test ka2_ == result + if ka2 != result + @info "$(tester.f!) did not mutate input data of type KeyedArray" + end + end + end +end + function test_columntable(tester::ImputorTester) @testset "Column Table" begin coltab = ( @@ -360,7 +468,7 @@ function test_rowtable(tester::ImputorTester) end @testset "In-place" begin - # Test that the in-place function return the new results and logs whether it + # Test that the in-place function returns the new results and logs whether it # successfully did it in-place rowtab2 = deepcopy(rowtab) rowtab2_ = tester.f!(rowtab2; tester.kwargs...) 
diff --git a/test/utils.jl b/test/utils.jl new file mode 100644 index 0000000..bf873d0 --- /dev/null +++ b/test/utils.jl @@ -0,0 +1,12 @@ +@testset "Utilities" begin + @testset "Impute.dim" begin + X = rand(10, 5) + KA = KeyedArray(X; A=1:10, B=collect("abcde")) + + @test Impute.dim(X, 1) == Impute.dim(X, :rows) == Impute.dim(KA, :A) + @test first(eachslice(X, dims=1)) == first(eachslice(KA, dims=1)) == first(eachslice(KA, dims=:A)) + + @test Impute.dim(X, 2) == Impute.dim(X, :cols) == Impute.dim(KA, :B) + @test first(eachslice(X, dims=2)) == first(eachslice(KA, dims=2)) == first(eachslice(KA, dims=:B)) + end +end