From 7db6c529a4d5b26f62c0dd290bd8dba625e4235d Mon Sep 17 00:00:00 2001 From: Oren Ben-Kiki Date: Wed, 3 Jul 2024 09:13:09 +0300 Subject: [PATCH] Add access to gmara gene lists. --- Project.toml | 5 + deps/document.jl | 10 +- deps/jet.py | 2 +- docs/v0.1.0/.documenter-siteinfo.json | 2 +- docs/v0.1.0/anndata_format.html | 4 + docs/v0.1.0/boxes.html | 4 + docs/v0.1.0/contracts.html | 6 +- docs/v0.1.0/defaults.html | 4 + docs/v0.1.0/gmara.html | 467 ++++++++++++++++++++++++++ docs/v0.1.0/identify_genes.html | 4 + docs/v0.1.0/index.html | 48 ++- docs/v0.1.0/objects.inv | Bin 834 -> 973 bytes docs/v0.1.0/search_index.js | 2 +- src/Metacells.jl | 3 + src/gmara.jl | 309 +++++++++++++++++ src/gmara.md | 27 ++ test/gmara.jl | 51 +++ test/runtests.jl | 2 + 18 files changed, 944 insertions(+), 6 deletions(-) create mode 100644 docs/v0.1.0/gmara.html create mode 100644 src/gmara.jl create mode 100644 src/gmara.md create mode 100644 test/gmara.jl diff --git a/Project.toml b/Project.toml index cd72497..56243fe 100644 --- a/Project.toml +++ b/Project.toml @@ -6,12 +6,17 @@ version = "0.1.0" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +ConcurrentUtils = "3df5f688-6c4c-4767-8685-17f5ad261477" Daf = "1375bf9c-a47d-45a1-aad5-626dd8629d98" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/deps/document.jl b/deps/document.jl index 57ae3c8..05d0d2c 100644 --- a/deps/document.jl +++ b/deps/document.jl @@ 
-46,7 +46,15 @@ makedocs(; prettyurls = false, size_threshold_warn = 200 * 2^10, ), - pages = ["index.md", "contracts.md", "identify_genes.md", "boxes.md", "anndata_format.md", "defaults.md"], + pages = [ + "index.md", + "gmara.md", + "contracts.md", + "identify_genes.md", + "boxes.md", + "anndata_format.md", + "defaults.md", + ], ) if seen_problems diff --git a/deps/jet.py b/deps/jet.py index dfefb2b..f142101 100644 --- a/deps/jet.py +++ b/deps/jet.py @@ -72,7 +72,7 @@ def is_disabled(path, line): context_changed = True depth = len(line.split(' ')[0]) - while len(context_lines) >= depth: + while len(context_lines) >= max(depth, 1): context_lines.pop() context_disabled.pop() diff --git a/docs/v0.1.0/.documenter-siteinfo.json b/docs/v0.1.0/.documenter-siteinfo.json index a643b67..69911a7 100644 --- a/docs/v0.1.0/.documenter-siteinfo.json +++ b/docs/v0.1.0/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-06-26T11:20:38","documenter_version":"1.4.1"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-07-03T09:12:26","documenter_version":"1.4.1"}} \ No newline at end of file diff --git a/docs/v0.1.0/anndata_format.html b/docs/v0.1.0/anndata_format.html index 9e1b509..2889523 100644 --- a/docs/v0.1.0/anndata_format.html +++ b/docs/v0.1.0/anndata_format.html @@ -53,6 +53,10 @@
  • +Gmara + +
  • +
  • Contracts
  • diff --git a/docs/v0.1.0/boxes.html b/docs/v0.1.0/boxes.html index 7e7c903..eabef73 100644 --- a/docs/v0.1.0/boxes.html +++ b/docs/v0.1.0/boxes.html @@ -53,6 +53,10 @@
  • +Gmara + +
  • +
  • Contracts
  • diff --git a/docs/v0.1.0/contracts.html b/docs/v0.1.0/contracts.html index 6c9094e..c4c2625 100644 --- a/docs/v0.1.0/contracts.html +++ b/docs/v0.1.0/contracts.html @@ -52,6 +52,10 @@ Metacells +
  • +Gmara + +
  • Contracts @@ -988,7 +992,7 @@

  • +Gmara + +
  • +
  • Contracts
  • diff --git a/docs/v0.1.0/gmara.html b/docs/v0.1.0/gmara.html new file mode 100644 index 0000000..1ad4671 --- /dev/null +++ b/docs/v0.1.0/gmara.html @@ -0,0 +1,467 @@ + + + + + + + +Gmara · Metacells.jl v0.1.0 + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    + + + + +
    +
    +

    +Gmara + + + + + +

    +
    +
    + + + +Metacells.Gmara + + — +Module + +
    +
    +
    +

    Access gene names lists from +Gmara +. +

    +
    +
    Note +
    +
    +

    All the functions here are thread-safe and can also be invoked from multiple parallel processes using the same + +CACHE_DIR + +. This directory can even be shared between multiple users as long as they have read-write permissions to the shared directory. This should even work on NFS mounted volumes shared between multiple servers. +

    +
    +
    +
    +
    +
    +

    +Configuration + + + + + +

    +
    +
    + + + +Metacells.Gmara.CACHE_DIR + + — +Constant + +
    +
    +
    +

    The default ( +$HOME/.cache/gmara +) location of the cache of downloaded Gmara data files. +

    +

    You can override this by setting the +METACELLS_GMARA_CACHE_DIR + environment variable, or by passing an explicit +cache_dir + parameter to the functions. +

    +

    The top-level under this is the version indicator, where +main + is always the latest and greatest version. Under each version we store the files in the same path as in github, with a +.gz + suffix for the compressed raw data, +.jl_set.gz + for serialized Julia set objects, and +.lock + for temporary lock files for coordinating between parallel processes. +

    +
    +
    +
    +
    +
    + + + +Metacells.Gmara.TIMEOUT + + — +Constant + +
    +
    +
    +

    The default timeout in seconds (10) for waiting for a lock file in the Gmara cache. If not positive, will wait forever. If a process crashes very badly then a lock file may be left behind and may need to be removed by hand to allow access to the data. +

    +

    You can override this by setting the +METACELLS_GMARA_TIMEOUT + environment variable, or by passing an explicit +timeout + parameter to the functions. +

    +
    +
    +
    +

    +Functions + + + + + +

    +
    +
    + + + +Metacells.Gmara.normalize_gene_name + + — +Function + +
    +
    +
    +
    +normalize_gene_name(name::AbstractString; namespace::AbstractString)::AbstractString
    +
    +
    +

    Normalize a gene name in some namespace. In most namespaces, this means removing the +.[0-9] + version suffix from the name, and converting the name to upper case. To look up a name in a list or a namespace, you need to normalize the query gene name accordingly. The UCSC namespace is an exception in that it is all-lower-case and the +.[0-9] + suffix seems to be an inherent part of the identifier. +

    +
    +
    +
    +
    +
    + + + +Metacells.Gmara.gmara_namespace + + — +Function + +
    +
    +
    +
    +gmara_namespace(;
    +    species::AbstractString,
    +    namespace::AbstractString,
    +    version::AbstractString = "main"
    +)::AbstractSet{<:AbstractString}
    +
    +
    +

    Return the set of names in a namespace of genes of some species. As usual in Gmara, this includes everything that may be used as a name, e.g. for Ensembl it includes genes, transcripts and proteins; for Symbol it includes genes and clones; etc. +

    +
    +
    +
    +
    +
    + + + +Metacells.Gmara.gmara_list + + — +Function + +
    +
    +
    +
    +gmara_list(;
    +    species::AbstractString,
    +    namespace::AbstractString,
    +    list::AbstractString,
    +    version::AbstractString = "main"
    +)::AbstractSet{<:AbstractString}
    +
    +
    +

    Return the set of names in a list in a namespace of genes of some species. This returns all the names that are (probably) in the list; if a name isn't in the result, it is almost certain it does not belong in the list. As usual in Gmara, this includes everything that may be used as a name, e.g. for Ensembl it includes genes, transcripts and proteins; for Symbol it includes genes and clones; etc. +

    +
    +
    +
    +
    +
    + + + +Metacells.Gmara.empty_gmara_cache! + + — +Function + +
    +
    +
    +
    +empty_gmara_cache!()::Nothing
    +
    +
    +

    All requests are cached in-memory. This makes repeated requests cheap. This consumes some (modest amount of) memory; also, if the data in the server has been updated (which rarely happens), you will keep getting the old result. This function releases all the memory and forces all subsequent requests to query the server. (In the common case the server tells us our disk cache data is up to date, so we don't re-download it.) +

    +
    +
    +
    +

    +Index + + + + + +

    + +
    + +
    + +
    + + diff --git a/docs/v0.1.0/identify_genes.html b/docs/v0.1.0/identify_genes.html index f0c92d5..9c1aa28 100644 --- a/docs/v0.1.0/identify_genes.html +++ b/docs/v0.1.0/identify_genes.html @@ -53,6 +53,10 @@
  • +Gmara + +
  • +
  • Contracts
  • diff --git a/docs/v0.1.0/index.html b/docs/v0.1.0/index.html index ccbe506..1889154 100644 --- a/docs/v0.1.0/index.html +++ b/docs/v0.1.0/index.html @@ -61,6 +61,10 @@
  • +Gmara + +
  • +
  • Contracts
  • @@ -202,6 +206,12 @@

  • + +Metacells.Gmara + + +
  • +
  • Metacells.IdentifyGenes @@ -226,6 +236,18 @@

  • + +Metacells.Gmara.CACHE_DIR + + +
  • +
  • + +Metacells.Gmara.TIMEOUT + + +
  • +
  • Metacells.AnnDataFormat.CopyAnnData @@ -412,6 +434,30 @@

  • + +Metacells.Gmara.empty_gmara_cache! + + +
  • +
  • + +Metacells.Gmara.gmara_list + + +
  • +
  • + +Metacells.Gmara.gmara_namespace + + +
  • +
  • + +Metacells.Gmara.normalize_gene_name + + +
  • +
  • Metacells.IdentifyGenes.compute_genes_divergence! @@ -432,7 +478,7 @@