Using semsimian v0.1.18 that writes output directly rather than using oaklib to write output #609

Closed

wants to merge 40 commits into from

Changes from 21 commits

Commits (40)
5dc1893
Using semsimian v0.1.17-rc2 that writes output directly rather than u…
hrshdhgd Jul 11, 2023
f62e096
~quick~ => --low-memory
hrshdhgd Jul 12, 2023
514a44b
TODO added as comment
hrshdhgd Jul 12, 2023
15fbce3
passing outfile
hrshdhgd Jul 13, 2023
32eba2e
get column names from TermPairwiseSimilarity
hrshdhgd Jul 13, 2023
25c588d
formatted
hrshdhgd Jul 13, 2023
4608fc2
bug fix: all values must be a str to run `match`
hrshdhgd Jul 14, 2023
288fcd5
improved column generation from class attr
hrshdhgd Jul 14, 2023
64fea01
fill-table used via subprocess
hrshdhgd Jul 14, 2023
1366017
formatted
hrshdhgd Jul 14, 2023
724b91e
added gilda dependency to tox
hrshdhgd Jul 14, 2023
29b9cd8
poetry update uses latest semsimian rc3 version
hrshdhgd Jul 14, 2023
6a4ca3c
added cosine score to similarity model
hrshdhgd Jul 15, 2023
e27ad72
added cosine score to similarity model
hrshdhgd Jul 15, 2023
d97b3b7
bumped semsimian version
hrshdhgd Jul 15, 2023
667276b
cosine similarity now calculated
hrshdhgd Jul 15, 2023
f66f27b
Semsimian object creation required term_pairwise_similarity_attributes
hrshdhgd Jul 18, 2023
f3e5bd5
Addresses #619
hrshdhgd Jul 21, 2023
fa7f3fa
poetry updated to latest version of semsimian
hrshdhgd Jul 21, 2023
513c2b7
linted and docstrings added
hrshdhgd Jul 21, 2023
65f0dbf
poetry update
hrshdhgd Jul 25, 2023
286140a
removed subprocess for autolabel
hrshdhgd Jul 28, 2023
5f711d9
switched --embeddings-file to more generic --term-vectors
hrshdhgd Jul 28, 2023
c23d28a
Added docs for --term-vectors
hrshdhgd Jul 28, 2023
54e7cfa
Added config file and renamed create object function
hrshdhgd Jul 28, 2023
bd2325e
Added docs
hrshdhgd Jul 28, 2023
7cc87ac
Added documentation
hrshdhgd Jul 28, 2023
4219e5a
formatted warning
hrshdhgd Jul 28, 2023
041e948
--low-memory should be False by default.
hrshdhgd Jul 28, 2023
b2ec561
typo fix
hrshdhgd Aug 3, 2023
bca6c57
pass resource path to rust
hrshdhgd Aug 4, 2023
3c9a249
move semsimian config file
hrshdhgd Aug 4, 2023
05b43da
Updated docstring
hrshdhgd Aug 4, 2023
d0992fd
formatted
hrshdhgd Aug 4, 2023
b31ae1d
updated poetry
hrshdhgd Aug 4, 2023
7b7f518
Commented out codeblock that seems redundant
hrshdhgd Aug 4, 2023
e9ca072
deleted commented codeblock
hrshdhgd Aug 4, 2023
de0377b
not needed since done by semsimianImplementation
hrshdhgd Aug 4, 2023
8af2b71
added code to support semsimian's termset similarity
hrshdhgd Aug 7, 2023
2882d3b
Merge branch 'main' into experimental-semsimian
hrshdhgd Aug 25, 2023
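
Taken together, the commits above add a --low-memory path to the `similarity` command, in which semsimian (Rust) writes the pairwise-similarity TSV directly instead of oaklib's writer. Below is a minimal sketch of how that path might be exercised from Python; it is not part of the PR. It assumes the click group is exposed as oaklib.cli.main (the target of the runoak entry point), and the input selector, HP term IDs, and output filename are hypothetical placeholders.

from click.testing import CliRunner

from oaklib.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "-i", "sqlite:obo:hp",            # hypothetical input selector
        "similarity",
        "--low-memory",                   # let semsimian (Rust) write the results directly
        "-p", "i",                        # restrict to is_a predicates
        "-o", "hp_similarity.tsv",        # hypothetical output file written by semsimian
        "HP:0001250", "@", "HP:0001263",  # hypothetical term sets, separated by "@"
    ],
)
print(result.exit_code)
print(result.output)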
1,600 changes: 887 additions & 713 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ pysolr = "^3.9.0"
 eutils = ">=0.6.0"
 requests-cache = "^1.0.1"
 click = "*"
-semsimian = "^0.1.16"
+semsimian = ">=0.1.16"
 urllib3 = {version = "< 2", optional = true}
 
 [tool.poetry.dev-dependencies]
92 changes: 78 additions & 14 deletions src/oaklib/cli.py
@@ -12,6 +12,7 @@
 import os
 import re
 import secrets
+import subprocess
 import sys
 from collections import defaultdict
 from enum import Enum, unique
@@ -61,13 +62,15 @@
 )
 from oaklib.datamodels.search import create_search_configuration
 from oaklib.datamodels.settings import Settings
+from oaklib.datamodels.similarity import TermPairwiseSimilarity
 from oaklib.datamodels.summary_statistics_datamodel import (
     GroupedStatistics,
     UngroupedStatistics,
 )
 from oaklib.datamodels.text_annotator import TextAnnotationConfiguration
 from oaklib.datamodels.validation_datamodel import ValidationConfiguration
 from oaklib.datamodels.vocabulary import (
+    DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN,
     DEVELOPS_FROM,
     EQUIVALENT_CLASS,
     HAS_OBO_NAMESPACE,
@@ -2647,13 +2650,24 @@ def similarity_pair(terms, predicates, autolabel: bool, output: TextIO, output_t
     type=float,
     help="Minimum value for information content",
 )
+@click.option(
+    "--embeddings-file",
+    type=click.File(mode="r"),
+    help="file containing embeddings of all necessary nodes.",
+)
 @click.option("-o", "--output", help="path to output")
 @click.option(
     "--main-score-field",
     default="phenodigm_score",
     show_default=True,
     help="Score used for summarization",
 )
+@click.option(
+    "--low-memory/--no-low-memory",
+    default=False,
+    show_default=True,
+    help="If set, results will be generated by Rust.",
+)
 @autolabel_option
 @output_type_option
 @click.argument("terms", nargs=-1)
@@ -2663,8 +2677,10 @@ def similarity(
     set1_file,
     set2_file,
     autolabel: bool,
+    low_memory: bool,
     min_jaccard_similarity: Optional[float],
     min_ancestor_information_content: Optional[float],
+    embeddings_file: TextIO,
     main_score_field,
     output_type,
     output,
@@ -2752,21 +2768,69 @@ def similarity(
         else:
             set2it = query_terms_iterator(terms, impl)
         actual_predicates = _process_predicates_arg(predicates)
-        for sim in impl.all_by_all_pairwise_similarity(
-            set1it,
-            set2it,
-            predicates=actual_predicates,
-            min_jaccard_similarity=min_jaccard_similarity,
-            min_ancestor_information_content=min_ancestor_information_content,
-        ):
+        if low_memory:
+            term_pairwise_similarity_attributes = [
+                attr
+                for attr in vars(TermPairwiseSimilarity)
+                if not any(attr.startswith(s) for s in ["class_", "__"])
+            ]
+            impl.all_by_all_pairwise_similarity_quick(
+                set1it,
+                set2it,
+                predicates=actual_predicates,
+                min_jaccard_similarity=min_jaccard_similarity,
+                min_ancestor_information_content=min_ancestor_information_content,
+                embeddings_file=embeddings_file,
+                outfile=output,
+            )
+
+            # Read the output file line by line and store the contents in a list
+            if output is None:
+                output = DEFAULT_SIMILARITY_MAP_FILE_BY_SEMSIMIAN
+            with open(output, "r") as f:
+                lines = f.readlines()
+
+            # Add the column names to the first line of the list
+            columns_already_present = lines[0].strip().split("\t")
+            columns_missing = [
+                col
+                for col in term_pairwise_similarity_attributes
+                if col not in columns_already_present
+            ]
+            columns_missing_as_str = "\t".join(columns_missing) + "\n"
+            header = lines[0].strip() + "\t" + columns_missing_as_str
+            lines[0] = header
+
+            # Write the updated contents back to the output file
+            with open(output, "w") as file:
+                file.writelines(lines)
+
             if autolabel:
-                # TODO: this can be made more efficient
-                sim.subject_label = impl.label(sim.subject_id)
-                sim.object_label = impl.label(sim.object_id)
-                sim.ancestor_label = impl.label(sim.ancestor_id)
-            writer.emit(sim)
-        writer.finish()
-        writer.file.close()
+                new_output = output.replace(".tsv", "_filled.tsv")
+                command = f"runoak -i {impl.resource.slug} fill-table {output} -o {new_output} --allow-missing"
+                try:
+                    subprocess.run(command, shell=True, check=True)  # noqa
+                    print(
+                        f"{output} filled successfully by oaklib and results are in {new_output}."
+                    )
+                except subprocess.CalledProcessError as e:
+                    print(f"Command execution failed with error code {e.returncode}.")
+        else:
+            for sim in impl.all_by_all_pairwise_similarity(
+                set1it,
+                set2it,
+                predicates=actual_predicates,
+                min_jaccard_similarity=min_jaccard_similarity,
+                min_ancestor_information_content=min_ancestor_information_content,
+            ):
+                if autolabel:
+                    # TODO: this can be made more efficient
+                    sim.subject_label = impl.label(sim.subject_id)
+                    sim.object_label = impl.label(sim.object_id)
+                    sim.ancestor_label = impl.label(sim.ancestor_id)
+                writer.emit(sim)
+            writer.finish()
+            writer.file.close()
     else:
         raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
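
The low-memory branch above patches the header of the semsimian-written TSV so that it carries every TermPairwiseSimilarity column. The following is a simplified, standalone sketch of that idea rather than the PR's code as-is: the file path is a hypothetical placeholder, and a guard is added so nothing is appended when no columns are missing.

from oaklib.datamodels.similarity import TermPairwiseSimilarity

# Columns expected in the output, derived from the TermPairwiseSimilarity class
# attributes, skipping linkml bookkeeping ("class_*") and dunder attributes.
expected_columns = [
    attr
    for attr in vars(TermPairwiseSimilarity)
    if not any(attr.startswith(prefix) for prefix in ["class_", "__"])
]

path = "semsimian_output.tsv"  # hypothetical path; the CLI uses the --output value

with open(path) as f:
    lines = f.readlines()

# Append any expected column names missing from the header semsimian wrote.
present = lines[0].strip().split("\t")
missing = [col for col in expected_columns if col not in present]
if missing:  # guard added in this sketch; the PR code appends unconditionally
    lines[0] = lines[0].strip() + "\t" + "\t".join(missing) + "\n"
    with open(path, "w") as f:
        f.writelines(lines)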
