Skip to content

Commit

Permalink
add a couple subcommands
Browse files Browse the repository at this point in the history
  • Loading branch information
jaimergp committed Mar 9, 2024
1 parent 945359a commit 1587b65
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 28 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@ the conda-forge path metadata.
To bootstrap, you need a local copy of the [`libcfgraph` repository](https://github.com/regro/libcfgraph):

```bash
# Initialize the database; takes ~15min
# Initialize the database; takes ~10-15min
$ python conda_forge_paths/path_to_artifacts_db.py bootstrap path/to/libcfgraph-repo/artifacts
# Enable full text search; takes ~2min
$ python conda_forge_paths/path_to_artifacts_db.py fts
```

This should create a ~9GB `path_to_artifacts.db` file. This should compress nicely with `zstd`:
This should create a ~9GB `path_to_artifacts.db` file. It compresses nicely with `zstd`:

```bash
$ ZSTD_NBTHREADS=4 ZSTD_CLEVEL=19 tar --zstd -cf path_to_artifacts.tar.zst path_to_artifacts.db
```

## Queries

The script also has a `query` subcommand:
The script also has a couple of `find-*` subcommands:

```bash
# Find artifacts providing this exact file
Expand All @@ -34,6 +34,13 @@ $ python conda_forge_paths/path_to_artifacts_db.py find-artifacts 'bin/python'
$ python conda_forge_paths/path_to_artifacts_db.py find-paths 'python'
```

The most recent artifact can be found with:

```bash
$ python conda_forge_paths/path_to_artifacts_db.py most-recent-artifact
# returns: cf/linux-64/llama-cpp-python-0.2.20-cuda112_habc0a91_2 1701711069.438 2023-12-04 17:31:09 UTC
```

This repo is also preconfigured for a datasette deployment, which offers the same query functionality:

```
Expand Down
66 changes: 44 additions & 22 deletions conda_forge_paths/path_to_artifacts_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sqlite3
import sys
import time
from datetime import datetime, UTC
from itertools import batched
from pathlib import Path

Expand Down Expand Up @@ -141,15 +142,24 @@ def query(db, q, limit=100, fts=False):
else:
for row in db.execute(
f"""
SELECT group_concat(artifact, x'0a')
SELECT artifact
FROM Artifacts, PathToArtifactIds, json_each('[' || PathToArtifactIds.artifact_ids || ']') as each_id
WHERE PathToArtifactIds.path = (?) AND each_id.value = Artifacts.id
LIMIT {limit}
""",
(q,),
):
yield row

def most_recent_artifact(db):
    """Return the artifact with the highest ``timestamp`` value.

    Args:
        db: An open database connection exposing ``execute()`` (e.g. a
            ``sqlite3.Connection``) with an ``Artifacts`` table that has
            ``artifact`` and ``timestamp`` columns.

    Returns:
        A ``(artifact, timestamp)`` row for the newest artifact, or ``None``
        when the ``Artifacts`` table has no rows. Callers must handle the
        ``None`` case before unpacking.
    """
    # The query is limited to a single row, so fetchone() yields either that
    # row or None; no loop is needed.
    return db.execute(
        """
        SELECT artifact, timestamp
        FROM Artifacts
        ORDER BY timestamp DESC
        LIMIT 1
        """
    ).fetchone()

if __name__ == "__main__":
if len(sys.argv) == 3:
Expand All @@ -160,27 +170,39 @@ def query(db, q, limit=100, fts=False):
bootstrap_from_libcfgraph_path_to_artifact(db, artifacts_dir)
db.commit()
db.close()
elif action in ("find-artifacts", "find-paths"):
sys.exit()

if action in ("find-artifacts", "find-paths"):
db = connect()
t0 = time.time()
for row in query(db, sys.argv[2], fts=action == "find-paths"):
print(*row, sep="\n")
for i, row in enumerate(query(db, sys.argv[2], fts=action == "find-paths")):
print(f"{i}) {row[0]}")
print(f"Query took {time.time() - t0:.4f} seconds")
db.close()
elif len(sys.argv) == 2 and sys.argv[1] == "fts":
db = connect()
t0 = time.time()
index_full_text_search(db)
print(f"FTS indexing took {time.time() - t0:.4f} seconds")
db.close()
else:
print(
f"Usage: {sys.argv[0]} subcommand",
"subcommands:" ,
" - bootstrap /path/to/libcfgraph/artifacts/ # initialize the database",
" - fts # index the full text search",
" - [find-artifacts <full path> # find artifacts by full path",
" - find-paths <path component> # find full paths by partial matches",
sep="\n"
)
sys.exit(1)
sys.exit()

# One-argument subcommands: `fts` and `most-recent-artifact`.
# Each branch exits on its own; anything unrecognised falls through to the
# usage message that follows.
if len(sys.argv) == 2:
    if sys.argv[1] == "fts":
        db = connect()
        try:
            t0 = time.time()
            index_full_text_search(db)
            print(f"FTS indexing took {time.time() - t0:.4f} seconds")
        finally:
            # Release the connection even if indexing fails.
            db.close()
        sys.exit()
    if sys.argv[1] == "most-recent-artifact":
        db = connect()
        try:
            row = most_recent_artifact(db)
        finally:
            db.close()
        if row is None:
            # most_recent_artifact() yields nothing for an empty Artifacts
            # table; report that instead of failing on tuple unpacking.
            print("No artifacts found in the database.", file=sys.stderr)
            sys.exit(1)
        name, ts = row
        # NOTE(review): the division by 1000 suggests timestamps are stored
        # in milliseconds since the epoch -- confirm against the schema.
        seconds = ts / 1000
        print(
            name,
            seconds,
            datetime.fromtimestamp(seconds, UTC).strftime("%Y-%m-%d %H:%M:%S %Z"),
        )
        sys.exit()

# Fallback: no subcommand matched, so show the usage text and exit non-zero.
print(
    f"Usage: {sys.argv[0]} subcommand",
    "subcommands:",
    "  - bootstrap /path/to/libcfgraph/artifacts/  # initialize the database",
    "  - fts                                       # index the full text search",
    "  - find-artifacts <full path>                # find artifacts by full path",
    "  - find-paths <path component>               # find full paths by partial matches",
    "  - most-recent-artifact                      # show the newest artifact and its timestamp",
    sep="\n",
)
sys.exit(1)
6 changes: 3 additions & 3 deletions datasette.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@ databases:
params:
- path
sql: |-
SELECT group_concat(artifact, x'0a')
SELECT artifact
FROM Artifacts, PathToArtifactIds, json_each('[' || PathToArtifactIds.artifact_ids || ']') as each_id
WHERE PathToArtifactIds.path = :path AND each_id.value = Artifacts.id
LIMIT 1
hide_sql: true
find_files:
title: Find full paths (match by path components)
params:
- path
sql: |-
SELECT highlight(PathToArtifactIds_fts, 0, '*', '*')
SELECT path
FROM PathToArtifactIds_fts
WHERE PathToArtifactIds_fts MATCH escape_fts(:path)
ORDER BY bm25(PathToArtifactIds_fts)
LIMIT 100
hide_sql: true
allow_sql: false

0 comments on commit 1587b65

Please sign in to comment.