Skip to content

Commit 76e4c77

Browse files
authored
Merge pull request #18 from uc-cdis/feat/indexing
Feat/indexing
2 parents cc25178 + f6d3eb9 commit 76e4c77

File tree

14 files changed

+1261
-226
lines changed

14 files changed

+1261
-226
lines changed

README.md

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,151 @@ This is the client for interacting with the Indexd service for GUID brokering an
1616

1717
This is the client for interacting with the Gen3 submission service including GraphQL queries.
1818

19+
## Indexing Tools
20+
21+
### Download Manifest
22+
23+
How to download a manifest `object-manifest.csv` of all file objects in indexd for a given commons:
24+
25+
```
26+
import sys
27+
import logging
28+
import asyncio
29+
30+
from gen3.index import Gen3Index
31+
from gen3.tools import indexing
32+
from gen3.tools.indexing.verify_manifest import manifest_row_parsers
33+
34+
logging.basicConfig(filename="output.log", level=logging.DEBUG)
35+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
36+
37+
COMMONS = "https://{{insert-commons-here}}/"
38+
39+
def main():
40+
loop = asyncio.new_event_loop()
41+
asyncio.set_event_loop(loop)
42+
43+
loop.run_until_complete(
44+
indexing.async_download_object_manifest(
45+
COMMONS,
46+
output_filename="object-manifest.csv",
47+
num_processes=8,
48+
max_concurrent_requests=24,
49+
)
50+
)
51+
52+
53+
if __name__ == "__main__":
54+
main()
55+
56+
```
57+
58+
The output file will contain columns `guid, urls, authz, acl, md5, file_size` with info
59+
populated from indexd.
60+
61+
### Verify Manifest
62+
63+
How to verify the file objects in indexd against a "source of truth" manifest.
64+
65+
> Bonus: How to override default parsing of manifest to match a different structure.
66+
67+
In the example below we assume a manifest named `alternate-manifest.csv` already exists
68+
with info of what's expected in indexd. The headers in the `alternate-manifest.csv`
69+
are `guid, urls, authz, acl, md5, size`.
70+
71+
> NOTE: The alternate manifest headers differ from the default headers described above (`file_size` doesn't exist and should be taken from `size`).
72+
73+
```
74+
import sys
75+
import logging
76+
77+
from gen3.index import Gen3Index
78+
from gen3.tools import indexing
79+
from gen3.tools.indexing.verify_manifest import manifest_row_parsers
80+
81+
logging.basicConfig(filename="output.log", level=logging.INFO)
82+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
83+
84+
COMMONS = "https://{{insert-commons-here}}/"
85+
86+
87+
def main():
88+
def _get_file_size(row):
89+
try:
90+
return int(row.get("size"))
91+
except Exception:
92+
logging.warning(f"could not convert this to an int: {row.get('size')}")
93+
return row.get("size")
94+
95+
# override default parsers
96+
manifest_row_parsers["file_size"] = _get_file_size
97+
98+
indexing.verify_object_manifest(
99+
COMMONS, manifest_file="alternate-manifest.csv", num_processes=20
100+
)
101+
102+
103+
if __name__ == "__main__":
104+
main()
105+
106+
```
107+
108+
A more complex example is below. In this example:
109+
110+
* The input file is a tab-separated value file (instead of default CSV)
111+
* Note the `manifest_file_delimiter` argument
112+
* The arrays in the file are represented with Python-like list syntax
113+
* ex: `['DEV', 'test']` for the `acl` column
114+
* We are using more Python processes (20) to speed up the verification process
115+
* NOTE: You need to be careful about this, as indexd itself needs to support
116+
scaling to this number of concurrent requests coming in
117+
118+
```
119+
import sys
120+
import logging
121+
122+
from gen3.index import Gen3Index
123+
from gen3.tools import indexing
124+
from gen3.tools.indexing.verify_manifest import manifest_row_parsers
125+
126+
logging.basicConfig(filename="output.log", level=logging.INFO)
127+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
128+
129+
COMMONS = "https://{{insert-commons-here}}/"
130+
131+
132+
def main():
133+
def _get_file_size(row):
134+
try:
135+
return int(row.get("size"))
136+
except Exception:
137+
logging.warning(f"could not convert this to an int: {row.get('size')}")
138+
return row.get("size")
139+
140+
def _get_acl_from_row(row):
141+
return [row.get("acl").strip().strip("[").strip("]").strip("'")]
142+
143+
def _get_authz_from_row(row):
144+
return [row.get("authz").strip().strip("[").strip("]").strip("'")]
145+
146+
def _get_urls_from_row(row):
147+
return [row.get("url").strip()]
148+
149+
# override default parsers
150+
manifest_row_parsers["file_size"] = _get_file_size
151+
manifest_row_parsers["acl"] = _get_acl_from_row
152+
manifest_row_parsers["authz"] = _get_authz_from_row
153+
manifest_row_parsers["urls"] = _get_urls_from_row
154+
155+
indexing.verify_object_manifest(
156+
COMMONS,
157+
manifest_file="output-manifest.csv",
158+
manifest_file_delimiter="\t",
159+
num_processes=20,
160+
)
161+
162+
163+
if __name__ == "__main__":
164+
main()
165+
166+
```

0 commit comments

Comments
 (0)