Mimetypes used #20


Open · wants to merge 13 commits into main
2 changes: 1 addition & 1 deletion app/config.py
@@ -31,7 +31,7 @@ class Config(object):
    KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
    DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
    SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-   SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_Datasets_pr")
+   SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
    MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
    SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
    WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
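Since the default Elasticsearch index changes here, deployments that still need the old `SPARC_Datasets_pr` index can pin it through the `SCICRUNCH_HOST` environment variable. A minimal sketch, assuming the `app.config` module path shown in this diff:

```python
import os

# Pin the previous index; this must happen before app.config is imported,
# because Config reads the environment at import time.
os.environ["SCICRUNCH_HOST"] = "https://scicrunch.org/api/1/elastic/SPARC_Datasets_pr"

from app.config import Config

print(Config.SCI_CRUNCH_HOST)  # -> the pinned SPARC_Datasets_pr URL
```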
57 changes: 57 additions & 0 deletions app/main.py
@@ -210,6 +210,63 @@ def direct_download_url(path):
    resource = response["Body"].read()
    return resource

# /scicrunch-dataset/: Returns the raw scicrunch result for a given DOI
@app.route("/scicrunch-dataset/<doi1>/<doi2>")
def sci_doi(doi1, doi2):
    doi = doi1 + '/' + doi2
    print(doi)
    data = create_doi_request(doi)
    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        return response.json()
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        return json.dumps({'error': str(err)})
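A usage sketch for this route with Flask's test client (assuming the Flask instance lives in `app.main`, as the test suite below suggests; the DOI is the one from the new test):

```python
import json
from app.main import app  # assumption: the Flask instance from this diff

with app.test_client() as client:
    # <doi1>/<doi2> are rejoined into "10.26275/pzek-91wx" inside sci_doi()
    r = client.get('/scicrunch-dataset/10.26275/pzek-91wx')
    hits = json.loads(r.data)['hits']['hits']
    print(hits[0]['_id'])  # expected: "DOI:10.26275/pzek-91wx"
```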

# /scicrunch-dataset-processed/: Returns processed scicrunch results for a given DOI
@app.route("/scicrunch-dataset-processed/<doi1>/<doi2>")
def sci_doi_processed(doi1, doi2):
    doi = doi1 + '/' + doi2
    print(doi)
    data = create_doi_request(doi)
    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        return process_kb_results(response.json())
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        return json.dumps({'error': str(err)})

# /scicrunch-query-string/: Returns results for a given organ curie. These can be processed by the sidebar
@app.route("/scicrunch-query-string/")
def sci_organ():
    fields = request.args.getlist('field')
    curie = request.args.get('curie')
    # field example: "*organ.curie"
    data = {
        "size": 20,
        "from": 0,
        "query": {
            "query_string": {
                "fields": fields,
                "query": curie
            }
        }
    }

    try:
        response = requests.post(
            f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}',
            json=data)
        return process_kb_results(response.json())
    except requests.exceptions.HTTPError as err:
        logging.error(err)
        return json.dumps({'error': str(err)})
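And a matching sketch for the query-string route; `*organ.curie` is the field example from the comment above, and `UBERON:0000948` (heart) is an illustrative curie:

```python
import json
from app.main import app  # assumption: the Flask instance from this diff

with app.test_client() as client:
    # request.args.getlist('field') collects every `field` parameter
    r = client.get('/scicrunch-query-string/?field=*organ.curie&curie=UBERON:0000948')
    print(json.loads(r.data)['numberOfHits'])
```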



# /search/: Returns scicrunch results for a given <search> query
@app.route("/search/", defaults={'query': ''})
61 changes: 54 additions & 7 deletions app/process_kb_results.py
@@ -17,6 +17,23 @@
    'csvFiles': ['objects']
}

def create_doi_request(doi):
    query = {
        "query": {
            "bool": {
                "must": [{"match_all": {}}],
                "should": [],
                "filter": {
                    "term": {
                        "_id": f'DOI:{doi}'
                    }
                }
            }
        }
    }

    return query
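Concretely, the request body generated for the DOI used in the new test looks like this (a sketch; the output comment is written out by hand from the structure above):

```python
from app.process_kb_results import create_doi_request

print(create_doi_request('10.26275/pzek-91wx'))
# {'query': {'bool': {'must': [{'match_all': {}}],
#                     'should': [],
#                     'filter': {'term': {'_id': 'DOI:10.26275/pzek-91wx'}}}}}
```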

# create_facet_query(type): Generates facet search request data for scicrunch given a 'type'; where
# 'type' is 'species', 'gender', 'genotype', or 'organ' at this stage.
@@ -25,7 +42,8 @@ def create_facet_query(type):
    type_map = {
        'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name.aggregate'],
        'gender': ['attributes.subject.sex.value'],
-        'genotype': ['anatomy.organ.name.aggregate']
+        'genotype': ['anatomy.organ.name.aggregate'],
+        'organ': ['anatomy.organ.name.aggregate']
    }

    data = {
@@ -69,7 +87,8 @@ def create_filter_request(query, terms, facets, size, start):
    type_map = {
        'species': ['organisms.primary.species.name.aggregate', 'organisms.sample.species.name'],
        'gender': ['attributes.subject.sex.value', 'attributes.sample.sex.value'],
-        'genotype': ['anatomy.organ.name.aggregate']
+        'genotype': ['anatomy.organ.name.aggregate'],
+        'organ': ['anatomy.organ.name.aggregate']
    }

    # Data structure of a scicrunch search
@@ -136,7 +155,9 @@ def process_kb_results(results):
    for i, hit in enumerate(hits):
        attr = get_attributes(attributes, hit)
        attr['doi'] = convert_doi_to_url(attr['doi'])
-        attr['csvFiles'] = find_csv_files(attr['csvFiles'])
+        objects = attr['csvFiles']  # Have to do this as not all datasets return objects
+        attr['csvFiles'] = find_csv_files(objects)
+        attr['scaffolds'] = find_scaffold_json_files(objects)
        output.append(attr)
    return json.dumps({'numberOfHits': results['hits']['total'], 'results': output})
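With this change the processed payload carries a `scaffolds` list alongside `csvFiles`; roughly (a sketch, values illustrative and keys abbreviated):

```python
# Shape of json.loads(process_kb_results(results)) after this change:
example_output = {
    'numberOfHits': 1,
    'results': [{
        'doi': 'https://doi.org/10.26275/pzek-91wx',  # illustrative
        'csvFiles': [],   # objects with mimetype.name == 'text/csv'
        'scaffolds': [],  # objects with additional_mimetype.name == 'inode/vnd.abi.scaffold+file'
        # ...plus the other keys from the `attributes` map below...
    }]
}
```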

Expand All @@ -146,11 +167,36 @@ def convert_doi_to_url(doi):
return doi
return doi.replace('DOI:', 'https://doi.org/')

def convert_url_to_doi(doi):
    if not doi:
        return doi
    return doi.replace('https://doi.org/', 'DOI:')


def find_csv_files(obj_list):
    if not obj_list:
        return obj_list
-    return [obj for obj in obj_list if obj.get('mimetype', 'none') == 'text/csv']
+    return [obj for obj in obj_list if obj.get('mimetype', {}).get('name', 'none') == 'text/csv']


def find_scaffold_json_files(obj_list):
    if not obj_list:
        return obj_list
    return [obj for obj in obj_list if obj.get('additional_mimetype', {}).get('name', 'none') == 'inode/vnd.abi.scaffold+file']

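Both filters assume the updated object shape in which `mimetype` and `additional_mimetype` are nested dicts carrying a `name` key rather than plain strings. A self-contained sketch with illustrative objects:

```python
from app.process_kb_results import find_csv_files, find_scaffold_json_files

objects = [  # illustrative shapes matching what the filters expect
    {'mimetype': {'name': 'text/csv'}},
    {'mimetype': {'name': 'image/png'},
     'additional_mimetype': {'name': 'inode/vnd.abi.scaffold+file'}},
]

print(len(find_csv_files(objects)))            # 1 - only the text/csv object
print(len(find_scaffold_json_files(objects)))  # 1 - only the scaffold object
```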

attributes = {
    'scaffolds': ['scaffolds'],
    'samples': ['attributes', 'sample', 'subject'],
    'name': ['item', 'name'],
    'identifier': ['item', 'identifier'],
    'uri': ['distributions', 'current', 'uri'],
    'updated': ['dates', 'updated'],
    'organs': ['anatomy', 'organ'],
    'contributors': ['contributors'],
    'doi': ['item', 'curie'],
    'csvFiles': ['objects']
}


# get_attributes: Use the 'attributes' map (defined above) to step through the large scicrunch result dict
@@ -160,11 +206,12 @@ def get_attributes(attributes, dataset):
    for k, attr in attributes.items():
        subset = dataset['_source']  # set our subset to the full dataset result
        key_attr = False
-        for key in attr:
+        for n, key in enumerate(attr):  # step through attributes
            if isinstance(subset, dict):
-                if key in subset.keys():
+                if key in subset.keys():  # continue if keys are found
                    subset = subset[key]
-                    key_attr = subset
+                    if n + 1 == len(attr):  # if we made it to the end, save this subset
+                        key_attr = subset
        found_attr[k] = key_attr
    return found_attr

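The enumerate change means a value is kept only when the full key path is walked to its last element; a partially matched path now leaves the attribute as False instead of a half-resolved subtree. A small sketch (illustrative hit, real `attributes` map):

```python
from app.process_kb_results import get_attributes, attributes

dataset = {'_source': {  # minimal illustrative hit
    'item': {'name': 'Example dataset', 'curie': 'DOI:10.26275/pzek-91wx'},
}}

attr = get_attributes(attributes, dataset)
print(attr['name'])  # 'Example dataset' - full path ['item', 'name'] resolved
print(attr['uri'])   # False - the ['distributions', 'current', 'uri'] path is absent
```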
5 changes: 5 additions & 0 deletions tests/test_scicrunch.py
@@ -13,6 +13,11 @@ def test_scicrunch_keys(client):
    assert r.status_code == 200
    assert 'numberOfHits' in json.loads(r.data).keys()

def test_scicrunch_dataset_doi(client):
    r = client.get('/scicrunch-dataset/10.26275/pzek-91wx')
    assert json.loads(r.data)['hits']['hits'][0]['_id'] == "DOI:10.26275/pzek-91wx"


def test_scicrunch_search(client):
    r = client.get('/search/heart')
    assert r.status_code == 200