Skip to content

Commit

Permalink
* Change to py2neo
Browse files Browse the repository at this point in the history
* support new pipeline format
* Add docker build+push action
  • Loading branch information
Tim authored and Tim committed Oct 29, 2020
1 parent 57e915d commit 9fdae81
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 111 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Docker Image CI

# Build and publish the image when a version tag matching V* is pushed.
# The tag filter is attached to `push` because the `create` event does not
# accept `tags`/`branches` filters, so a pattern placed under `create`
# does not restrict which refs start the workflow.
# NOTE(review): unlike `create`, `push` also fires when an existing tag is
# moved and re-pushed - confirm that is acceptable for releases.
on:
  push:
    tags:
      - 'V*'

jobs:

  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      # Plain local build from the repository's lowercase `dockerfile`.
      - name: Build the Docker image
        run: docker build . --file dockerfile --tag data-biobert:latest
      # NOTE(review): presumably this action builds and pushes the image
      # itself under the given name - confirm it picks up the lowercase
      # `dockerfile`, and consider pinning the action to a release instead
      # of `master`.
      - name: Publish to Registry
        uses: HurricanKai/Publish-Docker-Github-Action@master
        with:
          name: covidgraph/data-biobert
          username: ${{ secrets.DZD_DOCKER_USERNAME }}
          password: ${{ secrets.DZD_DOCKER_PASSWORD }}
242 changes: 132 additions & 110 deletions dataloader/main.py
Original file line number Diff line number Diff line change
@@ -1,130 +1,152 @@
import os
import sys
from neo4j import GraphDatabase
import py2neo
import json
import requests
import csv
import logging


from cypher_queries import *

neo4j_url = os.getenv('GC_NEO4J_URL', 'bolt://localhost:7687')
neo4j_user = os.getenv('GC_NEO4J_USER', 'neo4j')
neo4j_pw = os.getenv('GC_NEO4J_PASSWORD', 'test')
# logging
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())
LOG_LEVEL = os.getenv('LOG_LEVEL', "INFO")
log.setLevel(LOG_LEVEL)

# neo4j connection
neo4j_config_str = os.getenv('NEO4J', '{"host": "localhost"}')
neo4j_config_dict = json.loads(neo4j_config_str)
graph = py2neo.Graph(**neo4j_config_dict)

ENV = os.getenv('ENV', 'prod')


def download_csv(url, filename):
    """Download ``url`` and write the raw response body to ``filename``.

    Args:
        url: Address fetched with an HTTP GET request.
        filename: Path of the local file to create or overwrite; it is
            written in binary mode, exactly as received.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Fail here rather than saving an HTTP error page that the CSV readers
    # would later choke on with a far less helpful error.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed even if
    # the write fails part-way.
    with open(filename, 'wb') as out_file:
        out_file.write(response.content)


if __name__ == "__main__":
# Connect to Neo4j
driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_pw))

graph.run(create_named_entity_constraint)

# Import named entities
log.info("Download named entities")
PUBMED_ABSTRACT_DOWNLOAD_URL = 'https://drive.google.com/u/0/uc?id=1Rlv70gNtalFp4T4XtpI6psJdJJzZFTTY&export=download'
PUBMED_ABSTRACT_FILE = 'pubmed_abstracts.csv'

download_csv(PUBMED_ABSTRACT_DOWNLOAD_URL, PUBMED_ABSTRACT_FILE)

with driver.session() as session:

try:
session.run(create_named_entity_constraint)
except:
pass

# Import named entities for abstracts

abstract_parameters = []
with open(PUBMED_ABSTRACT_FILE) as csvfile:
reader = csv.DictReader(csvfile)
for index, row in enumerate(reader):
abstract_parameters.append({'pubmed_id': row['pubmed_id'], 'entity_type': row['entity_type'],'entity_value': row['entity_value'], 'entity_id':row['covidgraph_id']})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
print('importing {} batch abstract entities'.format(index / 1000))
r = session.run(pubmed_abstract_query, {'parameters': abstract_parameters})
abstract_parameters = []
# import the rest
r = session.run(pubmed_abstract_query, {'parameters': abstract_parameters})

# Import named entities external ids for genes

PUBMED_GENE_EXTERNALIDS_DOWNLOAD_URL = 'https://drive.google.com/u/0/uc?id=1KgJPBYB8D4_hN7wbiu0XOOM-lQdV8EgP&export=download'
PUBMED_GENE_EXTERNALIDS_FILE = 'pubmed_gene_external_ids.csv'

download_csv(PUBMED_GENE_EXTERNALIDS_DOWNLOAD_URL, PUBMED_GENE_EXTERNALIDS_FILE)
external_ids_parameters = []
with open(PUBMED_GENE_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[3]
external_ids = row[1].split("|")
external_ids_parameters.append({'entity_id': bern_id, 'external_ids': external_ids})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
print('importing {} batch gene external ids'.format(index / 1000))
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})


# Import named entities external ids for diseases

PUBMED_DISEASE_EXTERNALIDS_DOWNLOAD_URL = "https://drive.google.com/u/0/uc?id=1guHxBbUksuDx58zKh8o0d0dgs7klotFT&export=download"
PUBMED_DISEASE_EXTERNALIDS_FILE = "pubmed_disease_external_ids.csv"

download_csv(PUBMED_DISEASE_EXTERNALIDS_DOWNLOAD_URL, PUBMED_DISEASE_EXTERNALIDS_FILE)

external_ids_parameters = []
with open(PUBMED_DISEASE_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[0]
external_ids = row[1].split(", ")
external_ids_parameters.append({'entity_id': bern_id, 'external_ids': external_ids})

# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
print('importing {} batch disease external ids'.format(index / 1000))
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})

# Import named entities external ids for drugs

PUBMED_DRUG_EXTERNALIDS_DOWNLOAD_URL = "https://drive.google.com/u/0/uc?id=1zq-za_1OMCrrJaVwIj-dwIHogqVQ9n0G&export=download"
PUBMED_DRUG_EXTERNALIDS_FILE = "pubmed_drug_external_ids.csv"

download_csv(PUBMED_DRUG_EXTERNALIDS_DOWNLOAD_URL, PUBMED_DRUG_EXTERNALIDS_FILE)

external_ids_parameters = []
with open(PUBMED_DRUG_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[0]
external_ids = row[1].split(", ")
external_ids_parameters.append({'entity_id': bern_id, 'external_ids': external_ids})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
print('importing {} batch drug external ids'.format(index / 1000))
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = session.run(import_external_ids_query, {'parameters': external_ids_parameters})

# Species transformation
#
#You can get an NCBI taxonomy ID by removing the last two digits of a BERN species ID.
#
#Example.
#1009505 -> 10095
#
#Mus sp., mice
#https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10095

session.run(species_ncbi_query)



# Import named entities for abstracts

abstract_parameters = []
with open(PUBMED_ABSTRACT_FILE) as csvfile:
reader = csv.DictReader(csvfile)
for index, row in enumerate(reader):
abstract_parameters.append({'pubmed_id': row['pubmed_id'], 'entity_type': row['entity_type'],
'entity_value': row['entity_value'], 'entity_id': row['covidgraph_id']})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
log.info(
'importing {} batch abstract entities'.format(index / 1000))
r = graph.run(pubmed_abstract_query, {
'parameters': abstract_parameters})
abstract_parameters = []
# import the rest
r = graph.run(pubmed_abstract_query, {
'parameters': abstract_parameters})

# Import named entities external ids for genes

PUBMED_GENE_EXTERNALIDS_DOWNLOAD_URL = 'https://drive.google.com/u/0/uc?id=1KgJPBYB8D4_hN7wbiu0XOOM-lQdV8EgP&export=download'
PUBMED_GENE_EXTERNALIDS_FILE = 'pubmed_gene_external_ids.csv'

download_csv(PUBMED_GENE_EXTERNALIDS_DOWNLOAD_URL,
PUBMED_GENE_EXTERNALIDS_FILE)
external_ids_parameters = []
with open(PUBMED_GENE_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[3]
external_ids = row[1].split("|")
external_ids_parameters.append(
{'entity_id': bern_id, 'external_ids': external_ids})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
log.info(
'importing {} batch gene external ids'.format(index / 1000))
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})

# Import named entities external ids for diseases

PUBMED_DISEASE_EXTERNALIDS_DOWNLOAD_URL = "https://drive.google.com/u/0/uc?id=1guHxBbUksuDx58zKh8o0d0dgs7klotFT&export=download"
PUBMED_DISEASE_EXTERNALIDS_FILE = "pubmed_disease_external_ids.csv"

download_csv(PUBMED_DISEASE_EXTERNALIDS_DOWNLOAD_URL,
PUBMED_DISEASE_EXTERNALIDS_FILE)

external_ids_parameters = []
with open(PUBMED_DISEASE_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[0]
external_ids = row[1].split(", ")
external_ids_parameters.append(
{'entity_id': bern_id, 'external_ids': external_ids})

# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
log.info('importing {} batch disease external ids'.format(
index / 1000))
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})

# Import named entities external ids for drugs

PUBMED_DRUG_EXTERNALIDS_DOWNLOAD_URL = "https://drive.google.com/u/0/uc?id=1zq-za_1OMCrrJaVwIj-dwIHogqVQ9n0G&export=download"
PUBMED_DRUG_EXTERNALIDS_FILE = "pubmed_drug_external_ids.csv"

download_csv(PUBMED_DRUG_EXTERNALIDS_DOWNLOAD_URL,
PUBMED_DRUG_EXTERNALIDS_FILE)

external_ids_parameters = []
with open(PUBMED_DRUG_EXTERNALIDS_FILE) as csvfile:
reader = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(reader):
bern_id = row[0]
external_ids = row[1].split(", ")
external_ids_parameters.append(
{'entity_id': bern_id, 'external_ids': external_ids})
# Batch by 1000 rows
if (index != 0) and (index % 1000 == 0):
log.info(
'importing {} batch drug external ids'.format(index / 1000))
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})
external_ids_parameters = []
# import the rest
r = graph.run(import_external_ids_query, {
'parameters': external_ids_parameters})

# Species transformation
#
# You can get an NCBI taxonomy ID by removing the last two digits of a BERN species ID.
#
# Example.
# 1009505 -> 10095
#
# Mus sp., mice
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10095

graph.run(species_ncbi_query)
2 changes: 1 addition & 1 deletion requirement.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
neo4j
py2neo
requests

0 comments on commit 9fdae81

Please sign in to comment.