Commit: wip
timbleimehl committed Apr 21, 2020
1 parent 6ed0ad7 commit 55da0b2
Showing 7 changed files with 430 additions and 3 deletions.
150 changes: 147 additions & 3 deletions .gitignore
@@ -1,3 +1,147 @@
*.csv
csv/
zip/
dataset/

*.env
!*/env/DEFAULT.env


# Python stuff

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
8 changes: 8 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,8 @@
{
    "python.formatting.provider": "black",
    "python.pythonPath": "/usr/bin/python3.7",
    "files.watcherExclude": {
        "**/dataset/**": true
    }
}
Empty file added dataloader/__init__.py
Empty file.
105 changes: 105 additions & 0 deletions dataloader/config.py
@@ -0,0 +1,105 @@
import os
from Configs import ConfigBase


class DEFAULT(ConfigBase):
    SCRIPT_DIR = os.path.dirname(
        os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
    )
    # Load to the DB every n patent files.
    # Increase or decrease this according to the amount of memory you have.
    BATCH_SIZE = 300

    # Commit to the DB every n nodes
    COMMIT_INTERVAL = 10000

    LOG_LEVEL = "INFO"
    GC_NEO4J_URL = "localhost"
    GC_NEO4J_USER = None
    GC_NEO4J_PASSWORD = None

    # If set to True, the dataset will always be downloaded, even if it already exists
    REDOWNLOAD_DATASET_IF_EXISTENT = False

    DATASET_SOURCE_URLS = [
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-broad-keyword-based-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-patents-SARS-and-MERS.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-patents-SARS-and-MERS-TAC.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-limited-keywords-based-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-CPC-based-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-declared-patseq-organism.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-SARS-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-MERS-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-SARS-diagnosis-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-MERS-diagnosis-patents.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-SARS-treatment.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Coronavirus-MERS-treatment.zip",
        "https://lens-public.s3.us-west-2.amazonaws.com/coronavirus/patent/fulltext/Ventilators.zip",
    ]
    # Where to store the downloaded dataset
    DATASET_BASE_DIR = os.path.join(SCRIPT_DIR, "../dataset/")

    JSON2GRAPH_DEFAULT_IDS = [
        "lens_id",
        "family_id",
        "_id",
    ]

    JSON2GRAPH_LABEL_OVERRIDE = {
        "bibliographic_data": "Patent",
        "nlp_cit": "NonPatentLiteratureCitation",
        "pat_cit": "PatentLiteratureCitation",
        "claims": "PatentClaim",
        "description": "PatentDescription",
        "abstract": "PatentAbstract",
        "title": "PatentTitle",
        "Collection_NonPatentLiteratureCitation": "PatentCitations",
        "Collection_PatentLiteratureCitation": "PatentCitations",
        "family_extended": {"Family": {"type": "extended"}},
        "family_simple": {"Family": {"type": "simple"}},
    }
    JSON2GRAPH_COLLECTION_ANCHOR_EXTRA_LABELS = []

    JSON2GRAPH_PRIMARYKEY_ATTR_BY_LABEL = {}
    JSON2GRAPH_PRIMARYKEY_GENERATED_HASHED_ATTRS_BY_LABEL = {
        "PatentDescription": ["text"],
        "PatentClaim": ["text"],
        "PatentAbstract": ["text"],
        "PatentLiteratureCitation": "AllAttributes",
        "NonPatentLiteratureCitation": "AllAttributes",
    }

    JSON2GRAPH_SKIP_COLLECTION_HUBS = [
        "Collection_PatentTitle",
        "Collection_PatentClaim",
        "Collection_PatentAbstract",
        "Collection_Entity",
        "Collection_PatentDescription",
        "Collection_lens_id",
    ]

    JSON2GRAPH_ATTR_TO_RELTYPE_INSTEAD_OF_LABEL = {
        "inventor": "Entity",
        "owner": "Entity",
        "applicant": "Entity",
    }

    JSON2GRAPH_PROPERTY_NAME_OVERRIDE = {
        "Entity": {"inventor": "name", "owner": "name", "applicant": "name"}
    }

    JSON2GRAPH_RELTYPE_OVERRIDE = {}

    JSON2GRAPH_INTERFOLD_JSON_ATTR = {"bibliographic_data": ["family"]}


class DEV(ConfigBase):
    pass


class PROD(ConfigBase):
    pass


class LOCAL(ConfigBase):
    pass
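
The BATCH_SIZE and COMMIT_INTERVAL values above throttle the load, while the JSON2GRAPH_* settings describe how the patent JSON is mapped onto graph labels, primary keys, and relationship types. As a rough illustration of how these config values might be consumed, here is a minimal sketch; it is not part of this commit. getConfig() is the same entry point used in dataloader/download.py below, and the helper function and its batching loop are purely hypothetical.

import logging
from Configs import getConfig

config = getConfig()
log = logging.getLogger(__name__)
log.setLevel(getattr(logging, config.LOG_LEVEL))


def load_in_batches(patent_json_files):
    # Hypothetical helper: process the dataset in chunks of BATCH_SIZE files
    # and report which Neo4j instance the nodes would be pushed to.
    for i in range(0, len(patent_json_files), config.BATCH_SIZE):
        batch = patent_json_files[i : i + config.BATCH_SIZE]
        log.info(
            "Loading %s patent files into Neo4j at %s",
            len(batch),
            config.GC_NEO4J_URL,
        )
        # ...parse the JSON and create nodes/relationships here...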
52 changes: 52 additions & 0 deletions dataloader/download.py
@@ -0,0 +1,52 @@
import os
import logging
import requests
import zipfile
from Configs import getConfig


config = getConfig()
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())
log.setLevel(getattr(logging, config.LOG_LEVEL))


def download():

    if not os.path.isdir(config.DATASET_BASE_DIR):
        os.makedirs(config.DATASET_BASE_DIR)

    log.info("Start downloading the Lens.org patent dataset...")

    for url in config.DATASET_SOURCE_URLS:
        target_path_zip = os.path.join(config.DATASET_BASE_DIR, os.path.basename(url))
        content_path = os.path.join(
            config.DATASET_BASE_DIR, os.path.splitext(os.path.basename(url))[0]
        )
        if os.path.isdir(content_path) and not config.REDOWNLOAD_DATASET_IF_EXISTENT:
            log.info(
                "Skipping download of '{}'; it already exists. Set 'REDOWNLOAD_DATASET_IF_EXISTENT' in config.py to True to force a redownload, or delete '{}'".format(
                    url, content_path
                )
            )
            continue
        r = requests.get(url)
        with open(target_path_zip, "wb") as f:
            f.write(r.content)
        unzip(target_path_zip)
        # Extract any nested zip archives contained in the downloaded dataset
        for root, dirs, files in os.walk(content_path):
            for name in files:
                if name.lower().endswith(".zip"):
                    unzip(os.path.join(root, name))

    log.info("Finished downloading the Lens.org patent dataset.")


def unzip(zipfile_path):
    # Extract the archive next to itself and delete the zip afterwards
    target_path_content = os.path.dirname(zipfile_path)
    with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
        zip_ref.extractall(target_path_content)
    os.remove(zipfile_path)


if __name__ == "__main__":
    download()
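
Note that requests.get(url) buffers each archive fully in memory before it is written to disk. For the larger Lens.org archives, a streamed variant keeps memory usage flat. A minimal sketch of such an alternative follows; it is not part of this commit, and the function name and 1 MB chunk size are arbitrary choices.

import os
import requests


def download_file_streamed(url, target_dir, chunk_size=1024 * 1024):
    # Stream the archive to disk in fixed-size chunks instead of loading
    # the whole response body into memory at once.
    target_path = os.path.join(target_dir, os.path.basename(url))
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(target_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return target_path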