diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/fakerequest.py b/fakerequest.py
new file mode 100644
index 0000000..b1a9913
--- /dev/null
+++ b/fakerequest.py
@@ -0,0 +1,29 @@
+from contextlib import closing
+from sqlite3 import connect
+
+DATABASE_NAME = './mirror/mirror.db'
+START_URL = "https://ratsinformation.leipzig.de/allris_leipzig_public/oparl/papers?body=2387&page=1"
+
+
+class FakeResponse:
+    """Minimal stand-in for an HTTP response, served from the local mirror."""
+
+    def __init__(self, state, content=None):
+        self.state = state
+        # Always define the attribute so callers never hit AttributeError
+        # on responses without a payload.
+        self.content = content
+
+
+def get(url):
+    """Return the mirrored payload for ``url`` wrapped in a FakeResponse.
+
+    State is 200 when the mirror table has a row for ``url``, else 400.
+    """
+    # sqlite3's own context manager only commits/rolls back the transaction;
+    # closing() is what actually releases the connection.
+    with closing(connect(DATABASE_NAME)) as con:
+        row = con.execute('SELECT data FROM mirror WHERE id=?', (url, )).fetchone()
+    if row:
+        return FakeResponse(state=200, content=row[0])
+    return FakeResponse(state=400)
diff --git a/neo_connector.py b/neo_connector.py
new file mode 100644
index 0000000..79d5d4f
--- /dev/null
+++ b/neo_connector.py
@@ -0,0 +1,48 @@
+from configparser import ConfigParser
+from neo4j import GraphDatabase, Session
+from contextlib import contextmanager
+
+
+def neo4j_config():
+    """Read the ``[Neo4j]`` section of the local ``config`` file as driver kwargs."""
+    config = ConfigParser()
+    config.read('config')
+    section = config['Neo4j']
+    return dict(uri=section['NEO4J_URI'],
+                user=section['NEO4J_USERNAME'],
+                password=section['NEO4J_PASSWORD'])
+
+
+class Database:
+    """Thin holder for a verified Neo4j driver and its session factory."""
+
+    def __init__(self, uri, user, password):
+        self.driver = GraphDatabase.driver(uri, auth=(user, password))
+        try:
+            self.driver.verify_connectivity()
+        except Exception:
+            # Do not leak the driver when the server is unreachable.
+            self.driver.close()
+            raise
+        self.session = self.driver.session
+
+
+@contextmanager
+def database_connection():
+    """Yield a connected ``Database`` and always close its driver afterwards.
+
+    ``Warning`` exceptions raised inside the ``with`` body are printed and
+    suppressed; every other exception propagates unchanged.
+    """
+    database = Database(**neo4j_config())
+    try:
+        yield database
+    except Warning as w:
+        print(w)
+    finally:
+        database.driver.close()
+
+
+if __name__ == '__main__':
+    with database_connection() as database:
+        print(database.driver.execute_query('match (n) return count(n)'))
diff --git a/nodes_from_neo4j.py b/nodes_from_neo4j.py
new file mode 100644
index 0000000..aa5e32f
--- /dev/null
+++ b/nodes_from_neo4j.py
@@ -0,0 +1,65 @@
+from neo4j.graph import Node
+
+from nodes_scheme import \
+    ATTRIBUTES, \
+    AbcNodeInterface, \
+    AbcOparlPaperInterface, \
+    AbcOparlPersonInterface, \
+    AbcOparlOrganizationInterface, \
+    AbcOparlLocationInterface
+
+
+class Paper(AbcOparlPaperInterface):
+    """Paper node backed by a ``neo4j.graph.Node`` read from the database."""
+
+    _content: Node
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.get('oparl_id')
+
+    @ATTRIBUTES.MODIFIED
+    def modified(self):
+        return self._content.get('modified')
+
+    @ATTRIBUTES.REFERENCE.as_primary
+    def reference(self):
+        return self._content.get('reference')
+
+    @ATTRIBUTES.PAPER_TYPE
+    def paper_type(self):
+        return self._content.get('paper_type')
+
+    @ATTRIBUTES.WEB_URL
+    def web_url(self):
+        return self._content.get('web_url')
+
+    @ATTRIBUTES.ORIGIN_DATE
+    def origin_date(self):
+        return self._content.get('origin_date')
+
+    def directors(self):
+        # TODO: load DIRECTED relations from the database. Until then return
+        # an empty iterator so callers can loop over the result safely.
+        return iter(())
+
+
+# NOTE(review): only Paper is concrete; the three Abc* entries still have
+# abstract methods and cannot be instantiated until subclasses exist.
+factory_mapping = [Paper,
+                   AbcOparlPersonInterface,
+                   AbcOparlOrganizationInterface,
+                   AbcOparlLocationInterface]
+
+
+def node_factory(result):
+    """Wrap a Neo4j node in the interface class whose labels match exactly.
+
+    Returns ``None`` when no registered class carries the node's label set.
+    """
+    # Node.labels is an unordered set, so compare as sets, not as lists.
+    labels = set(result.labels)
+    for cls in factory_mapping:
+        if set(cls._labels) == labels:
+            return cls(result)
+    return None
diff --git a/nodes_from_oparl.py b/nodes_from_oparl.py
new file mode 100644
index 0000000..f4214b6
--- /dev/null
+++ b/nodes_from_oparl.py
@@ -0,0 +1,197 @@
+from oparl_objects import BasicOparl as OparlBasic
+from oparl_objects import Paper as OparlPaper
+from oparl_objects import Person as OparlPerson
+from oparl_objects import Organization as OparlOrganization
+from oparl_objects import Location as OparlLocation
+from oparl_objects import oparl_factory as oparl_factory
+import fakerequest as request
+import json
+
+from nodes_scheme import \
+    RELATIONS, \
+    ATTRIBUTES, \
+    AbcNodeInterface, \
+    AbcOparlPaperInterface, \
+    AbcOparlPersonInterface, \
+    AbcOparlOrganizationInterface, \
+    AbcOparlLocationInterface
+
+
+class UnknownOparlNode(AbcNodeInterface):
+    """Placeholder for an OParl object known only by its URL."""
+
+    _content: OparlBasic
+    _labels = []
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.oparl_id
+
+
+def converted_get_request(node: UnknownOparlNode):
+    """Fetch the object behind ``node``'s URL and convert it to a typed node.
+
+    Returns ``None`` when the request does not answer with state 200.
+    """
+    url = node.oparl_id.value()
+    response = request.get(url)
+    if response.state == 200:
+        content = json.loads(response.content)
+        return node_factory(oparl_factory(content))
+    return None
+
+
+def _resolved_node(oparl_obj):
+    """Convert ``oparl_obj`` to a node, dereferencing URL-only placeholders.
+
+    Falls back to the placeholder itself when the lookup fails.
+    """
+    node = node_factory(oparl_obj)
+    if isinstance(node, UnknownOparlNode):
+        node = converted_get_request(node) or node
+    return node
+
+
+class OparlPaperNode(AbcOparlPaperInterface):
+    """Paper node backed by a parsed OParl ``Paper`` object."""
+
+    _content: OparlPaper
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.oparl_id
+
+    @ATTRIBUTES.MODIFIED
+    def modified(self):
+        return self._content.modified
+
+    @ATTRIBUTES.REFERENCE.as_primary
+    def reference(self):
+        return self._content.reference
+
+    @ATTRIBUTES.PAPER_TYPE
+    def paper_type(self):
+        return self._content.paper_type
+
+    @ATTRIBUTES.WEB_URL
+    def web_url(self):
+        return self._content.web_url
+
+    @ATTRIBUTES.ORIGIN_DATE
+    def origin_date(self):
+        return self._content.origin_date
+
+    @RELATIONS.DIRECTED.as_generator
+    def directors(self):
+        """Yield ``(director_node, self)`` pairs for ``underDirectionOf``."""
+        # The raw items are OParl objects, never nodes: convert first, then
+        # dereference URL-only placeholders.
+        for director in self._content.under_direction_of:
+            yield _resolved_node(director), self
+
+    @RELATIONS.SUBMITTED.as_generator
+    def originators(self):
+        """Yield ``(originator_node, self)`` pairs for ``originatorPerson``."""
+        for originator in self._content.originator_persons:
+            yield _resolved_node(originator), self
+
+    # TODO: _content.consultations
+
+
+class OparlPersonNode(AbcOparlPersonInterface):
+    """Person node backed by a parsed OParl ``Person`` object."""
+
+    _content: OparlPerson
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.oparl_id
+
+    @ATTRIBUTES.MODIFIED
+    def modified(self):
+        return self._content.modified
+
+    @ATTRIBUTES.NAME
+    def name(self):
+        return self._content.name
+
+    @ATTRIBUTES.WEB_URL
+    def web_url(self):
+        return self._content.web_url
+
+
+class OparlOrganizationNode(AbcOparlOrganizationInterface):
+    """Organization node backed by a parsed OParl ``Organization`` object."""
+
+    _content: OparlOrganization
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.oparl_id
+
+    @ATTRIBUTES.MODIFIED
+    def modified(self):
+        return self._content.modified
+
+    @ATTRIBUTES.NAME
+    def name(self):
+        return self._content.name
+
+    @ATTRIBUTES.START_DATE
+    def start_date(self):
+        return self._content.start_date
+
+    @ATTRIBUTES.END_DATE
+    def end_date(self):
+        return self._content.end_date
+
+
+class OparlLocationNode(AbcOparlLocationInterface):
+    """Location node backed by a parsed OParl ``Location`` object."""
+
+    _content: OparlLocation
+
+    @ATTRIBUTES.OPARL_ID.as_primary
+    def oparl_id(self):
+        return self._content.oparl_id
+
+    @ATTRIBUTES.MODIFIED
+    def modified(self):
+        return self._content.modified
+
+    @ATTRIBUTES.LOCALITY
+    def locality(self):
+        return self._content.locality
+
+    @ATTRIBUTES.POSTAL_CODE
+    def postal_code(self):
+        return self._content.postal_code
+
+    @ATTRIBUTES.DESCRIPTION
+    def description(self):
+        return self._content.description
+
+    @ATTRIBUTES.STREET_ADDRESS
+    def street_address(self):
+        return self._content.street_address
+
+
+factory_mapping = {OparlPerson: OparlPersonNode,
+                   OparlPaper: OparlPaperNode,
+                   OparlOrganization: OparlOrganizationNode,
+                   OparlLocation: OparlLocationNode,
+                   OparlBasic: UnknownOparlNode}
+
+
+def node_factory(oparl_obj):
+    """Wrap an OParl object in its node class.
+
+    Types without an entry of their own (e.g. ``Membership``) resolve through
+    their base classes and end up as ``UnknownOparlNode``.
+    """
+    if not isinstance(oparl_obj, OparlBasic):
+        raise TypeError(f'expected an OParl object, got {type(oparl_obj)}')
+    for base in type(oparl_obj).__mro__:
+        node_cls = factory_mapping.get(base)
+        if node_cls is not None:
+            return node_cls(oparl_obj)
diff --git a/nodes_scheme.py b/nodes_scheme.py
new file mode 100644
index 0000000..4514a03
--- /dev/null
+++ b/nodes_scheme.py
@@ -0,0 +1,280 @@
+from abc import abstractmethod, ABC
+from functools import wraps
+from typing import Any
+
+
+class AbcNodeInterface(ABC):
+    """Common base of all graph nodes: labels, attributes and relations."""
+
+    _content: Any
+    _labels: list
+
+    def __init__(self, content):
+        self._content = content
+
+    @property
+    def labels(self):
+        # A tuple (not a generator) so emptiness checks and repeated
+        # iteration by callers behave as expected.
+        return tuple(self._labels)
+
+    def _hooked(self, hook_type):
+        """Yield the value of every property of ``hook_type`` once per name."""
+        seen = set()
+        for cls in self.__class__.mro():
+            for key, attr in cls.__dict__.items():
+                if isinstance(attr, hook_type) and key not in seen:
+                    seen.add(key)
+                    yield getattr(self, key)
+
+    def attributes(self):
+        """Yield a ``DbAttribute`` for every attribute hook on the class."""
+        return self._hooked(DbAttributeHook)
+
+    def primary_keys(self):
+        """Yield only the attributes flagged as primary keys."""
+        return (attr for attr in self.attributes() if attr.is_primary())
+
+    def non_primary_keys(self):
+        """Yield only the attributes not flagged as primary keys."""
+        return (attr for attr in self.attributes() if not attr.is_primary())
+
+    def relations(self):
+        """Yield the value of every ``DbRelationHook`` property on the class."""
+        return self._hooked(DbRelationHook)
+
+
+class DbAttribute:
+    """Lazily evaluated key/value pair of a node, with a primary-key flag."""
+
+    __slots__ = ('_get_key', '_get_value', '_is_primary')
+
+    def __init__(self, key_getter, value_getter, is_primary):
+        self._get_key = key_getter
+        self._get_value = value_getter
+        self._is_primary = is_primary
+
+    def key(self):
+        return self._get_key()
+
+    def value(self):
+        return self._get_value()
+
+    def is_primary(self):
+        return self._is_primary
+
+    def __iter__(self):
+        # Single (key, value) pair, so ``for key, value in attribute`` works.
+        yield self.key(), self.value()
+
+    def __eq__(self, other):
+        # Foreign types are "not comparable" rather than an AssertionError,
+        # and a mismatch is an explicit False instead of None.
+        if not isinstance(other, self.__class__):
+            return NotImplemented
+        return self.key() == other.key() and self.value() == other.value()
+
+
+class DbAttributeHook(property):
+    """Marker property type for node attributes."""
+
+
+class DbRelationHook(property):
+    """Marker property type for node relations."""
+
+
+class DbAttributeFactory:
+    """Decorator factory turning a getter into a ``DbAttributeHook``."""
+
+    __slots__ = ('_key',)
+
+    def __init__(self, key: str):
+        self._key = key
+
+    def key(self):
+        return self._key
+
+    def __call__(self, func, is_primary=False):
+        def get_property(instance):
+            return DbAttribute(self.key, lambda: func(instance), is_primary)
+
+        return DbAttributeHook(get_property)
+
+    def as_primary(self, func):
+        return self(func, is_primary=True)
+
+
+class DbRelation:
+    """Typed, directed edge between two nodes."""
+
+    __slots__ = ('relation_type', 'source', 'target')
+    relation_type: str
+    source: AbcNodeInterface
+    target: AbcNodeInterface
+
+    def __init__(self, source, rel_type, target):
+        self.relation_type = rel_type
+        self.source = source
+        self.target = target
+
+
+class DbRelationFactory:
+    """Decorator factory building ``DbRelation`` objects of one type."""
+
+    __slots__ = ('_relation_type', )
+
+    def __init__(self, relation_type: str):
+        self._relation_type = relation_type
+
+    def __call__(self, func, cls=DbRelation):
+        # NOTE(review): assumes ``func(instance)`` returns one
+        # ``(source, target)`` pair, mirroring as_generator -- confirm.
+        def get_relation(instance):
+            source, target = func(instance)
+            # DbRelation takes (source, rel_type, target), in that order.
+            return cls(source, self._relation_type, target)
+        return DbRelationHook(get_relation)
+
+    def with_class(self, cls):
+        if not issubclass(cls, DbRelation):
+            raise TypeError(f'{cls!r} is not a DbRelation subclass')
+        return lambda x: self(x, cls=cls)
+
+    def as_generator(self, func, cls=DbRelation):
+        """Wrap ``func`` so each ``(source, target)`` it yields becomes a relation."""
+        @wraps(func)
+        def relation_generator(*args):
+            for source, target in func(*args):
+                # DbRelation takes (source, rel_type, target), in that order.
+                yield cls(source, self._relation_type, target)
+        return relation_generator
+
+    def as_generator_with_class(self, cls):
+        if not issubclass(cls, DbRelation):
+            raise TypeError(f'{cls!r} is not a DbRelation subclass')
+        return lambda x: self.as_generator(x, cls=cls)
+
+
+class LABELS(ABC):
+    OPARL = 'Oparl'
+    LEGIS_TERM = 'LegisTerm'
+    THREAD = 'Thread'
+    PAPER = 'Paper'
+    NAMED_ENTITY = 'NamedEntity'
+    PERSON = 'Person'
+    ORGANIZATION = 'Organization'
+    LOCATION = 'Location'
+
+
+class RELATIONS(ABC):
+    IS_MEMBER = DbRelationFactory('IS_MEMBER')
+    LOCATED = DbRelationFactory('LOCATED')
+    PART_OF = DbRelationFactory('PART_OF')
+    DIRECTED = DbRelationFactory('DIRECTED')
+    SUBMITTED = DbRelationFactory('SUBMITTED')
+
+
+class ATTRIBUTES(ABC):
+    DESCRIPTION = DbAttributeFactory('description')
+    LOCALITY = DbAttributeFactory('locality')
+    NAME = DbAttributeFactory('name')
+    MODIFIED = DbAttributeFactory('modified')
+    OPARL_ID = DbAttributeFactory('oparl_id')
+    ORIGIN_DATE = DbAttributeFactory('origin_date')
+    PAPER_TYPE = DbAttributeFactory('paper_type')
+    POSTAL_CODE = DbAttributeFactory('postal_code')
+    REFERENCE = DbAttributeFactory('reference')
+    START_DATE = DbAttributeFactory('start_date')
+    STREET_ADDRESS = DbAttributeFactory('street_address')
+    END_DATE = DbAttributeFactory('end_date')
+    WEB_URL = DbAttributeFactory('web_url')
+
+
+class AbcOparlPaperInterface(AbcNodeInterface):
+    _labels = [LABELS.OPARL,
+               LABELS.PAPER]
+
+    @abstractmethod
+    def oparl_id(self): pass
+
+    @abstractmethod
+    def modified(self): pass
+
+    @abstractmethod
+    def reference(self): pass
+
+    @abstractmethod
+    def paper_type(self): pass
+
+    @abstractmethod
+    def web_url(self): pass
+
+    @abstractmethod
+    def origin_date(self): pass
+
+    @abstractmethod
+    def directors(self): pass
+
+
+class AbcOparlPersonInterface(AbcNodeInterface):
+    _labels = [LABELS.OPARL,
+               LABELS.NAMED_ENTITY,
+               LABELS.PERSON]
+
+    @abstractmethod
+    def oparl_id(self): pass
+
+    @abstractmethod
+    def modified(self): pass
+
+    @abstractmethod
+    def name(self): pass
+
+    @abstractmethod
+    def web_url(self): pass
+
+
+class AbcOparlOrganizationInterface(AbcNodeInterface):
+    _labels = [LABELS.OPARL,
+               LABELS.NAMED_ENTITY,
+               LABELS.ORGANIZATION]
+
+    @abstractmethod
+    def oparl_id(self): pass
+
+    @abstractmethod
+    def modified(self): pass
+
+    @abstractmethod
+    def name(self): pass
+
+    @abstractmethod
+    def start_date(self): pass
+
+    @abstractmethod
+    def end_date(self): pass
+
+
+class AbcOparlLocationInterface(AbcNodeInterface):
+    _labels = [LABELS.OPARL,
+               LABELS.NAMED_ENTITY,
+               LABELS.LOCATION]
+
+    @abstractmethod
+    def oparl_id(self): pass
+
+    @abstractmethod
+    def modified(self): pass
+
+    @abstractmethod
+    def locality(self): pass
+
+    @abstractmethod
+    def postal_code(self): pass
+
+    @abstractmethod
+    def description(self): pass
+
+    @abstractmethod
+    def street_address(self): pass
+
diff --git a/oparl_objects.py b/oparl_objects.py
new file mode 100644
index 0000000..0671e19
--- /dev/null
+++ b/oparl_objects.py
@@ -0,0 +1,284 @@
+from functools import wraps
+from typing import Generator
+from datetime import date, datetime
+from re import findall
+
+
+def as_date_type(func):
+    """Decorate a getter so its ISO date string is returned as ``date``."""
+    @wraps(func)
+    def convert_date(*args) -> date:
+        date_str = func(*args)
+        return date.fromisoformat(date_str) if date_str else None
+
+    return convert_date
+
+
+def as_datetime_type(func):
+    """Decorate a getter so its ISO timestamp is returned as ``datetime``."""
+    @wraps(func)
+    def convert_datetime(*args) -> datetime:
+        date_str = func(*args)
+        return datetime.fromisoformat(date_str) if date_str else None
+
+    return convert_datetime
+
+
+def as_simple_generator(func):
+    """Decorate a getter so a list result is yielded item by item.
+
+    Falsy items are skipped; non-list results yield nothing.
+    """
+    @wraps(func)
+    def generator(*args) -> Generator:
+        item = func(*args)
+        if isinstance(item, list):
+            for sub_item in item:
+                if sub_item:
+                    yield sub_item
+    return generator
+
+
+def as_oparl_object(func):
+    """Decorate a getter so its result is passed through ``oparl_factory``."""
+    @wraps(func)
+    def oparl_object(*args):
+        return oparl_factory(func(*args))
+    return oparl_object
+
+
+def as_oparl_object_generator(func):
+    """Decorate a list getter so every item is passed through ``oparl_factory``."""
+    items = as_simple_generator(func)
+
+    @wraps(func)
+    def oparl_object_generator(*args):
+        for item in items(*args):
+            yield oparl_factory(item)
+    return oparl_object_generator
+
+
+class BasicOparl:
+    """Read-only view on an OParl JSON object; base of all typed objects."""
+
+    _content: dict
+
+    def __init__(self, content: dict):
+        self._content = content
+
+    @property
+    def oparl_id(self) -> str:
+        return self._content.get('id')
+
+    @property
+    def oparl_type(self) -> str:
+        return self._content.get('type')
+
+    @property
+    @as_datetime_type
+    def modified(self):
+        return self._content.get('modified')
+
+    @property
+    def is_deleted(self):
+        return self._content.get('deleted')
+
+
+class Paper(BasicOparl):
+    """View on an OParl ``Paper`` object."""
+
+    @property
+    def subject(self) -> str:
+        return self._content.get('name')
+
+    @property
+    def reference(self) -> str:
+        return self._content.get('reference')
+
+    @property
+    def legis_term(self) -> str:
+        reference = self.reference
+        if isinstance(reference, str):
+            hits = findall(r'^[XIV]+', reference)
+            return hits[0] if hits else None
+
+    @property
+    def thread_number(self) -> str:
+        reference = self.reference
+        if isinstance(reference, str):
+            hits = findall(r'\d{5}', reference)
+            return hits[0] if hits else None
+
+    @property
+    @as_date_type
+    def origin_date(self):
+        return self._content.get('date')
+
+    @property
+    def paper_type(self) -> str:
+        return self._content.get('paperType')
+
+    @property
+    def file_url(self) -> str:
+        main_file = self._content.get('mainFile')
+        if isinstance(main_file, dict) and not main_file.get('deleted'):
+            return main_file.get('accessUrl')
+
+    @property
+    @as_oparl_object_generator
+    def originator_persons(self):
+        return self._content.get('originatorPerson')
+
+    @property
+    @as_oparl_object_generator
+    def under_direction_of(self):
+        return self._content.get('underDirectionOf')
+
+    @property
+    @as_oparl_object_generator
+    def consultations(self):
+        return self._content.get('consultations')
+
+    @property
+    def web_url(self) -> str:
+        return self._content.get('web')
+
+
+class Person(BasicOparl):
+    """View on an OParl ``Person`` object."""
+
+    @property
+    def name(self) -> str:
+        return self._content.get('name')
+
+    @property
+    @as_oparl_object
+    def location(self) -> (str, dict):
+        loc_obj = self._content.get('locationObject')
+        if loc_obj:
+            return loc_obj
+        else:
+            return self._content.get('location')
+
+    @property
+    @as_simple_generator
+    def status(self):
+        return self._content.get('status')
+
+    @property
+    def web_url(self) -> str:
+        return self._content.get('web')
+
+    @property
+    @as_oparl_object_generator
+    def memberships(self):
+        return self._content.get('membership')
+
+
+class Organization(BasicOparl):
+    """View on an OParl ``Organization`` object."""
+
+    @property
+    def name(self) -> str:
+        return self._content.get('name')
+
+    @property
+    @as_oparl_object
+    def location(self) -> (str, dict):
+        loc_obj = self._content.get('locationObject')
+        if loc_obj:
+            return loc_obj
+        else:
+            return self._content.get('location')
+
+    @property
+    @as_date_type
+    def start_date(self):
+        return self._content.get('startDate')
+
+    @property
+    @as_date_type
+    def end_date(self):
+        return self._content.get('endDate')
+
+    @property
+    @as_oparl_object_generator
+    def memberships(self):
+        return self._content.get('membership')
+
+
+class Location(BasicOparl):
+    """View on an OParl ``Location`` object."""
+
+    @property
+    def locality(self) -> str:
+        return self._content.get('locality')
+
+    @property
+    def postal_code(self) -> str:
+        return self._content.get('postalCode')
+
+    @property
+    def description(self) -> str:
+        return self._content.get('description')
+
+    @property
+    def street_address(self) -> str:
+        return self._content.get('streetAddress')
+
+
+class Membership(BasicOparl):
+    """View on an OParl ``Membership`` object."""
+
+    @property
+    @as_oparl_object
+    def person(self) -> (str, dict):
+        return self._content.get('person')
+
+    @property
+    @as_oparl_object
+    def organization(self) -> (str, dict):
+        return self._content.get('organization')
+
+    @property
+    def voting_right(self) -> bool:
+        return self._content.get('votingRight')
+
+    @property
+    def role(self) -> str:
+        return self._content.get('role')
+
+    @property
+    @as_date_type
+    def start_date(self):
+        return self._content.get('startDate')
+
+    @property
+    @as_date_type
+    def end_date(self):
+        return self._content.get('endDate')
+
+
+fabric_dict = {"https://schema.oparl.org/1.1/Paper": Paper,
+               "https://schema.oparl.org/1.1/Person": Person,
+               "https://schema.oparl.org/1.1/Organization": Organization,
+               "https://schema.oparl.org/1.1/Location": Location,
+               "https://schema.oparl.org/1.1/Membership": Membership}
+
+
+def oparl_factory(item: (str, dict)):
+    """Build the OParl object for a URL string or an object dict.
+
+    ``None`` passes through; a URL becomes an id-only ``BasicOparl``; a dict is
+    dispatched on its ``type``, unknown types falling back to ``BasicOparl``.
+    Raises ``TypeError`` for any other input, including dicts without ``type``.
+    """
+    if item is None:
+        return None
+    if isinstance(item, str) and item.startswith('http'):
+        return BasicOparl(dict(id=item))
+    if isinstance(item, dict) and item.get('type') is not None:
+        # A type missing from fabric_dict is kept as a generic object.
+        return fabric_dict.get(item['type'], BasicOparl)(item)
+    message = f'unsupported item {item} type {type(item)}, expected url_str or dict with key "type"'
+    raise TypeError(message)
diff --git a/scraping.py b/scraping.py
new file mode 100644
index 0000000..4ca2dec
--- /dev/null
+++ b/scraping.py
@@ -0,0 +1,70 @@
+import json
+import fakerequest as request
+from fakerequest import START_URL
+from oparl_objects import oparl_factory
+from neo_connector import database_connection, Session
+from nodes_from_oparl import node_factory as oparl_node_factory
+from statements import full_merge, retrieve_single
+from nodes_scheme import AbcOparlPaperInterface
+from nodes_from_oparl import UnknownOparlNode
+
+
+def pages(start_url, max_pages=1):
+    """Yield the ``data`` list of up to ``max_pages`` result pages.
+
+    Follows ``links.next`` from ``start_url`` and stops at the first request
+    that does not answer with state 200.
+    """
+    url = start_url
+    page_count = 0
+    while url and page_count < max_pages:
+        response = request.get(url)
+        page_count += 1
+        if response.state != 200:
+            # Retrying the same URL would only burn the page budget.
+            break
+        content = json.loads(response.content)
+        yield content.get('data') or []
+        url = (content.get('links') or {}).get('next')
+
+
+def converted_item_on(page):
+    """Yield a node for every raw OParl item on ``page``."""
+    for item in page:
+        yield oparl_node_factory(oparl_factory(item))
+
+
+def converted_get_request(node: UnknownOparlNode):
+    """Fetch the object behind ``node``'s URL and convert it to a typed node.
+
+    Returns ``None`` when the request does not answer with state 200.
+    """
+    url = node.oparl_id.value()
+    response = request.get(url)
+    if response.state == 200:
+        content = json.loads(response.content)
+        return oparl_node_factory(oparl_factory(content))
+    return None
+
+
+def scrapping(db_con, start_url):
+    """Walk all papers from ``start_url`` and print their relations."""
+    for page in pages(start_url):
+        for node_paper in converted_item_on(page):
+            with db_con.session() as session:
+                session: Session
+                #db_node: AbcOparlPaperInterface = session.execute_write(retrieve_single, node_paper)
+                #if db_node.modified == node_paper.modified: continue
+                for director in node_paper.directors():
+                    print(director)
+                for originator in node_paper.originators():
+                    print(originator)
+
+
+if __name__ == '__main__':
+    with database_connection() as dbc:
+        scrapping(dbc, START_URL)
+
+
+#print(connection.driver.execute_query('match (n) return count(n)'))
+#connection.driver.close()
diff --git a/statements.py b/statements.py
new file mode 100644
index 0000000..b566de5
--- /dev/null
+++ b/statements.py
@@ -0,0 +1,99 @@
+from nodes_from_neo4j import node_factory as neo4j_node_factory
+from nodes_scheme import AbcNodeInterface, DbAttribute
+
+
+def retrieve_single(tx, node):
+    """Load the stored node matching ``node``'s labels and primary keys.
+
+    Returns ``None`` when the database holds no such node.
+    """
+    ref = 'n'
+    match_, parameter = prepare_match_by_primary(node, ref=ref)
+    return_, _ = prepare_return(ref)
+
+    # single() yields None for an empty result; guard before .value().
+    record = tx.run('\n'.join((match_, return_)), parameter).single()
+    if record is None:
+        return None
+    return neo4j_node_factory(record.value())
+
+
+def full_merge(tx, node):
+    """MERGE ``node`` by primary keys, set the rest on create, return it."""
+    ref = 'n'
+    match_, p1 = prepare_merge_by_primary(node, ref=ref)
+    on_create, p2 = prepare_create_set(node, ref=ref)
+    return_, _ = prepare_return(ref)
+    parameter = {**p1, **p2}
+    result = tx.run('\n'.join((match_, on_create, return_)), parameter)
+    result = result.single().value()
+    return neo4j_node_factory(result)
+
+
+def delete_all(tx, *_):
+    """Detach-delete every node in the database."""
+    return tx.run('MATCH (n) DETACH DELETE n').to_eager_result()
+
+
+def _primary_pattern(node_interface, ref):
+    """Build the ``(ref:Labels {key:$param,...})`` pattern plus its parameters."""
+    assignments = []
+    parameter = {}
+    for attribute in node_interface.primary_keys():
+        for key, value in attribute:
+            p_key = f'{ref}X{key}'
+            assignments.append(f'{key}:${p_key}')
+            parameter[p_key] = value
+
+    # Consume labels exactly once: they may be a one-shot iterable, and an
+    # empty label list must not leave a dangling ':' in the pattern.
+    labels = ''.join(f':{label}' for label in node_interface.labels)
+    return f'({ref}{labels} {{{",".join(assignments)}}})', parameter
+
+
+def prepare_match_by_primary(node_interface: AbcNodeInterface, ref='n') -> tuple[str, dict]:
+    """Return a ``MATCH`` clause on labels and primary keys with its parameters."""
+    pattern, parameter = _primary_pattern(node_interface, ref)
+    return f'MATCH {pattern}', parameter
+
+
+def prepare_merge_by_primary(node_interface: AbcNodeInterface, ref='n') -> tuple[str, dict]:
+    """Return a ``MERGE`` clause on labels and primary keys with its parameters."""
+    pattern, parameter = _primary_pattern(node_interface, ref)
+    return f'MERGE {pattern}', parameter
+
+
+def prepare_create_set(node_interface: AbcNodeInterface, ref='n'):
+    """Return an ``ON CREATE SET`` clause for all non-primary attributes.
+
+    The clause is empty when there is nothing to set, since a bare
+    ``ON CREATE SET`` is not valid Cypher.
+    """
+    parameter = dict()
+    keys = list()
+
+    for attribute in node_interface.non_primary_keys():
+        for key, value in attribute:
+            p_key = f'{ref}X{key}'
+            keys.append(f'{ref}.{key}=${p_key}')
+            parameter.update({p_key: value})
+
+    statement = f'ON CREATE SET {",".join(keys)}' if keys else ''
+    return statement, parameter
+
+
+def prepare_return(*args) -> (str, dict):
+    """Build a ``RETURN`` clause.
+
+    Each argument is either a reference string or a tuple
+    ``(ref, attribute, ...)`` of a reference and ``DbAttribute`` objects.
+    """
+    pieces = []
+    for arg in args:
+        if isinstance(arg, str):
+            pieces.append(arg)
+        elif isinstance(arg, tuple):
+            ref = arg[0]
+            pieces.extend(f'{ref}.{a.key()}' for a in arg[1:])
+
+    return f'RETURN {",".join(pieces)}', {}