From 6da96814759cb0d822c53b5cfe073b236a9f1280 Mon Sep 17 00:00:00 2001 From: Dan Davis Date: Thu, 4 Mar 2021 14:32:28 -0500 Subject: [PATCH] Add a MarcSearch class to memoize patterns. --- README.md | 22 ++++++++++++++++++++++ VERSION.txt | 2 +- pymarcspec/__init__.py | 2 +- pymarcspec/search.py | 34 +++++++++++++++++++++++++++++----- 4 files changed, 53 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0195297..dce459c 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,28 @@ with open(sys.argv[1], 'rb') as f: print(subjects) ``` +There is also a `MarcSearch` object that memoizes each search expression, so that +you can conveniently run a number of different searches without creating several +parsed specs. For example: + +```python +import csv +import sys +from pymarcspec import MarcSearch +from pymarc import MARCReader + +writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) +writer.writerow(['id', 'title', 'subjects']) + +marcsearch = MarcSearch() +with open(sys.argv[1], 'rb') as f: + for record in MARCReader(f): + control_id = marcsearch.search('001', record) + title = marcsearch.search('245[0]$a-c', record) + subjects = marcsearch.search('650$a', record, field_delimiter=', ') + writer.writerow([control_id, title, subjects]) +``` + ## Development ### Building the Parser diff --git a/VERSION.txt b/VERSION.txt index 8acdd82..4e379d2 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.0.1 +0.0.2 diff --git a/pymarcspec/__init__.py b/pymarcspec/__init__.py index cf6a287..01ceecb 100644 --- a/pymarcspec/__init__.py +++ b/pymarcspec/__init__.py @@ -1,4 +1,4 @@ from .parser import MarcSpecParser # noqa: from .semantics import MarcSearchSemantics # noqa: -from .search import MarcSearchParser # noqa: +from .search import MarcSearchParser, MarcSearch # noqa: from .model import MarcSpec # noqa: \ No newline at end of file diff --git a/pymarcspec/search.py b/pymarcspec/search.py index 592e326..ba21a78 100644 --- 
a/pymarcspec/search.py +++ b/pymarcspec/search.py @@ -9,7 +9,6 @@ from .semantics import MarcSearchSemantics -# memoize compiling of strings into AST using some searcher class MarcSearchParser(MarcSpecParser): def __init__(self, *args, **kwargs): kwargs.update({ @@ -17,19 +16,44 @@ def __init__(self, *args, **kwargs): 'semantics': MarcSearchSemantics() }) super().__init__(*args, **kwargs) + self.memoized = dict() + + +# memoize compiling of strings into specs +class MarcSearch: + """ + Memoizes compiled specifications to offset + cost of compiling each again and again. + + Can be used over multiple records and + multiple specs. + """ + def __init__(self): + self.parser = MarcSearchParser() + self.specs = dict() + + def parse(self, spec): + compiled_spec = self.specs.get(spec) + if compiled_spec is None: + self.specs[spec] = compiled_spec = self.parser.parse(spec) + return compiled_spec + + def search(self, spec, record, **kwargs): + compiled_spec = self.parse(spec) + return compiled_spec.search(record, **kwargs) def marc_search(marcspec, stream, field_delimiter=':', subfield_delimiter=''): - parser = MarcSearchParser() - spec = parser.parse(marcspec) + searcher = MarcSearch() + searcher.parse(marcspec) if stream.name.endswith('.xml'): generator = parse_xml_to_array(stream) else: generator = MARCReader(stream) for record in generator: - result = spec.search( - record, + result = searcher.search( + marcspec, record, field_delimiter=field_delimiter, subfield_delimiter=subfield_delimiter )