Merge pull request #29 from perrette/dev

Flexible key and file format
perrette · Apr 17, 2023 · 9abc1bd · 9abc1bd
2 parents e129964 + 6caf353
commit 9abc1bd
Show file tree

Hide file tree

Showing 11 changed files with 404 additions and 94 deletions.
diff --git a/README.md b/README.md
@@ -66,13 +66,52 @@ For the sake of the example, one of my owns: https://www.earth-syst-dynam.net/4/
     	INFO:papers:found doi:10.5194/esd-4-11-2013
     	INFO:papers:new entry: perrette_2013
     	INFO:papers:create directory: files/2013
-    	INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/2013/Perrette_2013.pdf
+    	INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/perrette_et_al_2013_a-scaling-approach-to-project-regional-sea-level-rise-and-its-uncertainties.pdf
     	INFO:papers:renamed file(s): 1
 
 (the `--info` argument asks for the above output information to be printed out to the terminal)
 
-In the common case where the bibtex (`--bibtex`) and files directory  (`--filesdir`) do not change,
-it is convenient to *install* `papers`.
+
+- control fields when renaming file
+
+        $> papers add --rename --info --name-template "{AuthorX}{year}-{Title}" --name-title-sep '' --name-author-sep '' esd-4-11-2013
+        INFO:papers:found doi:10.5194/esd-4-11-2013
+        INFO:papers:new entry: perrette2013scaling
+        INFO:papers:create directory: files/2013
+        INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/PerretteEtAl2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf
+        INFO:papers:renamed file(s): 1
+
+where '--name-template' is a python template (will be formatted via .format() method) with valid fields being any field available in the bibtex. Fields not in the bibtex will remain untouched.
+
+To rename `esd-4-11-2013.pdf` as `perrette_2013.pdf`, the template should be `--name-template {author}_{year} --name-author-num 1`
+If that happens to be the entry ID, `ID` also works.
+
+To rename `esd-4-11-2013.pdf` as `2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf`,
+name-template should be `--name-template {year}/{Author}{year}-{Title} --name-title-sep ''` (note the case).
+
+Entries are case-sensitive, and a few more fields are added, so that:
+- 'author' generates 'perrette'
+- 'Author' generates 'Perrette'
+- 'AUTHOR' generates 'PERRETTE'
+- 'authorX' generates 'perrette', 'perrette_and_landerer' or 'perrette_et_al' depending on the number of authors
+- 'AuthorX' same as authorX but capitalized
+
+The modifiers are:
+
+- `--name-title-sep` : separator for title words
+- `--name-title-length` : max title length
+- `--name-title-word-size` : min size to be considered a word
+- `--name-title-word-num` : max number of title words
+
+and similarly:
+
+- `--name-author-sep` : separator for authors
+- `--name-author-num` : number of authors to include (not relevant for `{authorX}`)
+
+The same template and modifiers system applies to the bibtex key generation by replacing the prefix `--name-` with `--key-`, e.g. `--key-template`
+
+
+In the common case where the bibtex (`--bibtex`), files directory  (`--filesdir`), and name and key formats (e.g. `--name-template`) do not change, it is convenient to *install* `papers`.
 Install comes with the option to git-track any change to the bibtex file (`--git`) options.
 
 - setup git-tracked library (optional)

diff --git a/papers/bib.py b/papers/bib.py
@@ -29,9 +29,6 @@
 
 # KEY GENERATION
 # ==============
-NAUTHOR = 2
-NTITLE = 0
-
 
 def append_abc(key, keys=[]):
     """
@@ -60,24 +57,6 @@ def append_abc(key, keys=[]):
     return Key
 
 
-def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, mintitlen=4, keys=None):
-    # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
-    names = family_names(entry.get('author','unknown').lower())
-    authortag = '_'.join([nm for nm in names[:nauthor]])
-    yeartag = entry.get('year','0000')
-    if not ntitle or not entry.get('title',''):
-        titletag = ''
-    else:
-        words = [word for word in entry['title'].lower().strip().split() if len(word) >= minwordlen]
-        while len(''.join(words[:ntitle])) < mintitlen and ntitle < len(words):
-            ntitle += 1
-        titletag = '_'.join(words[:ntitle])
-    key = authortag + yeartag + titletag
-    if keys and key in keys: # and not isinstance(keys, set):
-        key = append_abc(key, keys)
-    return key
-
-
 # DUPLICATE DEFINITION
 # ====================
 
@@ -114,12 +93,11 @@ def entry_id(e):
     return (e.get('doi','').lower(), authortitle)
 
 
-
-
 FUZZY_RATIO = 80
 
 # should be conservative (used in papers add)
 DEFAULT_SIMILARITY = 'FAIR'
+# DEFAULT_SIMILARITY = 'PARTIAL'
 
 EXACT_DUPLICATES = 104
 GOOD_DUPLICATES = 103
@@ -172,7 +150,7 @@ def are_duplicates(e1, e2, similarity=DEFAULT_SIMILARITY, fuzzy_ratio=FUZZY_RATI
     except KeyError:
         raise ValueError('similarity must be one of EXACT, GOOD, FAIR, PARTIAL, FUZZY')
 
-    score = compare_entries(e1, e2, fuzzy=level==FUZZY_DUPLICATES)
+    score = compare_entries(e1, e2, fuzzy=target==FUZZY_DUPLICATES)
     logger.debug('score: {}, target: {}, similarity: {}'.format(score, target, similarity))
     return score >= target
 
@@ -205,8 +183,6 @@ def read_entry_dir(self, direc, update_files=True):
     return entry
 
 
-
-
 def backupfile(bibtex):
     return os.path.join(os.path.dirname(bibtex), '.'+os.path.basename(bibtex)+'.backup')
 
@@ -216,7 +192,7 @@ class DuplicateKeyError(ValueError):
 class Biblio:
     """main config
     """
-    def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=NAUTHOR, ntitle=NTITLE, similarity=DEFAULT_SIMILARITY):
+    def __init__(self, db=None, filesdir=None, key_field='ID', nameformat=None, keyformat=None, similarity=DEFAULT_SIMILARITY):
         self.filesdir = filesdir
         # assume an already sorted list
         self.key_field = key_field
@@ -225,10 +201,10 @@ def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=NAUTHOR, ntit
         elif not isinstance(db, bibtexparser.bibdatabase.BibDatabase):
             raise TypeError('db must of type BibDatabase')
         self.db = db
-        self.sort()
-        self.nauthor = nauthor
-        self.ntitle = ntitle
+        self.nameformat = nameformat or config.nameformat
+        self.keyformat = keyformat or config.keyformat
         self.similarity = similarity
+        self.sort()
 
     @property
     def entries(self):
@@ -356,7 +332,10 @@ def insert_entry_check(self, entry, update_key=False, mergefiles=True, on_confli
     def generate_key(self, entry):
         " generate a unique key not yet present in the record "
         keys = {self.key(e) for e in self.db.entries}
-        return generate_key(entry, keys=keys, nauthor=self.nauthor, ntitle=self.ntitle)
+        key = self.keyformat(entry)
+        if keys and key in keys: # and not isinstance(keys, set):
+            key = append_abc(key, keys)
+        return key
 
     def append_abc_to_key(self, entry):
         return append_abc(entry['ID'], keys={self.key(e) for e in self.entries})
@@ -378,13 +357,19 @@ def fetch_doi(self, doi, **kw):
         self.add_bibtex(bibtex, **kw)
 
 
-    def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=True, search_fulltext=True, scholar=False, **kw):
+    def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=True, search_fulltext=True, scholar=False, doi=None, **kw):
 
-        bibtex = extract_pdf_metadata(pdf, search_doi, search_fulltext, scholar=scholar)
+        if doi:
+            bibtex = fetch_bibtex_by_doi(doi)
+        else:
+            bibtex = extract_pdf_metadata(pdf, search_doi, search_fulltext, scholar=scholar)
 
         bib = bibtexparser.loads(bibtex)
         entry = bib.entries[0]
 
+        # convert curly brackets to unicode
+        bibtexparser.customization.convert_to_unicode(entry)
+
         files = [pdf]
         if attachments:
             files += attachments
@@ -449,26 +434,45 @@ def check_duplicates(self, key=None, eq=None, mode='i'):
         self.sort() # keep sorted
 
 
-    def rename_entry_files(self, e, copy=False):
+    def rename_entry_files(self, e, copy=False, formatter=None):
+        """ Rename files
+
+        See `config.Format` class
+
+
+            To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be '{author}_{year}' with --name-author-num 1.
+            If that happens to be the entry ID, 'ID' also works.
+
+            To rename esd-4-11-2013.pdf as
+            2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
+            nameformat should be '{year}/{Author}{year}-{Title}' with --name-author-num 1 (note the case).
+
+            Entries are case-sensitive, so that:
+                'author' generates 'perrette'
+                'Author' generates 'Perrette'
+                'AUTHOR' generates 'PERRETTE'
+            any other case, like 'AuTHoR', will retrieve the field from 'e' with unaltered case.
+        """
 
         if self.filesdir is None:
             raise ValueError('filesdir is None, cannot rename entries')
 
         files = parse_file(e.get('file',''))
         # newname = entrydir(e, root)
-        direc = os.path.join(self.filesdir, e.get('year','0000'))
+
+        direc = self.filesdir
 
         if not files:
             logger.info('no files to rename')
             return
 
-        autoname = lambda e: e['ID'].replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
+        newname = (formatter or self.nameformat)(e)
 
         count = 0
         if len(files) == 1:
             file = files[0]
             base, ext = os.path.splitext(file)
-            newfile = os.path.join(direc, autoname(e)+ext)
+            newfile = os.path.join(direc, newname+ext)
             if not os.path.exists(file):
                 raise ValueError(file+': original file link is broken')
             elif file != newfile:
@@ -480,7 +484,7 @@ def rename_entry_files(self, e, copy=False):
 
         # several files: only rename container
         else:
-            newdir = os.path.join(direc, autoname(e))
+            newdir = os.path.join(direc, newname)
             newfiles = []
             for file in files:
                 newfile = os.path.join(newdir, os.path.basename(file))
@@ -769,6 +773,52 @@ def main():
     grp.add_argument('--dry-run', action='store_true',
         help='no PDF renaming/copying, no bibtex writing on disk (for testing)')
 
+    grp.add_argument('--key-template', default=config.keyformat.template,
+        help='python template for generating keys (default:%(default)s)')
+    grp.add_argument('--key-author-num', type=int, default=config.keyformat.author_num,
+        help='number of authors to include in key (default:%(default)s)')
+    grp.add_argument('--key-author-sep', default=config.keyformat.author_sep,
+        help='separator for authors in key (default:%(default)s)')
+    grp.add_argument('--key-title-word-num', type=int, default=config.keyformat.title_word_num,
+        help='number of title words to include in key (default:%(default)s)')
+    grp.add_argument('--key-title-word-size', type=int, default=config.keyformat.title_word_size,
+        help='number of title words to include in key (default:%(default)s)')
+    grp.add_argument('--key-title-sep', default=config.keyformat.title_sep,
+        help='separator for title words in key (default:%(default)s)')
+
+    grp.add_argument('--name-template', default=config.nameformat.template,
+        help='python template for renaming files (default:%(default)s)')
+    grp.add_argument('--name-author-num', type=int, default=config.nameformat.author_num,
+        help='number of authors to include in filename (default:%(default)s)')
+    grp.add_argument('--name-author-sep', default=config.nameformat.author_sep,
+        help='separator for authors in filename (default:%(default)s)')
+    grp.add_argument('--name-title-word-num', type=int, default=config.nameformat.title_word_num,
+        help='number of title words to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-word-size', type=int, default=config.nameformat.title_word_size,
+        help='min size of title words to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-length', type=int, default=config.nameformat.title_length,
+        help='title length to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-sep', default=config.nameformat.title_sep,
+        help='separator for title words in filename (default:%(default)s)')
+
+
+    def set_format_config_from_cmd(o):
+        config.keyformat.template = o.key_template
+        config.keyformat.author_num = o.key_author_num
+        config.keyformat.author_sep = o.key_author_sep
+        config.keyformat.title_word_num = o.key_title_word_num
+        config.keyformat.title_word_size = o.key_title_word_size
+        config.keyformat.title_sep = o.key_title_sep
+
+        config.nameformat.template = o.name_template
+        config.nameformat.author_num = o.name_author_num
+        config.nameformat.author_sep = o.name_author_sep
+        config.nameformat.title_length = o.name_title_length
+        config.nameformat.title_word_num = o.name_title_word_num
+        config.nameformat.title_word_size = o.name_title_word_size
+        config.nameformat.title_sep = o.name_title_sep
+
+
     # status
     # ======
     statusp = subparsers.add_parser('status',
@@ -922,6 +972,7 @@ def savebib(my, o):
         help='ignore errors when adding multiple files')
 
     grp = addp.add_argument_group('pdf metadata')
+    grp.add_argument('--doi', help='provide DOI -- skip parsing PDF')
     grp.add_argument('--no-query-doi', action='store_true', help='do not attempt to parse and query doi')
     grp.add_argument('--no-query-fulltext', action='store_true', help='do not attempt to query fulltext in case doi query fails')
     grp.add_argument('--scholar', action='store_true', help='use google scholar instead of crossref')
@@ -936,7 +987,6 @@ def savebib(my, o):
 
 
     def addcmd(o):
-
         if os.path.exists(o.bibtex):
             my = Biblio.load(o.bibtex, o.filesdir)
         else:
@@ -964,7 +1014,7 @@ def addcmd(o):
                     my.add_pdf(file, attachments=o.attachment, rename=o.rename, copy=o.copy,
                             search_doi=not o.no_query_doi,
                             search_fulltext=not o.no_query_fulltext,
-                            scholar=o.scholar,
+                            scholar=o.scholar, doi=o.doi,
                             **kw)
 
                 else: # file.endswith('.bib'):
@@ -994,8 +1044,8 @@ def addcmd(o):
     grp.add_argument('--fix-key', action='store_true', help='fix key based on author name and date (in case misssing or digit)')
     grp.add_argument('--key-ascii', action='store_true', help='replace keys unicode character with ascii')
     grp.add_argument('--auto-key', action='store_true', help='new, auto-generated key for all entries')
-    grp.add_argument('--nauthor', type=int, default=NAUTHOR, help='number of authors to include in key (default:%(default)s)')
-    grp.add_argument('--ntitle', type=int, default=NTITLE, help='number of title words to include in key (default:%(default)s)')
+#     grp.add_argument('--nauthor', type=int, default=config.nauthor, help='number of authors to include in key (default:%(default)s)')
+#     grp.add_argument('--ntitle', type=int, default=config.ntitle, help='number of title words to include in key (default:%(default)s)')
     # grp.add_argument('--ascii-key', action='store_true', help='replace unicode characters with closest ascii')
 
     grp = checkp.add_argument_group('crossref fetch and fix')
@@ -1201,7 +1251,11 @@ def longmatch(word, target):
         if o.duplicates_tit:
             entries = list_dup(entries, key=title_id)
         if o.duplicates:
-            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity=level, fuzzy_ratio=o.fuzzy_ratio)
+            # QUESTION MARK: in latest HEAD before merge with @malfatti's PR, I used hard-coded "PARTIAL".
+            # I think that's because we might need to be inclusive here, whereas the default is conservative (parameter used for several functions with possibly differing requirements).
+            # (otherwise we'd have used the command-line option o.similarity, or possibly DEFAULT_SIMILARITY)
+            # Might need to revise later (the question mark is from a review after a long time without use)
+            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity="PARTIAL", fuzzy_ratio=o.fuzzy_ratio)
             entries = list_dup(entries, eq=eq)
 
         def nfiles(e):
@@ -1243,7 +1297,7 @@ def nfiles(e):
                 print(e['ID'].encode('utf-8'))
         elif o.one_liner:
             for e in entries:
-                tit = e['title'][:60]+ ('...' if len(e['title'])>60 else '')
+                tit = e.get('title', '')[:60]+ ('...' if len(e.get('title', ''))>60 else '')
                 info = []
                 if e.get('doi',''):
                     info.append('doi:'+e['doi'])
@@ -1335,6 +1389,7 @@ def gitcmd(o):
         return statuscmd(o)
 
     def check_install():
+        set_format_config_from_cmd(o)
         if not os.path.exists(o.bibtex):
             print('papers: error: no bibtex file found, use `papers install` or `touch {}`'.format(o.bibtex))
             parser.exit(1)