Fixed bug outputting to Kobo. New options.

pettarin · Nov 29, 2015 · 335292f · 335292f
1 parent 7e9df1c
commit 335292f
Show file tree

Hide file tree

Showing 24 changed files with 1,037 additions and 652 deletions.
diff --git a/README.md b/README.md
@@ -2,8 +2,8 @@
 
 **Penelope** is a multi-tool for creating, editing and converting dictionaries, especially for eReader devices.
 
-* Version: 3.0.1
-* Date: 2015-11-22
+* Version: 3.1.0
+* Date: 2015-11-29
 * Developer: [Alberto Pettarin](http://www.albertopettarin.it/)
 * License: the MIT License (MIT)
 * Contact: [click here](http://www.albertopettarin.it/contact.html)
@@ -78,7 +78,7 @@ You might need to install `dictzip` (StarDict output) and  `kindlegen` (MOBI out
     $ python -m penelope
     ```
 
-This procedure will not any dependencies, see below.
+This procedure does not install any dependencies: you need to install them manually (see below).
 
 
 ### Dependencies
@@ -97,9 +97,9 @@ This procedure will not any dependencies, see below.
     $ [sudo] pip install marisa-trie
     ```
 
-  or [`MARISA`](https://code.google.com/p/marisa-trie/) executables available in your `$PATH` or specified with `--marisa-bin-path`
+  or [MARISA](https://code.google.com/p/marisa-trie/) executables available in your `$PATH` or specified with `--marisa-bin-path`
 
-* to write MOBI Kindle dictionaries: the [`kindlegen`](https://www.amazon.com/gp/feature.html?docId=1000765211) executable, available in your `$PATH` or specified with `--kindlegen-path`
+* to write MOBI Kindle dictionaries: the [kindlegen](https://www.amazon.com/gp/feature.html?docId=1000765211) executable, available in your `$PATH` or specified with `--kindlegen-path`
 
 * to read/write XML dictionaries: the Python module `lxml`:
 
@@ -154,6 +154,8 @@ optional arguments:
   --title TITLE         title string
   --website WEBSITE     website string
   --year YEAR           year string
+  --apply-css APPLY_CSS
+                        apply the given CSS file (epub and mobi output only)
   --bookeen-collation-function BOOKEEN_COLLATION_FUNCTION
                         use the specified collation function
   --bookeen-install-file
@@ -165,29 +167,36 @@ optional arguments:
   --csv-ls CSV_LS       CSV line separator (default: '\n')
   --dictzip-path DICTZIP_PATH
                         path to dictzip executable
-  --epub-escape-strings
-                        escape HTML strings (default: False)
-  --epub-group-prefix-length EPUB_GROUP_PREFIX_LENGTH
-                        group headwords by prefix of given length (default: 3)
-  --epub-merge-group-size EPUB_MERGE_GROUP_SIZE
-                        merge headword groups with less than this number of
-                        headwords (default: 128)
-  --epub-output-definitions
-                        output definitions in addition to the headwords
-                        (default: False)
+  --epub-no-compress    do not create the compressed container (epub output
+                        only, default: False)
+  --escape-strings      escape HTML strings (default: False)
   --flatten-synonyms    flatten synonyms, creating a new entry with
                         headword=synonym and using the definition of the
                         original headword (default: False)
+  --group-by-prefix-function GROUP_BY_PREFIX_FUNCTION
+                        compute the prefix of headwords using the given prefix
+                        function file
+  --group-by-prefix-length GROUP_BY_PREFIX_LENGTH
+                        group headwords by prefix of given length (default: 2)
+  --group-by-prefix-merge-across-first
+                        merge headword groups even when the first character
+                        changes (default: False)
+  --group-by-prefix-merge-min-size GROUP_BY_PREFIX_MERGE_MIN_SIZE
+                        merge headword groups until the given minimum number
+                        of headwords is reached (default: 0, meaning no merge
+                        will take place)
+  --ignore-case         ignore headword case, all headwords will be lowercased
+                        (default: False)
+  --ignore-synonyms     ignore synonyms, not reading/writing them if present
+                        (default: False)
+  --include-index-page  include an index page (epub and mobi output only,
+                        default: False)
   --input-file-encoding INPUT_FILE_ENCODING
                         use the specified encoding for reading the raw
                         contents of input file(s) (default: 'utf-8')
   --input-parser INPUT_PARSER
                         use the specified parser function after reading the
                         raw contents of input file(s)
-  --ignore-case         ignore headword case, all headwords will be lowercased
-                        (default: False)
-  --ignore-synonyms     ignore synonyms, not reading/writing them if present
-                        (default: False)
   --kindlegen-path KINDLEGEN_PATH
                         path to kindlegen executable
   --marisa-bin-path MARISA_BIN_PATH
@@ -201,6 +210,8 @@ optional arguments:
                         | ')
   --mobi-no-kindlegen   do not run kindlegen, keep .opf and .html files
                         (default: False)
+  --no-definitions      do not output definitions for EPUB and MOBI formats
+                        (default: False)
   --sd-ignore-sametypesequence
                         ignore the value of sametypesequence in StarDict .ifo
                         files (default: False)
@@ -253,7 +264,6 @@ examples:
 
   $ penelope -i dict.xml -j xml -f en -t it -p mobi -o output.epub --epub-output-definitions
     As above, but also output definitions
-
 ```
 
 You can find ISO 639-1 language codes [here](http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).
@@ -338,6 +348,8 @@ were released under the GNU GPL 3 License.
 * Reading EPUB (3) dictionaries is not supported; the writing part needs polishing/refactoring
 * Reading PRC/MOBI (Kindle) dictionaries is not supported
 * There are some limitations on StarDict files that can be read (see comments in `format_stardict.py`)
+* Documentation is not complete
+* Unit tests are missing
 
 
 ## Acknowledgments 

diff --git a/README.rst b/README.rst
@@ -4,8 +4,8 @@ Penelope
 **Penelope** is a multi-tool for creating, editing and converting
 dictionaries, especially for eReader devices.
 
--  Version: 3.0.1
--  Date: 2015-11-22
+-  Version: 3.1.0
+-  Date: 2015-11-29
 -  Developer: `Alberto Pettarin <http://www.albertopettarin.it/>`__
 -  License: the MIT License (MIT)
 -  Contact: `click here <http://www.albertopettarin.it/contact.html>`__
@@ -96,7 +96,8 @@ From source code
 
        $ python -m penelope
 
-This procedure will not any dependencies, see below.
+This procedure does not install any dependencies: you need to
+install them manually (see below).
 
 Dependencies
 ~~~~~~~~~~~~
@@ -116,11 +117,11 @@ Dependencies
 
        $ [sudo] pip install marisa-trie
 
-or ```MARISA`` <https://code.google.com/p/marisa-trie/>`__ executables
+or `MARISA <https://code.google.com/p/marisa-trie/>`__ executables
 available in your ``$PATH`` or specified with ``--marisa-bin-path``
 
 -  to write MOBI Kindle dictionaries: the
-   ```kindlegen`` <https://www.amazon.com/gp/feature.html?docId=1000765211>`__
+   `kindlegen <https://www.amazon.com/gp/feature.html?docId=1000765211>`__
    executable, available in your ``$PATH`` or specified with
    ``--kindlegen-path``
 
@@ -178,6 +179,8 @@ Usage
       --title TITLE         title string
       --website WEBSITE     website string
       --year YEAR           year string
+      --apply-css APPLY_CSS
+                            apply the given CSS file (epub and mobi output only)
       --bookeen-collation-function BOOKEEN_COLLATION_FUNCTION
                             use the specified collation function
       --bookeen-install-file
@@ -189,29 +192,36 @@ Usage
       --csv-ls CSV_LS       CSV line separator (default: '\n')
       --dictzip-path DICTZIP_PATH
                             path to dictzip executable
-      --epub-escape-strings
-                            escape HTML strings (default: False)
-      --epub-group-prefix-length EPUB_GROUP_PREFIX_LENGTH
-                            group headwords by prefix of given length (default: 3)
-      --epub-merge-group-size EPUB_MERGE_GROUP_SIZE
-                            merge headword groups with less than this number of
-                            headwords (default: 128)
-      --epub-output-definitions
-                            output definitions in addition to the headwords
-                            (default: False)
+      --epub-no-compress    do not create the compressed container (epub output
+                            only, default: False)
+      --escape-strings      escape HTML strings (default: False)
       --flatten-synonyms    flatten synonyms, creating a new entry with
                             headword=synonym and using the definition of the
                             original headword (default: False)
+      --group-by-prefix-function GROUP_BY_PREFIX_FUNCTION
+                            compute the prefix of headwords using the given prefix
+                            function file
+      --group-by-prefix-length GROUP_BY_PREFIX_LENGTH
+                            group headwords by prefix of given length (default: 2)
+      --group-by-prefix-merge-across-first
+                            merge headword groups even when the first character
+                            changes (default: False)
+      --group-by-prefix-merge-min-size GROUP_BY_PREFIX_MERGE_MIN_SIZE
+                            merge headword groups until the given minimum number
+                            of headwords is reached (default: 0, meaning no merge
+                            will take place)
+      --ignore-case         ignore headword case, all headwords will be lowercased
+                            (default: False)
+      --ignore-synonyms     ignore synonyms, not reading/writing them if present
+                            (default: False)
+      --include-index-page  include an index page (epub and mobi output only,
+                            default: False)
       --input-file-encoding INPUT_FILE_ENCODING
                             use the specified encoding for reading the raw
                             contents of input file(s) (default: 'utf-8')
       --input-parser INPUT_PARSER
                             use the specified parser function after reading the
                             raw contents of input file(s)
-      --ignore-case         ignore headword case, all headwords will be lowercased
-                            (default: False)
-      --ignore-synonyms     ignore synonyms, not reading/writing them if present
-                            (default: False)
       --kindlegen-path KINDLEGEN_PATH
                             path to kindlegen executable
       --marisa-bin-path MARISA_BIN_PATH
@@ -225,6 +235,8 @@ Usage
                             | ')
       --mobi-no-kindlegen   do not run kindlegen, keep .opf and .html files
                             (default: False)
+      --no-definitions      do not output definitions for EPUB and MOBI formats
+                            (default: False)
       --sd-ignore-sametypesequence
                             ignore the value of sametypesequence in StarDict .ifo
                             files (default: False)
@@ -277,7 +289,6 @@ Usage
 
       $ penelope -i dict.xml -j xml -f en -t it -p mobi -o output.epub --epub-output-definitions
         As above, but also output definitions
-      
 
 You can find ISO 639-1 language codes
 `here <http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`__.
@@ -384,6 +395,8 @@ Limitations and Missing Features
 -  Reading PRC/MOBI (Kindle) dictionaries is not supported
 -  There are some limitations on StarDict files that can be read (see
    comments in ``format_stardict.py``)
+-  Documentation is not complete
+-  Unit tests are missing
 
 Acknowledgments
 ---------------

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.1
+3.1.0
diff --git a/bin/penelope b/bin/penelope
@@ -14,7 +14,7 @@ from penelope import main as package_main
 __author__ = "Alberto Pettarin"
 __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)"
 __license__ = "MIT"
-__version__ = "3.0.1"
+__version__ = "3.1.0"
 __email__ = "[email protected]"
 __status__ = "Production"
 

diff --git a/penelope/__init__.py b/penelope/__init__.py
@@ -32,7 +32,7 @@
 __author__ = "Alberto Pettarin"
 __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)"
 __license__ = "MIT"
-__version__ = "3.0.1"
+__version__ = "3.1.0"
 __email__ = "[email protected]"
 __status__ = "Production"
 

diff --git a/penelope/__main__.py b/penelope/__main__.py
@@ -31,7 +31,7 @@
 __author__ = "Alberto Pettarin"
 __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)"
 __license__ = "MIT"
-__version__ = "3.0.1"
+__version__ = "3.1.0"
 __email__ = "[email protected]"
 __status__ = "Production"
 

diff --git a/penelope/collation_default.py b/penelope/collation_default.py
@@ -2,19 +2,21 @@
 # -*- coding: utf-8 -*-
 
 """
-This is the default collation function (IcuNoCase) for bookeen output format.
+This is the default collation function (IcuNoCase).
 """
 
 __author__ = "Alberto Pettarin"
 __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)"
 __license__ = "MIT"
-__version__ = "3.0.1"
+__version__ = "3.1.0"
 __email__ = "[email protected]"
 __status__ = "Production"
 
 def collate_function(string1, string2):
     """
-    Implement IcuNoCase collation.
+    Implement default IcuNoCase collation,
+    by simply lowercasing the UTF-8 encoded versions
+    of the two strings.
 
     :param string1: first string
     :type  string1: unicode

diff --git a/penelope/collation_german.py b/penelope/collation_german.py
@@ -2,19 +2,27 @@
 # -*- coding: utf-8 -*-
 
 """
-This is a sample collation function (IcuNoCase) for German.
+This is a collation function (IcuNoCase) for German.
 """
 
 __author__ = "Alberto Pettarin"
 __copyright__ = "Copyright 2012-2015, Alberto Pettarin (www.albertopettarin.it)"
 __license__ = "MIT"
-__version__ = "3.0.1"
+__version__ = "3.1.0"
 __email__ = "[email protected]"
 __status__ = "Production"
 
+REPLACEMENTS = [
+    [u"ä", u"a"],
+    [u"ö", u"o"],
+    [u"ü", u"u"],
+    [u"ß", u"ss"]
+]
+
 def collate_function(string1, string2):
     """
     Implement IcuNoCase collation for German.
+    (The original source of this procedure is unknown.)
 
     :param string1: first string
     :type  string1: unicode
@@ -26,10 +34,9 @@ def collate_function(string1, string2):
     b2 = string2.lower()
     c1 = b1
     c2 = b2
-    for f in [[u"ä", u"a"], [u"ö", u"o"], [u"ü", u"u"], [u"ß", u"ss"]]:
-        b1 = b1.replace(f[0], f[1])
-        b2 = b2.replace(f[0], f[1])
-
+    for repl in REPLACEMENTS:
+        b1 = b1.replace(repl[0], repl[1])
+        b2 = b2.replace(repl[0], repl[1])
     if b1.encode("utf-16") == b2.encode("utf-16"):
         if c1.encode("utf-16") == c2.encode("utf-16"):
             return 0