How to Sort Unicode Strings Alphabetically in Python?¶
+Lexicographic Sorting¶
+polish_names = ["Zbigniew", "Ludmiła", "Żaneta", "Łukasz"]
+sorted(polish_names)
+
['Ludmiła', 'Zbigniew', 'Łukasz', 'Żaneta']+
Unicode Collation Algorithm¶
+pyuca¶
+import pyuca
+
+collator = pyuca.Collator()
+sorted(polish_names, key=collator.sort_key)
+
['Ludmiła', 'Łukasz', 'Żaneta', 'Zbigniew']+
PyICU¶
+from icu import Collator, Locale
+
+collator = Collator.createInstance(Locale("pl_PL"))
+sorted(polish_names, key=collator.getSortKey)
+
['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']+
Locales¶
+from icu import Locale
+
+list(Locale.getAvailableLocales())[:5]
+
['af', 'af_NA', 'af_ZA', 'agq', 'agq_CM']+
Rule-Based Collator¶
+from icu import RuleBasedCollator
+
+collator = RuleBasedCollator(
+ """
+ &A<ą<<<Ą
+ &C<ć<<<Ć
+ &E<ę<<<Ę
+ &L<ł<<<Ł
+ &N<ń<<<Ń
+ &O<ó<<<Ó
+ &S<ś<<<Ś
+ &Z<ź<<<Ź<ż<<<Ż
+ """
+)
+sorted(polish_names, key=collator.getSortKey)
+
['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']+
Python's locale Module¶
+import locale
+
+locale.setlocale(locale.LC_COLLATE, "pl_PL.UTF-8")
+sorted(polish_names, key=locale.strxfrm)
+
['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']+
Transliteration¶
+pangrams = {
+ "Czech": "Příliš žluťoučký kůň úpěl ďábelské ódy",
+ "Polish": "Pójdźmyż haftnąć z wklęsłości guberń",
+ "Icelandic": "Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa",
+}
+
Python's unicodedata Module¶
+import unicodedata
+
+
def transliterate_v1(text: str) -> str:
    """Reduce *text* to ASCII by decomposing it and discarding the rest.

    The text is normalized to NFD, which splits characters that have a
    canonical decomposition into a base character plus combining marks.
    Encoding to ASCII with ``errors="ignore"`` then drops every non-ASCII
    code point: the combining marks, but also letters that have no
    decomposition at all (such as "ł" or "æ").
    """
    decomposed = unicodedata.normalize("NFD", text)
    ascii_bytes = decomposed.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")
+
for label, pangram in pangrams.items():
+ print(label, pangram, transliterate_v1(pangram), "", sep="\n")
+
Czech +Příliš žluťoučký kůň úpěl ďábelské ódy +Prilis zlutoucky kun upel dabelske ody + +Polish +Pójdźmyż haftnąć z wklęsłości guberń +Pojdzmyz haftnac z wklesosci gubern + +Icelandic +Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa +Kmi ny oxi her, ykist jofum nu bi vil og adrepa + ++
Custom Translation Table¶
+import sys
+import unicodedata
+
+
+def transliterate_v2(text: str, mapping: dict[str, str] = None) -> str:
+ combining_characters = "".join(
+ character
+ for code_point in range(sys.maxunicode)
+ if unicodedata.combining(character := chr(code_point))
+ )
+ if mapping:
+ src, dst = ["".join(x) for x in zip(*mapping.items())]
+ table = str.maketrans(src, dst, combining_characters)
+ else:
+ table = str.maketrans(dict.fromkeys(combining_characters))
+ return unicodedata.normalize("NFD", text).translate(table)
+
mapping = {"Ł": "L", "ł": "l"}
+for label, pangram in pangrams.items():
+ print(label, pangram, transliterate_v2(pangram, mapping), "", sep="\n")
+
Czech +Příliš žluťoučký kůň úpěl ďábelské ódy +Prilis zlutoucky kun upel dabelske ody + +Polish +Pójdźmyż haftnąć z wklęsłości guberń +Pojdzmyz haftnac z wkleslosci gubern + +Icelandic +Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa +Kæmi ny oxi her, ykist þjofum nu bæði vil og adrepa + ++
Unidecode¶
+import unidecode
+
+
def transliterate_v3(text: str) -> str:
    """Transliterate *text* by delegating to ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated
+
for label, pangram in pangrams.items():
+ print(label, pangram, transliterate_v3(pangram), "", sep="\n")
+
Czech +Příliš žluťoučký kůň úpěl ďábelské ódy +Prilis zlutoucky kun upel dabelske ody + +Polish +Pójdźmyż haftnąć z wklęsłości guberń +Pojdzmyz haftnac z wkleslosci gubern + +Icelandic +Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa +Kaemi ny oxi her, ykist thjofum nu baedi vil og adrepa + ++
PyICU Transliterator¶
+import icu
+
+tr = icu.Transliterator.createInstance("Any-ASCII")
+
+for label, pangram in pangrams.items():
+ print(label, pangram, tr.transliterate(pangram), "", sep="\n")
+
Czech +Příliš žluťoučký kůň úpěl ďábelské ódy +Prilis zlutoucky kun upel dabelske ody + +Polish +Pójdźmyż haftnąć z wklęsłości guberń +Pojdzmyz haftnac z wkleslosci gubern + +Icelandic +Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa +Kaemi ny oxi her, ykist thjofum nu baedi vil og adrepa + ++
Case-Insensitive Sorting¶
+Case Folding¶
+import functools
+import unicodedata
+
+
def case_insensitive(text: str) -> str:
    """Build a sort key for *text* that ignores letter case.

    The text is NFD-normalized, case-folded, and NFD-normalized once more,
    because case folding can produce characters that are no longer in
    normalized form.
    """
    decomposed = unicodedata.normalize("NFD", text)
    folded = decomposed.casefold()
    return unicodedata.normalize("NFD", folded)
+
animaux = ["Tortue", "hérissonne", "Éléphant", "poisson", "éléphant"]
+sorted(animaux, key=case_insensitive)
+
['Éléphant', 'éléphant', 'hérissonne', 'poisson', 'Tortue']+
PyICU¶
+from icu import Collator, Locale
+
+collator = Collator.createInstance(Locale("fr_FR"))
+collator.setStrength(Collator.SECONDARY)
+sorted(animaux, key=collator.getSortKey)
+
['Éléphant', 'éléphant', 'hérissonne', 'poisson', 'Tortue']+
pyuca¶
Lowercase letters always come before uppercase letters:
+from pyuca import Collator
+
+sorted(animaux, key=Collator().sort_key)
+
['éléphant', 'Éléphant', 'hérissonne', 'poisson', 'Tortue']+
Natural Sort Order¶
+Regular Expressions¶
+import re
+
+
+def natural_order(text: str) -> tuple[str | int, ...]:
+ return tuple(
+ int(chunk) if chunk.isdigit() else chunk
+ for chunk in re.split(r"(\d+)", text)
+ )
+
sorted([f"log.{i}" for i in range(1, 111)], key=natural_order)[:15]
+
['log.1', + 'log.2', + 'log.3', + 'log.4', + 'log.5', + 'log.6', + 'log.7', + 'log.8', + 'log.9', + 'log.10', + 'log.11', + 'log.12', + 'log.13', + 'log.14', + 'log.15']+
PyICU + natsort¶
+filenames = [
+ "raport_maj-2023.xlsx",
+ "błędy.log.3",
+ "błędy.log.1",
+ "Raport_kwiecień_2023.xlsx",
+ "Błędy.log.101",
+]
+
+import locale
+
+locale.setlocale(locale.LC_ALL, "pl_PL.UTF-8")
+
+from natsort import natsorted, ns
+
+natsorted(filenames, alg=ns.LOCALE | ns.IGNORECASE)
+
['błędy.log.1', + 'błędy.log.3', + 'Błędy.log.101', + 'Raport_kwiecień_2023.xlsx', + 'raport_maj-2023.xlsx']+
Sorting Complex Objects¶
+from typing import NamedTuple
+
+
class Person(NamedTuple):
    """A person's name, comparable as a ``(first_name, last_name)`` tuple."""

    first_name: str
    last_name: str

    def __repr__(self) -> str:
        """Show the person as the first name and last name joined by a space."""
        given, family = self
        return f"{given} {family}"
+
+
+people = [
+ Person("Zbigniew", "Nowak"),
+ Person("Anna", "Wójcik"),
+ Person("Łukasz", "Kowalski"),
+ Person("Żaneta", "Jabłońska"),
+ Person("Anna", "Nowak"),
+ Person("Ludmiła", "Wiśniewska"),
+]
+
+sorted(people)
+
[Anna Nowak, + Anna Wójcik, + Ludmiła Wiśniewska, + Zbigniew Nowak, + Łukasz Kowalski, + Żaneta Jabłońska]+
pyuca¶
+import pyuca
+
+collator = pyuca.Collator()
+
+
def compound_key(person: Person) -> tuple:
    """Return the pair of sort keys for a person's first and last name.

    Both keys come from the ``sort_key`` method of the module-level
    ``collator``; the first name's key is placed first in the tuple.
    """
    first_key = collator.sort_key(person.first_name)
    last_key = collator.sort_key(person.last_name)
    return first_key, last_key
+
+
+sorted(people, key=compound_key)
+
[Anna Nowak, + Anna Wójcik, + Ludmiła Wiśniewska, + Łukasz Kowalski, + Żaneta Jabłońska, + Zbigniew Nowak]+
PyICU¶
+from icu import Collator, Locale
+
+collator = Collator.createInstance(Locale("pl_PL"))
+
+
def compound_key(person: Person) -> tuple:
    """Return the pair of sort keys for a person's first and last name.

    Both keys come from the ``getSortKey`` method of the module-level
    ``collator``; the first name's key is placed first in the tuple.
    """
    first_key = collator.getSortKey(person.first_name)
    last_key = collator.getSortKey(person.last_name)
    return first_key, last_key
+
+
+sorted(people, key=compound_key)
+
[Anna Nowak, + Anna Wójcik, + Ludmiła Wiśniewska, + Łukasz Kowalski, + Zbigniew Nowak, + Żaneta Jabłońska]+