kobotoolbox · rgraber · Jan 30, 2025 · Feb 10, 2025 · Feb 11, 2025 · Feb 12, 2025
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -7,6 +7,7 @@
 # Install testing / development requirements
 coverage[toml]==6.5.0
 coveralls==3.3.1
+ddt==1.7.2
 flake8==7.1.1
 funcsigs==1.0.2
 geojson-rewind==1.1.0

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 
 setup(
     name='formpack',
-    version='3.0.0',
+    version='3.0.1',
     description='Manipulation tools for KoBo forms',
     author='the formpack contributors (https://github.com/kobotoolbox/formpack/graphs/contributors)',
     url='https://github.com/kobotoolbox/formpack/',

diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py
@@ -94,7 +94,84 @@ def _get_translations_from_special_cols(
     return translations, set(translated_cols)
 
 
+def clean_column_name(column_name: str, already_seen: Dict[str, str]) -> str:
+    """
+    Return `column_name` with its keyword part lower-cased.
+
+    Only the keyword is normalised; a bind/body attribute or a language name
+    following it keeps its case. Preserves ":" vs "::" and any spaces around
+    the colons.
+
+    `already_seen` caches raw name -> cleaned name. It is consulted first and
+    updated in place whenever a name is cleaned for the first time.
+    """
+    if column_name not in already_seen:
+        already_seen[column_name] = _lowercase_column_keyword(column_name)
+    return already_seen[column_name]
+
+
+def _lowercase_column_keyword(column_name: str) -> str:
+    """
+    Compute the cleaned form of a single raw column name (no caching).
+
+    Results are assembled by slicing `column_name` rather than through
+    `re.sub()` templates: the name is user text, and a backslash in it would
+    otherwise be interpreted as a replacement-template escape.
+    """
+    # "LaBeL" -> "label", "HiNT" -> "hint"
+    if column_name.lower() in ('label', 'hint'):
+        return column_name.lower()
+
+    # "Bind:Some:Thing" -> "bind:Some:Thing", "BodY:" -> "body:"
+    match = re.match(r'^(bind|body):', column_name, flags=re.IGNORECASE)
+    if match:
+        return match.group(1).lower() + column_name[match.end(1):]
+
+    # "Media:Audio::ES" -> "media:audio::ES", "ViDeO : ES" -> "video : ES",
+    # "Media: AuDiO" -> "media: audio", "VIDEO" -> "video"
+    media_names = '|'.join(MEDIA_COLUMN_NAMES)
+    match = re.match(
+        rf'^(media\s*::?\s*)?({media_names})(\s*::?\s*[^:]+)?$',
+        column_name,
+        flags=re.IGNORECASE,
+    )
+    if match:
+        # The optional "media" prefix and the media type are keyword text;
+        # the optional language suffix after them is copied through as-is.
+        media_prefix = (match.group(1) or '').lower()
+        media_type = match.group(2).lower()
+        return media_prefix + media_type + column_name[match.end(2):]
+
+    # "Label::x" -> "label::x", "Constraint_Message : x" -> "constraint_message : x"
+    match = re.match(r'^([^:]+)\s*::?\s*[^:]+$', column_name)
+    if match:
+        return match.group(1).lower() + column_name[match.end(1):]
+
+    # Anything else (no separator, or more than one): lower-case all of it
+    return column_name.lower()
+
+
+def preprocess_columns(content: Dict[str, List[Any]]) -> None:
+    """
+    Normalise the case of every column name in `content`, in place.
+
+    Each dict row of each list-valued sheet has its keys passed through
+    `clean_column_name`; values and key order are left untouched. If two
+    keys of a row clean to the same name, the later one wins.
+
+    Values that are not lists, and list items that are not dicts, are
+    skipped: already-expanded content carries entries such as a list of
+    translated column names, which are not rows.
+    """
+    # One cache for the whole document: the same headers repeat on every row
+    seen = {}
+    for rows in content.values():
+        if not isinstance(rows, list):
+            continue
+        for row in rows:
+            if not isinstance(row, dict):
+                continue
+            # Snapshot the keys first; the row is mutated while renaming
+            for column_name in list(row):
+                cleaned_name = clean_column_name(column_name, seen)
+                row[cleaned_name] = row.pop(column_name)
+
 def expand_content_in_place(content: Dict[str, List[Any]]) -> None:
+    preprocess_columns(content)
+
     specials, translations, transl_cols = _get_special_survey_cols(content)
 
     if len(translations) > 0:

diff --git a/tests/test_expand_content.py b/tests/test_expand_content.py
@@ -1,18 +1,21 @@
 # coding: utf-8
 import copy
 from collections import OrderedDict
+from ddt import data, ddt, unpack
+from unittest import TestCase
 
 from formpack import FormPack
 from formpack.constants import OR_OTHER_COLUMN as _OR_OTHER
 from formpack.constants import UNTRANSLATED
-from formpack.utils.expand_content import SCHEMA_VERSION
+from formpack.utils.expand_content import SCHEMA_VERSION, clean_column_name
 from formpack.utils.expand_content import _expand_tags
 from formpack.utils.expand_content import _get_special_survey_cols
 from formpack.utils.expand_content import expand_content, _expand_type_to_dict
 from formpack.utils.flatten_content import flatten_content
 from formpack.utils.string import orderable_with_none
 
 
+
 def test_expand_selects_with_or_other():
     assert _expand_type_to_dict('select_one xx or other').get(_OR_OTHER) == True
     assert _expand_type_to_dict('select_one   xx    or_other').get(_OR_OTHER) == True
@@ -604,5 +607,29 @@ def test_expand_translations_null_lang():
     assert s1 == s1_copy
 
 
+def test_expand_ignores_case():
+    """A mixed-case 'Label' column is handled as the 'label' column."""
+    s1 = {'survey': [{'type': 'text', 'Label': 'hi'}]}
+    expand_content(s1, in_place=True)
+    # Column names are lower-cased before expansion, so the translated
+    # column is reported under its normalised name
+    assert s1.get('translated') == ['label']
+
+
 def _s(rows):
     """Build a survey with one single-column row (value 'x') per given key."""
     return {'survey': [{key: 'x'} for key in rows]}
+
+@ddt
+class ColumnTestCase(TestCase):
+    """Table-driven checks for `clean_column_name`."""
+
+    @data(
+        ('FOO', 'foo'),
+        ('LABEL', 'label'),
+        ('HINT', 'hint'),
+        ('BIND::FOO', 'bind::FOO'),
+        ('BODY : FOO', 'body : FOO'),
+        ('MEDIA:AUDIO:Spanish', 'media:audio:Spanish'),
+        ('Media:Audio::ES', 'media:audio::ES'),
+        ('VIDEO :: SPANISH', 'video :: SPANISH'),
+        ('MEDIA:AUDIO', 'media:audio'),
+        ('IMAGE', 'image'),
+        ('LABEL : SPANISH', 'label : SPANISH'),
+        ('Constraint_Message::x', 'constraint_message::x'),
+    )
+    @unpack
+    def test_clean_column_name(self, name, expected):
+        # clean_column_name requires a cache argument; a fresh one per case
+        # keeps the cases independent of each other
+        assert clean_column_name(name, {}) == expected
+
+    def test_clean_column_name_uses_cache(self):
+        seen = {}
+        assert clean_column_name('LABEL', seen) == 'label'
+        assert seen == {'LABEL': 'label'}
+        # A cached entry is returned as-is instead of being recomputed
+        seen['LABEL'] = 'from-cache'
+        assert clean_column_name('LABEL', seen) == 'from-cache'