Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize column names to follow defined standards #313

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions osm_fieldwork/update_xlsform.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,36 @@
SURVEY_GROUP_NAME = "survey_questions"


def standardize_xlsform_sheets(xlsform: dict) -> dict:
    """Standardize the column headers of every sheet in an XLSForm.

    - Strips surrounding whitespace and lowercases all column headers.
    - Removes spaces inside every '::' qualifier, so 'label::English (en)'
      becomes 'label::english(en)' and 'media::image::English (en)' becomes
      'media::image::english(en)'.
    - Renames a bare 'label' column (in any casing) to 'label::english(en)'.

    The sheets are modified in place and the same dictionary is returned.

    Args:
        xlsform (dict): Mapping of sheet name (e.g. 'survey', 'choices') to a
            DataFrame (any object exposing a ``columns`` attribute works).

    Returns:
        dict: The same XLSForm dictionary with standardized column headers.
    """

    def clean_column_name(col_name):
        # Non-string headers (e.g. numeric ones) cannot be normalised as text;
        # leave them untouched rather than raising a TypeError.
        if not isinstance(col_name, str):
            return col_name

        base, *qualifiers = col_name.split("::")
        base = base.strip().lower()

        if not qualifiers:
            # A bare 'label' column is treated as the English label.
            return "label::english(en)" if base == "label" else base

        # Keep *every* qualifier: headers such as 'media::image::english(en)'
        # have more than one '::' and must not be truncated.
        cleaned = [part.replace(" ", "").lower() for part in qualifiers]
        return "::".join([base, *cleaned])

    # Apply cleaning to each sheet
    for sheet_df in xlsform.values():
        sheet_df.columns = [clean_column_name(col) for col in sheet_df.columns]

    return xlsform


def merge_dataframes(mandatory_df: pd.DataFrame, user_question_df: pd.DataFrame, digitisation_df: pd.DataFrame):
"""Merge multiple Pandas dataframes together, removing duplicate fields."""
# Remove empty rows from dataframes
Expand Down Expand Up @@ -92,8 +122,8 @@ def handle_translations(

if field in user_question_df.columns and not translation_columns:
# If user_question_df has only the base field (e.g., 'label'), map English translation from mandatory and digitisation
mandatory_df[field] = mandatory_df.get(f"{field}::English(en)", mandatory_df.get(field))
digitisation_df[field] = digitisation_df.get(f"{field}::English(en)", digitisation_df.get(field))
mandatory_df[field] = mandatory_df.get(f"{field}::english(en)", mandatory_df.get(field))
digitisation_df[field] = digitisation_df.get(f"{field}::english(en)", digitisation_df.get(field))

# Then drop translation columns
mandatory_df = mandatory_df.loc[:, ~mandatory_df.columns.str.startswith("label::")]
Expand Down Expand Up @@ -133,10 +163,10 @@ def create_survey_group(name: str) -> dict[str, pd.DataFrame]:
{
"type": ["begin group"],
"name": [name],
"label::English(en)": [name],
"label::Swahili(sw)": [name],
"label::French(fr)": [name],
"label::Spanish(es)": [name],
"label::english(en)": [name],
Copy link
Member

@spwoodcock spwoodcock Nov 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice addition!

@Sujanadh
Just checking that forms definitely work with the lowercase label? Did you manage to test anywhere? 🙏

(I don't see why not, as it should match the (en) key, but need to be sure!)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it does. I tested it.

"label::swahili(sw)": [name],
"label::french(fr)": [name],
"label::spanish(es)": [name],
"relevant": "(${new_feature} != '') or (${building_exists} = 'yes')",
}
)
Expand All @@ -158,20 +188,16 @@ def append_select_one_from_file_row(df: pd.DataFrame, entity_name: str) -> pd.Da

# Find the row index after 'feature' row
row_index_to_split_on = select_one_from_file_index[0] + 1
# Strip the 's' from the end for singular form
if entity_name.endswith("s"):
# Plural to singular
entity_name = entity_name[:-1]

additional_row = pd.DataFrame(
{
"type": [f"select_one_from_file {entity_name}.csv"],
"name": [entity_name],
"label::English(en)": [entity_name],
"label::english(en)": [entity_name],
"appearance": ["map"],
"label::Swahili(sw)": [entity_name],
"label::French(fr)": [entity_name],
"label::Spanish(es)": [entity_name],
"label::swahili(sw)": [entity_name],
"label::french(fr)": [entity_name],
"label::spanish(es)": [entity_name],
}
)

Expand Down Expand Up @@ -205,12 +231,14 @@ async def append_mandatory_fields(
custom_sheets = pd.read_excel(custom_form, sheet_name=None, engine="calamine")
mandatory_sheets = pd.read_excel(f"{xlsforms_path}/common/mandatory_fields.xls", sheet_name=None, engine="calamine")
digitisation_sheets = pd.read_excel(f"{xlsforms_path}/common/digitisation_fields.xls", sheet_name=None, engine="calamine")
custom_sheets = standardize_xlsform_sheets(custom_sheets)

# Merge 'survey' and 'choices' sheets
if "survey" not in custom_sheets:
msg = "Survey sheet is required in XLSForm!"
log.error(msg)
raise ValueError(msg)

log.debug("Merging survey sheet XLSForm data")
custom_sheets["survey"] = merge_dataframes(
mandatory_sheets.get("survey"), custom_sheets.get("survey"), digitisation_sheets.get("survey")
Expand All @@ -223,10 +251,10 @@ async def append_mandatory_fields(
if not form_category_row.empty:
custom_sheets["survey"].loc[custom_sheets["survey"]["name"] == "form_category", "calculation"] = f"once('{form_category}')"

if "choices" not in custom_sheets:
msg = "Choices sheet is required in XLSForm!"
log.error(msg)
raise ValueError(msg)
# Ensure the 'choices' sheet exists in custom_sheets
if "choices" not in custom_sheets or custom_sheets["choices"] is None:
custom_sheets["choices"] = pd.DataFrame(columns=["list_name", "name", "label::english(en)"])

log.debug("Merging choices sheet XLSForm data")
custom_sheets["choices"] = merge_dataframes(
mandatory_sheets.get("choices"), custom_sheets.get("choices"), digitisation_sheets.get("choices")
Expand Down
Binary file modified osm_fieldwork/xlsforms/common/digitisation_fields.xls
Binary file not shown.
Binary file modified osm_fieldwork/xlsforms/common/mandatory_fields.xls
Binary file not shown.
Loading