From d3662181e139243c8c709dffbe48124bb100b7fa Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Wed, 24 Jul 2024 10:14:06 +0200 Subject: [PATCH] fix: less transformations on comment lines and others (#523) Various fixes: - avoid changing blank and comments lines - fix space formatting in parents in unparse - fix properties order in unparse --- .../parser/parser.py | 2 +- .../parser/taxonomy_parser.py | 154 ++++++++---------- .../openfoodfacts_taxonomy_parser/unparser.py | 4 +- parser/tests/data/test.txt | 16 +- .../tests/data/test_comment_below_parent.txt | 9 + .../test_parse_unparse_integration.py | 16 +- .../integration/test_parser_integration.py | 17 +- parser/tests/unit/test_unparser_unit.py | 48 ++++++ 8 files changed, 158 insertions(+), 108 deletions(-) create mode 100644 parser/tests/data/test_comment_below_parent.txt create mode 100644 parser/tests/unit/test_unparser_unit.py diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index 6050fe08..633ff92a 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -143,9 +143,9 @@ def _create_child_links(self, child_links: list[ChildLink], project_label: str): MATCH (p:{project_label}) USING INDEX p:{project_label}(id) WHERE p.id = child_link.parent_id MATCH (c:{project_label}) USING INDEX c:{project_label}(id) + WHERE c.id = child_link.id """ + """ - WHERE c.id = child_link.id CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p) WITH relations UNWIND relations AS relation diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py index 53ad67f1..0cbad078 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py @@ -1,5 +1,4 @@ import collections -import copy import logging import re import sys @@ -31,11 +30,6 @@ class NodeData: properties: dict[str, str] = field(default_factory=dict) tags: dict[str, list[str]] = field(default_factory=dict) comments: dict[str, list[str]] = field(default_factory=dict) - # comments_stack is a list of tuples (line_number, comment), - # to keep track of comments just above the current line - # during parsing of an entry, to be able to add them - # to the right property or tag when possible - comments_stack: list[tuple[int, str]] = field(default_factory=list) is_external: bool = False # True if the node comes from another taxonomy original_taxonomy: str | None = None # the name of the taxonomy the node comes from @@ -62,6 +56,10 @@ def get_node_type(self): else: return NodeType.ENTRY + @property + def is_started(self): + return self.id or self.parent_tags or self.tags or self.properties + class PreviousLink(TypedDict): before_id: str @@ -96,16 +94,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]] for line_number, line in enumerate(file): if line_number < start: continue - # sanitizing - # remove any space characters at end of line - line = line.rstrip() - # replace commas between digits - # and that have no space around by a lower comma character - # and do the same for escaped comma (preceded by a \) - # (to distinguish them from commas acting as tags separators) - line = re.sub(r"(\d),(\d)", r"\1‚\2", line) - line = re.sub(r"\\,", "\\‚", line) - yield line_number, line + yield line_number, line.rstrip("\n") line_count += 1 yield line_count, "" # to end the last entry if not ended @@ -114,13 +103,30 @@ def _normalize_entry_id(self, raw_id: str) -> str: Get a normalized string but keeping the language code "lc:", used for id and parent tag """ + raw_id = raw_id.strip() lc, main_tag = raw_id.split(":", 1) normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords) normalized_id = f"{lc}:{normalized_main_tag}" return normalized_id + def prepare_line(self, line: str) -> str: + """prepare line for parsing + + This is different from `normalize_text` which is to compute ids + """ + line = line.strip() + # sanitizing + # remove any space or commas characters at end of line + line = re.sub(r"[\s,]+$", "", line) + # replace commas between digits and that have no space around by a lower comma character + # and do the same for escaped comma (preceded by a \) + # (to distinguish them from commas acting as tags separators) + line = re.sub(r"(\d),(\d)", r"\1‚\2", line) + line = re.sub(r"\\,", "\\‚", line) + return line + def undo_normalize_text(self, text: str) -> str: - """Undo some normalizations made in `_file_iter`""" + """Undo some normalizations made in `prepare_line`""" # restore commas from lower comma characters text = re.sub(r"(\d)‚(\d)", r"\1,\2", text) text = re.sub(r"\\‚", "\\,", text) @@ -149,7 +155,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: h = 0 header: list[str] = [] for _, line in self._file_iter(filename): - if not (line) or line[0] == "#": + if not (line.strip()) or line[0] == "#": header.append(line) else: break @@ -158,7 +164,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: # we don't want to eat the comments of the next block # and it removes the last separating line for i in range(len(header)): - if header.pop(): + if header.pop().strip(): h -= 1 else: break @@ -170,7 +176,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool: if data.id.startswith("stopwords") or data.id.startswith("synonyms"): # stopwords and synonyms are one-liners; if the id is set, it's the end return True - if not line and data.id: + if not line.strip() and data.id: # entries are separated by a blank line return True return False @@ -182,7 +188,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: """ is_before = data.is_before # first, check if there is at least one preceding line - if data.preceding_lines and not data.preceding_lines[0]: + if data.preceding_lines and not data.preceding_lines[0].strip(): if data.id.startswith("synonyms"): # it's a synonyms block, # if the previous block is a stopwords block, @@ -202,42 +208,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: data.preceding_lines.pop(0) return data - def _get_node_data_with_comments_above_key( - self, data: NodeData, line_number: int, key: str - ) -> NodeData: + def _add_comments(self, data: NodeData, comments: list[str], key: str) -> NodeData: """Returns the updated node data with comments above the given key stored in the {key}_comments property.""" - new_data = copy.deepcopy(data) - - # Get comments just above the given line - comments_above = [] - current_line = line_number - 1 - while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line: - comments_above.append(new_data.comments_stack.pop()[1]) - current_line -= 1 - if comments_above: - new_data.comments[key + "_comments"] = comments_above[::-1] - - return new_data - - def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData: - """Returns the updated node data with parent and end comments""" - new_data = copy.deepcopy(data) - - # Get parent comments (part of an entry block and just above/between the parents lines) - parent_comments = [] - while new_data.preceding_lines and new_data.preceding_lines[-1] != "": - parent_comments.append(new_data.preceding_lines.pop()) - if parent_comments: - new_data.comments["parent_comments"] = parent_comments[::-1] - - # Get end comments (part of an entry block after the last tag/prop - # and before the separating blank line) - end_comments = [comment[1] for comment in new_data.comments_stack] - if end_comments: - new_data.comments["end_comments"] = end_comments - - return new_data + if comments: + data.comments.setdefault(f"{key}_comments", []).extend(comments) + # reset the comments list + comments[:] = [] _language_code_prefix = re.compile( r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:" @@ -247,17 +224,35 @@ def is_entry_synonyms_line(self, line): matching_prefix = self._language_code_prefix.match(line) if matching_prefix: # verify it's not a property, that is a name followed by a colon and a language - # we need no-qa this because of black vs flake8 opinion return not ( self._language_code_prefix.match(line[matching_prefix.end() :]) # noqa: E203 ) return False + def finalize_data(self, data, comments, saved_nodes): + data = self._remove_separating_line(data) + if data.get_node_type() == NodeType.ENTRY: + self._add_comments(data, comments, "end") + if data.id in saved_nodes: + # this duplicate node will be merged with the first one + data.is_before = None + msg = ( + f"WARNING: Entry with same id {data.id} already exists, " + f"duplicate id in file at line {data.src_position}. " + "The two nodes will be merged, keeping the last " + "values in case of conflicts." + ) + self.parser_logger.error(msg) + else: + saved_nodes.append(data.id) + return data + def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 index_synonyms = 0 + comments = [] # Check if it is correctly written correctly_written = re.compile(r"\w+\Z") @@ -268,39 +263,27 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N line_number = ( entries_start_line # if the iterator is empty, line_number will not be unbound ) - for line_number, line in self._file_iter(filename, entries_start_line): + for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended - if self._entry_end(line, data): - data = self._remove_separating_line(data) - if data.get_node_type() == NodeType.ENTRY: - data = self._get_node_data_with_parent_and_end_comments(data) - if data.id in saved_nodes: - # this duplicate node will be merged with the first one - data.is_before = None - msg = ( - f"WARNING: Entry with same id {data.id} already exists, " - f"duplicate id in file at line {data.src_position}. " - "The two nodes will be merged, keeping the last " - "values in case of conflicts." - ) - self.parser_logger.error(msg) - else: - saved_nodes.append(data.id) - is_before = data.id - yield data # another function will use this dictionary to create a node + if self._entry_end(raw_line, data): + is_before = data.is_before + # another function will use data to create a node + yield self.finalize_data(data, comments, saved_nodes) + # if data was a duplicate (is_before is None) reuse same is_before + is_before = data.id if data.is_before else is_before data = NodeData(is_before=is_before) # harvest the line - if not (line) or line[0] == "#": + if not (raw_line.strip()) or raw_line[0] == "#": # comment or blank line - if data.id: + if data.is_started: # we are within the node definition - data.comments_stack.append((line_number, line)) + comments.append(raw_line) else: # we are before the actual node - data.preceding_lines.append(line) + data.preceding_lines.append(raw_line) else: - line = line.rstrip(",") + line = self.prepare_line(raw_line) if not data.src_position: data.src_position = line_number + 1 if line.startswith("stopwords"): @@ -342,6 +325,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N elif line[0] == "<": # parent definition data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + self._add_comments(data, comments, "parent") elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -363,9 +347,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list.append(word_normalized) data.tags["tags_" + lang] = tags_list data.tags["tags_ids_" + lang] = tagsids_list - data = self._get_node_data_with_comments_above_key( - data, line_number, "tags_" + lang - ) + self._add_comments(data, comments, "tags_" + lang) else: # property definition property_name = None @@ -384,7 +366,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N correctly_written.match(property_name) and correctly_written.match(lc) ): self.parser_logger.error( - f"Reading error at line {line_number + 1}," + f"Reading error at line {line_number + 1}, " f"unexpected format: '{self.parser_logger.ellipsis(line)}'" ) if property_name: @@ -392,9 +374,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.properties[prop_key] = self.undo_normalize_text( property_value.strip() ) - data = self._get_node_data_with_comments_above_key( - data, line_number, prop_key - ) + self._add_comments(data, comments, prop_key) data.id = "__footer__" data.preceding_lines.pop(0) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 1f4cd510..142d7113 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -59,7 +59,7 @@ def get_tags_line(self, node, lc): @staticmethod def property_sort_key(property): - name, lang_code, *_ = property.split("_", 2) + name, lang_code = property.rsplit("_", 1) # give priority to xx and en language codes priority = {"en": 1, "xx": 0} return (name, priority.get(lang_code, 100), lang_code) @@ -88,7 +88,7 @@ def get_parents_lines(self, parents): parent = dict(parent) lc = parent["main_language"] parent_id = parent["tags_" + lc][0] - yield "<" + lc + ": " + parent_id + yield "< " + lc + ":" + parent_id def iter_lines(self, project_label): previous_block_id = "" diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index 92f38f86..c501a4b5 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit synonyms:fr: fruit de la passion, maracuja, passion -