From fad0414c17d4bc32ebc98f50996f9b5cf71e558a Mon Sep 17 00:00:00 2001 From: Morgante Pell Date: Tue, 16 Jul 2024 03:31:01 -0700 Subject: [PATCH 1/4] fix: try using replacement approach --- crates/core/src/test.rs | 130 ++++++++++++++++++++++++++++++++++ crates/language/src/python.rs | 2 +- 2 files changed, 131 insertions(+), 1 deletion(-) diff --git a/crates/core/src/test.rs b/crates/core/src/test.rs index a76b46d2b..81bb91e8c 100644 --- a/crates/core/src/test.rs +++ b/crates/core/src/test.rs @@ -12014,6 +12014,136 @@ fn trailing_comma_import_from_python_with_alias() { .unwrap(); } +// refer to https://github.com/getgrit/gritql/issues/416 +#[test] +fn trailing_comma_after_argument_removal() { + run_test_expected({ + TestArgExpected { + pattern: r#" + language python + `TaskMetadata($args)` where { + $args <: any { + contains `n_samples=$_` as $ns_kwarg where { + $ns_kwarg <: `n_samples = $ns_val` => . + }, + contains `avg_character_length=$_` as $avg_kwarg where { + $avg_kwarg <: `avg_character_length = $avg_val` => `stats=GeneralDescriptiveStats(n_samples=$ns_val, avg_character_length=$avg_val)` + }, + }, + } + "#.to_owned(), + source: r#" + from pydantic import BaseModel + + + class TaskMetadata(BaseModel): + n_samples: dict[str, int] + avg_character_length: dict[str, float] + + + if __name__ == "__main__": + TaskMetadata( + name="TbilisiCityHallBitextMining", + dataset={ + "path": "jupyterjazz/tbilisi-city-hall-titles", + "revision": "798bb599140565cca2dab8473035fa167e5ee602", + }, + description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", + type="BitextMining", + category="s2s", + eval_splits=[_EVAL_SPLIT], + eval_langs=_EVAL_LANGS, + main_score="f1", + domains=["News"], + text_creation="created", + n_samples={_EVAL_SPLIT: 1820}, + reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles", + date=("2024-05-02", "2024-05-03"), + form=["written"], + task_subtypes=[], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + bibtex_citation="", + avg_character_length={_EVAL_SPLIT: 78}, + ) + "# + .to_owned(), + expected: r#" + from pydantic import BaseModel + + + class TaskMetadata(BaseModel): + n_samples: dict[str, int] + avg_character_length: dict[str, float] + + + if __name__ == "__main__": + TaskMetadata( + name="TbilisiCityHallBitextMining", + dataset={ + "path": "jupyterjazz/tbilisi-city-hall-titles", + "revision": "798bb599140565cca2dab8473035fa167e5ee602", + }, + description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", + type="BitextMining", + category="s2s", + eval_splits=[_EVAL_SPLIT], + eval_langs=_EVAL_LANGS, + main_score="f1", + domains=["News"], + text_creation="created", + + reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles", + date=("2024-05-02", "2024-05-03"), + form=["written"], + task_subtypes=[], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + bibtex_citation="", + stats=GeneralDescriptiveStats(n_samples={_EVAL_SPLIT: 1820}, avg_character_length={_EVAL_SPLIT: 78}), + ) + "# + .to_owned(), + } + }) + .unwrap(); +} + +/// Same as above test, but ensures the behavior doesn't depend on line breaks +#[test] +fn trailing_comma_after_argument_removal_one_line() { + run_test_expected({ + TestArgExpected { + pattern: r#" + language python + `TaskMetadata($args)` where { + $args <: any { + contains `n_samples=$_` as $ns_kwarg where { + $ns_kwarg <: `n_samples = $ns_val` => . + }, + contains `avg_character_length=$_` as $avg_kwarg where { + $avg_kwarg <: `avg_character_length = $avg_val` => `stats=GeneralDescriptiveStats(n_samples=$ns_val, avg_character_length=$avg_val)` + }, + }, + } + "#.to_owned(), + source: r#" + TaskMetadata(description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", main_score="f1", domains=["News"], text_creation="created", n_samples={_EVAL_SPLIT: 1820}, reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles") + "# + .to_owned(), + expected: r#" + TaskMetadata(description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", main_score="f1", domains=["News"], text_creation="created", reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles") + "# + .to_owned(), + } + }) + .unwrap(); +} + #[test] fn python_orphaned_from_imports() { run_test_expected({ diff --git a/crates/language/src/python.rs b/crates/language/src/python.rs index ff710feb4..9bab7cad8 100644 --- a/crates/language/src/python.rs +++ b/crates/language/src/python.rs @@ -79,7 +79,7 @@ impl Language for Python { fn check_replacements(&self, n: NodeWithSource<'_>, replacements: &mut Vec) { if n.node.is_error() { - if n.text().is_ok_and(|t| t == "->") { + if n.text().is_ok_and(|t| t == "->") || n.text().is_ok_and(|t| t == ",") { replacements.push(Replacement::new(n.range(), "")); } return; From ca46057bc146a7f0914fe7657d2428482c940272 Mon Sep 17 00:00:00 2001 From: Morgante Pell Date: Tue, 16 Jul 2024 03:35:47 -0700 Subject: [PATCH 2/4] simpify test --- crates/core/src/test.rs | 102 +++++++--------------------------------- 1 file changed, 16 insertions(+), 86 deletions(-) diff --git a/crates/core/src/test.rs b/crates/core/src/test.rs index 81bb91e8c..a85690c51 100644 --- a/crates/core/src/test.rs +++ b/crates/core/src/test.rs @@ -12022,90 +12022,27 @@ fn trailing_comma_after_argument_removal() { pattern: r#" language python `TaskMetadata($args)` where { - $args <: any { - contains `n_samples=$_` as $ns_kwarg where { - $ns_kwarg <: `n_samples = $ns_val` => . - }, - contains `avg_character_length=$_` as $avg_kwarg where { - $avg_kwarg <: `avg_character_length = $avg_val` => `stats=GeneralDescriptiveStats(n_samples=$ns_val, avg_character_length=$avg_val)` - }, - }, + $args <: contains `n_samples=$_` => . } "#.to_owned(), source: r#" - from pydantic import BaseModel - - - class TaskMetadata(BaseModel): - n_samples: dict[str, int] - avg_character_length: dict[str, float] - - - if __name__ == "__main__": - TaskMetadata( - name="TbilisiCityHallBitextMining", - dataset={ - "path": "jupyterjazz/tbilisi-city-hall-titles", - "revision": "798bb599140565cca2dab8473035fa167e5ee602", - }, - description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", - type="BitextMining", - category="s2s", - eval_splits=[_EVAL_SPLIT], - eval_langs=_EVAL_LANGS, - main_score="f1", - domains=["News"], - text_creation="created", - n_samples={_EVAL_SPLIT: 1820}, - reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles", - date=("2024-05-02", "2024-05-03"), - form=["written"], - task_subtypes=[], - license="Not specified", - socioeconomic_status="mixed", - annotations_creators="derived", - dialect=[], - bibtex_citation="", - avg_character_length={_EVAL_SPLIT: 78}, - ) + TaskMetadata( + description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", + main_score="f1", domains=["News"], + text_creation="created", + n_samples={_EVAL_SPLIT: 1820}, + reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles" + ) "# .to_owned(), expected: r#" - from pydantic import BaseModel - - - class TaskMetadata(BaseModel): - n_samples: dict[str, int] - avg_character_length: dict[str, float] - - - if __name__ == "__main__": - TaskMetadata( - name="TbilisiCityHallBitextMining", - dataset={ - "path": "jupyterjazz/tbilisi-city-hall-titles", - "revision": "798bb599140565cca2dab8473035fa167e5ee602", - }, - description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", - type="BitextMining", - category="s2s", - eval_splits=[_EVAL_SPLIT], - eval_langs=_EVAL_LANGS, - main_score="f1", - domains=["News"], - text_creation="created", - - reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles", - date=("2024-05-02", "2024-05-03"), - form=["written"], - task_subtypes=[], - license="Not specified", - socioeconomic_status="mixed", - annotations_creators="derived", - dialect=[], - bibtex_citation="", - stats=GeneralDescriptiveStats(n_samples={_EVAL_SPLIT: 1820}, avg_character_length={_EVAL_SPLIT: 78}), - ) + TaskMetadata( + description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", + main_score="f1", domains=["News"], + text_creation="created", + + reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles" + ) "# .to_owned(), } @@ -12121,14 +12058,7 @@ fn trailing_comma_after_argument_removal_one_line() { pattern: r#" language python `TaskMetadata($args)` where { - $args <: any { - contains `n_samples=$_` as $ns_kwarg where { - $ns_kwarg <: `n_samples = $ns_val` => . - }, - contains `avg_character_length=$_` as $avg_kwarg where { - $avg_kwarg <: `avg_character_length = $avg_val` => `stats=GeneralDescriptiveStats(n_samples=$ns_val, avg_character_length=$avg_val)` - }, - }, + $args <: contains `n_samples=$_` => . } "#.to_owned(), source: r#" From bf1a30f9042eb17c88665dd0cda751ffee2d48f5 Mon Sep 17 00:00:00 2001 From: Alex Ley Date: Sat, 26 Oct 2024 23:28:40 +0100 Subject: [PATCH 3/4] fix(python): in `check_replacements()` handle both single line and multi line hanging comma after removing kwargs fixes #416 --- crates/core/src/test.rs | 2 +- crates/language/src/python.rs | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/core/src/test.rs b/crates/core/src/test.rs index 2ce94d8f7..640fbef9c 100644 --- a/crates/core/src/test.rs +++ b/crates/core/src/test.rs @@ -12324,7 +12324,7 @@ fn trailing_comma_after_argument_removal_one_line() { "# .to_owned(), expected: r#" - TaskMetadata(description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", main_score="f1", domains=["News"], text_creation="created", reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles") + TaskMetadata(description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).", main_score="f1", domains=["News"], text_creation="created", reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles") "# .to_owned(), } diff --git a/crates/language/src/python.rs b/crates/language/src/python.rs index b2af55f57..3faf4b36e 100644 --- a/crates/language/src/python.rs +++ b/crates/language/src/python.rs @@ -79,11 +79,22 @@ impl Language for Python { fn check_replacements(&self, n: NodeWithSource<'_>, replacements: &mut Vec) { if n.node.is_error() { + // https://github.com/getgrit/gritql/issues/416 single line example if n.text().is_ok_and(|t| t == "->") || n.text().is_ok_and(|t| t == ",") { replacements.push(Replacement::new(n.range(), "")); } return; } + if n.node.kind() == "," { + // https://github.com/getgrit/gritql/issues/416 multi line example + if let Some(prev) = n.node.prev_sibling() { + let empty = prev.range().end_byte() == prev.range().start_byte(); + if prev.kind() == "identifier" && empty { + replacements.push(Replacement::new(n.range(), "")); + return; + } + } + } if n.node.kind() == "import_from_statement" { if let Some(name_field) = n.node.child_by_field_name("name") { let names_text = name_field From dcfcf4d0cbdc1789e0abe91790991cdd830f4f18 Mon Sep 17 00:00:00 2001 From: Alex Ley Date: Sat, 26 Oct 2024 23:40:59 +0100 Subject: [PATCH 4/4] fix(python): simplify `check_replacements()` to only check if the prev node is empty fixes #416 --- crates/language/src/python.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/language/src/python.rs b/crates/language/src/python.rs index 3faf4b36e..5e0171f8a 100644 --- a/crates/language/src/python.rs +++ b/crates/language/src/python.rs @@ -89,7 +89,7 @@ impl Language for Python { // https://github.com/getgrit/gritql/issues/416 multi line example if let Some(prev) = n.node.prev_sibling() { let empty = prev.range().end_byte() == prev.range().start_byte(); - if prev.kind() == "identifier" && empty { + if empty { replacements.push(Replacement::new(n.range(), "")); return; }