From 8c119abe0bcf70686ed19dd6853e9c3bacadbfe9 Mon Sep 17 00:00:00 2001 From: Joshua Peek Date: Wed, 15 Feb 2023 09:53:50 -0800 Subject: [PATCH] feat(python): Parse JSON data in `Utf8` to polars dtype (#6885) --- .../source/reference/expressions/strings.rst | 1 + .../docs/source/reference/series/strings.rst | 1 + py-polars/polars/internals/expr/string.py | 42 +++++++++++++++++++ py-polars/polars/internals/series/string.py | 33 ++++++++++++++- py-polars/src/lazy/dsl.rs | 23 ++++++++++ .../tests/unit/namespaces/test_string.py | 32 ++++++++++++++ 6 files changed, 131 insertions(+), 1 deletion(-) diff --git a/py-polars/docs/source/reference/expressions/strings.rst b/py-polars/docs/source/reference/expressions/strings.rst index 1c5add8ec657..cdd51d55da27 100644 --- a/py-polars/docs/source/reference/expressions/strings.rst +++ b/py-polars/docs/source/reference/expressions/strings.rst @@ -18,6 +18,7 @@ The following methods are available under the `expr.str` attribute. Expr.str.explode Expr.str.extract Expr.str.extract_all + Expr.str.json_extract Expr.str.json_path_match Expr.str.lengths Expr.str.ljust diff --git a/py-polars/docs/source/reference/series/strings.rst b/py-polars/docs/source/reference/series/strings.rst index 9351fc007429..ef199280c49a 100644 --- a/py-polars/docs/source/reference/series/strings.rst +++ b/py-polars/docs/source/reference/series/strings.rst @@ -18,6 +18,7 @@ The following methods are available under the `Series.str` attribute. 
Series.str.explode Series.str.extract Series.str.extract_all + Series.str.json_extract Series.str.json_path_match Series.str.lengths Series.str.ljust diff --git a/py-polars/polars/internals/expr/string.py b/py-polars/polars/internals/expr/string.py index 4d236c7b08d0..4cb5f82ab4d8 100644 --- a/py-polars/polars/internals/expr/string.py +++ b/py-polars/polars/internals/expr/string.py @@ -7,9 +7,11 @@ DataType, Date, Datetime, + PolarsDataType, PolarsTemporalType, Time, is_polars_dtype, + py_type_to_dtype, ) if TYPE_CHECKING: @@ -643,6 +645,46 @@ def starts_with(self, sub: str | pli.Expr) -> pli.Expr: sub = pli.expr_to_lit_or_expr(sub, str_to_lit=True)._pyexpr return pli.wrap_expr(self._pyexpr.str_starts_with(sub)) + def json_extract(self, dtype: PolarsDataType | None = None) -> pli.Expr: + """ + Parse string values as JSON. + + Throws an error if invalid JSON strings are encountered. + + Parameters + ---------- + dtype + The dtype to cast the extracted value to. If None, the dtype will be + inferred from the JSON value. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"json": ['{"a":1, "b": true}', None, '{"a":2, "b": false}']} + ... ) + >>> dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]) + >>> df.select(pl.col("json").str.json_extract(dtype)) + shape: (3, 1) + ┌─────────────┐ + │ json │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {1,true} │ + │ {null,null} │ + │ {2,false} │ + └─────────────┘ + + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + + """ + if dtype is not None: + dtype = py_type_to_dtype(dtype) + return pli.wrap_expr(self._pyexpr.str_json_extract(dtype)) + def json_path_match(self, json_path: str) -> pli.Expr: """ Extract the first match of json string with provided JSONPath expression.
diff --git a/py-polars/polars/internals/series/string.py b/py-polars/polars/internals/series/string.py index 97ce56579fe6..8cdfd095334e 100644 --- a/py-polars/polars/internals/series/string.py +++ b/py-polars/polars/internals/series/string.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import polars.internals as pli -from polars.datatypes import PolarsTemporalType +from polars.datatypes import PolarsDataType, PolarsTemporalType from polars.internals.series.utils import expr_dispatch if TYPE_CHECKING: @@ -318,6 +318,37 @@ def encode(self, encoding: TransferEncoding) -> pli.Series: """ + def json_extract(self, dtype: PolarsDataType | None = None) -> pli.Series: + """ + Parse string values as JSON. + + Throws an error if invalid JSON strings are encountered. + + Parameters + ---------- + dtype + The dtype to cast the extracted value to. If None, the dtype will be + inferred from the JSON value. + + Examples + -------- + >>> s = pl.Series("json", ['{"a":1, "b": true}', None, '{"a":2, "b": false}']) + >>> s.str.json_extract() + shape: (3,) + Series: 'json' [struct[2]] + [ + {1,true} + {null,null} + {2,false} + ] + + See Also + -------- + json_path_match : Extract the first match of json string with provided JSONPath + expression. + + """ + def json_path_match(self, json_path: str) -> pli.Series: """ Extract the first match of json string with provided JSONPath expression.
diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs index 0f0e02cc9f13..9a7a03ac660b 100644 --- a/py-polars/src/lazy/dsl.rs +++ b/py-polars/src/lazy/dsl.rs @@ -857,6 +857,29 @@ impl PyExpr { .into() } + pub fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> PyExpr { + let dtype = dtype.map(|wrap| wrap.0); + + let output_type = match dtype.clone() { + Some(dtype) => GetOutput::from_type(dtype), + None => GetOutput::from_type(DataType::Unknown), + }; + + let function = move |s: Series| { + let ca = s.utf8()?; + match ca.json_extract(dtype.clone()) { + Ok(ca) => Ok(Some(ca.into_series())), + Err(e) => Err(PolarsError::ComputeError(format!("{e:?}").into())), + } + }; + + self.clone() + .inner + .map(function, output_type) + .with_fmt("str.json_extract") + .into() + } + #[cfg(feature = "extract_jsonpath")] pub fn str_json_path_match(&self, pat: String) -> PyExpr { let function = move |s: Series| { diff --git a/py-polars/tests/unit/namespaces/test_string.py b/py-polars/tests/unit/namespaces/test_string.py index afb2dd87231f..158636045b91 100644 --- a/py-polars/tests/unit/namespaces/test_string.py +++ b/py-polars/tests/unit/namespaces/test_string.py @@ -180,6 +180,38 @@ def test_str_split() -> None: assert out[2].to_list() == ["ab,", "c,", "de"] +def test_json_extract_series() -> None: + s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"]) + expected = pl.Series([[1, 2, 3], None, [4, 5, 6]]) + dtype = pl.List(pl.Int64) + assert_series_equal(s.str.json_extract(None), expected) + assert_series_equal(s.str.json_extract(dtype), expected) + + s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']) + expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}]) + dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]) + assert_series_equal(s.str.json_extract(None), expected) + assert_series_equal(s.str.json_extract(dtype2), expected) + + expected = pl.Series([{"a": 1}, None, {"a": 2}]) + dtype2 = pl.Struct([pl.Field("a", pl.Int64)])
+ assert_series_equal(s.str.json_extract(dtype2), expected) + + +def test_json_extract_lazy_expr() -> None: + dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]) + ldf = ( + pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']}) + .lazy() + .select(pl.col("json").str.json_extract(dtype)) + ) + expected = pl.DataFrame( + {"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]} + ).lazy() + assert ldf.schema == {"json": dtype} + assert_frame_equal(ldf, expected) + + def test_jsonpath_single() -> None: s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']) expected = pl.Series(["1", None, "2", "2.1", "true"])