Skip to content

Commit

Permalink
feat(python): Parse JSON data in Utf8 to polars dtype (#6885)
Browse files Browse the repository at this point in the history
  • Loading branch information
josh authored Feb 15, 2023
1 parent 2c5e079 commit 8c119ab
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 1 deletion.
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The following methods are available under the `expr.str` attribute.
Expr.str.explode
Expr.str.extract
Expr.str.extract_all
Expr.str.json_extract
Expr.str.json_path_match
Expr.str.lengths
Expr.str.ljust
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The following methods are available under the `Series.str` attribute.
Series.str.explode
Series.str.extract
Series.str.extract_all
Series.str.json_extract
Series.str.json_path_match
Series.str.lengths
Series.str.ljust
Expand Down
42 changes: 42 additions & 0 deletions py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
DataType,
Date,
Datetime,
PolarsDataType,
PolarsTemporalType,
Time,
is_polars_dtype,
py_type_to_dtype,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -643,6 +645,46 @@ def starts_with(self, sub: str | pli.Expr) -> pli.Expr:
sub = pli.expr_to_lit_or_expr(sub, str_to_lit=True)._pyexpr
return pli.wrap_expr(self._pyexpr.str_starts_with(sub))

def json_extract(self, dtype: PolarsDataType | None = None) -> pli.Expr:
    """
    Parse string values as JSON.

    An error is raised when an invalid JSON string is encountered.

    Parameters
    ----------
    dtype
        The dtype to cast the extracted value to. If None, the dtype will be
        inferred from the JSON value.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"json": ['{"a":1, "b": true}', None, '{"a":2, "b": false}']}
    ... )
    >>> dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    >>> df.select(pl.col("json").str.json_extract(dtype))
    shape: (3, 1)
    ┌─────────────┐
    │ json        │
    │ ---         │
    │ struct[2]   │
    ╞═════════════╡
    │ {1,true}    │
    │ {null,null} │
    │ {2,false}   │
    └─────────────┘

    See Also
    --------
    json_path_match : Extract the first match of json string with provided JSONPath
                      expression.

    """
    # Normalise a user-supplied dtype; None is forwarded untouched so the
    # dtype can be inferred downstream.
    resolved = None if dtype is None else py_type_to_dtype(dtype)
    return pli.wrap_expr(self._pyexpr.str_json_extract(resolved))

def json_path_match(self, json_path: str) -> pli.Expr:
"""
Extract the first match of json string with provided JSONPath expression.
Expand Down
33 changes: 32 additions & 1 deletion py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING

import polars.internals as pli
from polars.datatypes import PolarsTemporalType
from polars.datatypes import PolarsDataType, PolarsTemporalType
from polars.internals.series.utils import expr_dispatch

if TYPE_CHECKING:
Expand Down Expand Up @@ -318,6 +318,37 @@ def encode(self, encoding: TransferEncoding) -> pli.Series:
"""

def json_extract(self, dtype: PolarsDataType | None = None) -> pli.Series:
    """
    Parse string values as JSON.

    An error is raised when an invalid JSON string is encountered.

    Parameters
    ----------
    dtype
        The dtype to cast the extracted value to. If None, the dtype will be
        inferred from the JSON value.

    Examples
    --------
    >>> s = pl.Series("json", ['{"a":1, "b": true}', None, '{"a":2, "b": false}'])
    >>> s.str.json_extract()
    shape: (3,)
    Series: 'json' [struct[2]]
    [
            {1,true}
            {null,null}
            {2,false}
    ]

    See Also
    --------
    json_path_match : Extract the first match of json string with provided JSONPath
                      expression.

    """

def json_path_match(self, json_path: str) -> pli.Series:
"""
Extract the first match of json string with provided JSONPath expression.
Expand Down
23 changes: 23 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,29 @@ impl PyExpr {
.into()
}

/// Parse the string values of this expression as JSON.
///
/// When `dtype` is `Some`, the unwrapped data type is forwarded to
/// `json_extract` and used as the declared output type of the expression.
/// When it is `None`, `None` is forwarded and the output type is declared as
/// `DataType::Unknown`.
///
/// An error returned by `json_extract` is surfaced as a
/// `PolarsError::ComputeError` carrying the debug rendering of that error.
pub fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> PyExpr {
    let target = dtype.map(|wrapped| wrapped.0);

    // Output type handed to `map` alongside the closure below.
    let declared_output = GetOutput::from_type(target.clone().unwrap_or(DataType::Unknown));

    let parse_json = move |s: Series| {
        let strings = s.utf8()?;
        strings
            .json_extract(target.clone())
            .map(|parsed| Some(parsed.into_series()))
            .map_err(|e| PolarsError::ComputeError(format!("{e:?}").into()))
    };

    self.clone()
        .inner
        .map(parse_json, declared_output)
        .with_fmt("str.json_extract")
        .into()
}

#[cfg(feature = "extract_jsonpath")]
pub fn str_json_path_match(&self, pat: String) -> PyExpr {
let function = move |s: Series| {
Expand Down
32 changes: 32 additions & 0 deletions py-polars/tests/unit/namespaces/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,38 @@ def test_str_split() -> None:
assert out[2].to_list() == ["ab,", "c,", "de"]


def test_json_extract_series() -> None:
    """Check ``Series.str.json_extract`` with inferred and explicit dtypes."""
    # JSON arrays: inferred dtype first, then an explicit List(Int64).
    array_strings = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])
    array_expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])
    for requested in (None, pl.List(pl.Int64)):
        assert_series_equal(array_strings.str.json_extract(requested), array_expected)

    # JSON objects: inferred dtype first, then the full two-field struct.
    object_strings = pl.Series(
        ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']
    )
    both_fields_expected = pl.Series(
        [{"a": 1, "b": True}, None, {"a": 2, "b": False}]
    )
    both_fields = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    for requested in (None, both_fields):
        assert_series_equal(
            object_strings.str.json_extract(requested), both_fields_expected
        )

    # A struct dtype naming only "a" yields structs holding only that field.
    only_a = pl.Struct([pl.Field("a", pl.Int64)])
    only_a_expected = pl.Series([{"a": 1}, None, {"a": 2}])
    assert_series_equal(object_strings.str.json_extract(only_a), only_a_expected)


def test_json_extract_lazy_expr() -> None:
    """Check the lazy ``str.json_extract`` expression: schema and collected data."""
    dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])

    source = pl.DataFrame(
        {"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']}
    )
    extract_expr = pl.col("json").str.json_extract(dtype)
    ldf = source.lazy().select(extract_expr)

    expected_rows = [{"a": 1, "b": True}, None, {"a": 2, "b": False}]
    expected = pl.DataFrame({"json": expected_rows}).lazy()

    # The requested dtype must appear in the lazy schema.
    assert ldf.schema == {"json": dtype}
    assert_frame_equal(ldf, expected)


def test_jsonpath_single() -> None:
s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])
expected = pl.Series(["1", None, "2", "2.1", "true"])
Expand Down

0 comments on commit 8c119ab

Please sign in to comment.