Skip to content

Commit

Permalink
fix: Ensure that ASCII* table formats do not use the UTF8 ellipsis …
Browse files Browse the repository at this point in the history
…char when truncating rows/cols/values (pola-rs#19404)
  • Loading branch information
alexander-beedie authored Oct 23, 2024
1 parent 4cfe56f commit 69c9c3a
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 57 deletions.
144 changes: 91 additions & 53 deletions crates/polars-core/src/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,26 @@ fn get_str_len_limit() -> usize {
fn get_list_len_limit() -> usize {
parse_env_var_limit(FMT_TABLE_CELL_LIST_LEN, DEFAULT_LIST_LEN_LIMIT)
}
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
fn get_ellipsis() -> &'static str {
match std::env::var(FMT_TABLE_FORMATTING).as_deref().unwrap_or("") {
preset if preset.starts_with("ASCII") => "...",
_ => "…",
}
}

fn estimate_string_width(s: &str) -> usize {
// get a slightly more accurate estimate of a string's screen
// width, accounting (very roughly) for multibyte characters
let n_chars = s.chars().count();
let n_bytes = s.len();
if n_bytes == n_chars {
n_chars
} else {
let adjust = n_bytes as f64 / n_chars as f64;
std::cmp::min(n_chars * 2, (n_chars as f64 * adjust).ceil() as usize)
}
}

macro_rules! format_array {
($f:ident, $a:expr, $dtype:expr, $name:expr, $array_type:expr) => {{
Expand Down Expand Up @@ -424,7 +444,7 @@ impl Debug for DataFrame {
}
}
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
fn make_str_val(v: &str, truncate: usize) -> String {
fn make_str_val(v: &str, truncate: usize, ellipsis: &String) -> String {
let v_trunc = &v[..v
.char_indices()
.take(truncate)
Expand All @@ -434,14 +454,19 @@ fn make_str_val(v: &str, truncate: usize) -> String {
if v == v_trunc {
v.to_string()
} else {
format!("{v_trunc}")
format!("{v_trunc}{ellipsis}")
}
}

#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
fn field_to_str(f: &Field, str_truncate: usize) -> (String, usize) {
let name = make_str_val(f.name(), str_truncate);
let name_length = name.len();
fn field_to_str(
f: &Field,
str_truncate: usize,
ellipsis: &String,
padding: usize,
) -> (String, usize) {
let name = make_str_val(f.name(), str_truncate, ellipsis);
let name_length = estimate_string_width(name.as_str());
let mut column_name = name;
if env_is_true(FMT_TABLE_HIDE_COLUMN_NAMES) {
column_name = "".to_string();
Expand Down Expand Up @@ -473,11 +498,11 @@ fn field_to_str(f: &Field, str_truncate: usize) -> (String, usize) {
format!("{column_name}{separator}{column_dtype}")
};
let mut s_len = std::cmp::max(name_length, dtype_length);
let separator_length = separator.trim().len();
let separator_length = estimate_string_width(separator.trim());
if s_len < separator_length {
s_len = separator_length;
}
(s, s_len + 2)
(s, s_len + padding)
}

#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
Expand All @@ -487,27 +512,29 @@ fn prepare_row(
n_last: usize,
str_truncate: usize,
max_elem_lengths: &mut [usize],
ellipsis: &String,
padding: usize,
) -> Vec<String> {
let reduce_columns = n_first + n_last < row.len();
let n_elems = n_first + n_last + reduce_columns as usize;
let mut row_strings = Vec::with_capacity(n_elems);

for (idx, v) in row[0..n_first].iter().enumerate() {
let elem_str = make_str_val(v, str_truncate);
let elem_len = elem_str.len() + 2;
let elem_str = make_str_val(v, str_truncate, ellipsis);
let elem_len = estimate_string_width(elem_str.as_str()) + padding;
if max_elem_lengths[idx] < elem_len {
max_elem_lengths[idx] = elem_len;
};
row_strings.push(elem_str);
}
if reduce_columns {
row_strings.push("…".to_string());
max_elem_lengths[n_first] = 3;
row_strings.push(ellipsis.to_string());
max_elem_lengths[n_first] = ellipsis.chars().count() + padding;
}
let elem_offset = n_first + reduce_columns as usize;
for (idx, v) in row[row.len() - n_last..].iter().enumerate() {
let elem_str = make_str_val(v, str_truncate);
let elem_len = elem_str.len() + 2;
let elem_str = make_str_val(v, str_truncate, ellipsis);
let elem_len = estimate_string_width(elem_str.as_str()) + padding;
let elem_idx = elem_offset + idx;
if max_elem_lengths[elem_idx] < elem_len {
max_elem_lengths[elem_idx] = elem_len;
Expand Down Expand Up @@ -542,56 +569,56 @@ impl Display for DataFrame {
"The column lengths in the DataFrame are not equal."
);

let table_style = std::env::var(FMT_TABLE_FORMATTING).unwrap_or("DEFAULT".to_string());
let is_utf8 = !table_style.starts_with("ASCII");
let preset = match table_style.as_str() {
"ASCII_FULL" => ASCII_FULL,
"ASCII_FULL_CONDENSED" => ASCII_FULL_CONDENSED,
"ASCII_NO_BORDERS" => ASCII_NO_BORDERS,
"ASCII_BORDERS_ONLY" => ASCII_BORDERS_ONLY,
"ASCII_BORDERS_ONLY_CONDENSED" => ASCII_BORDERS_ONLY_CONDENSED,
"ASCII_HORIZONTAL_ONLY" => ASCII_HORIZONTAL_ONLY,
"ASCII_MARKDOWN" | "MARKDOWN" => ASCII_MARKDOWN,
"UTF8_FULL" => UTF8_FULL,
"UTF8_FULL_CONDENSED" => UTF8_FULL_CONDENSED,
"UTF8_NO_BORDERS" => UTF8_NO_BORDERS,
"UTF8_BORDERS_ONLY" => UTF8_BORDERS_ONLY,
"UTF8_HORIZONTAL_ONLY" => UTF8_HORIZONTAL_ONLY,
"NOTHING" => NOTHING,
_ => UTF8_FULL_CONDENSED,
};
let ellipsis = get_ellipsis().to_string();
let ellipsis_len = ellipsis.chars().count();
let max_n_cols = get_col_limit();
let max_n_rows = get_row_limit();
let str_truncate = get_str_len_limit();
let padding = 2; // eg: one char either side of the value

let (n_first, n_last) = if self.width() > max_n_cols {
((max_n_cols + 1) / 2, max_n_cols / 2)
} else {
(self.width(), 0)
};

let reduce_columns = n_first + n_last < self.width();
let n_tbl_cols = n_first + n_last + reduce_columns as usize;
let mut names = Vec::with_capacity(n_tbl_cols);
let mut name_lengths = Vec::with_capacity(n_tbl_cols);

let fields = self.fields();
for field in fields[0..n_first].iter() {
let (s, l) = field_to_str(field, str_truncate);
let (s, l) = field_to_str(field, str_truncate, &ellipsis, padding);
names.push(s);
name_lengths.push(l);
}
if reduce_columns {
names.push("…".into());
name_lengths.push(3);
names.push(ellipsis.clone());
name_lengths.push(ellipsis_len);
}
for field in fields[self.width() - n_last..].iter() {
let (s, l) = field_to_str(field, str_truncate);
let (s, l) = field_to_str(field, str_truncate, &ellipsis, padding);
names.push(s);
name_lengths.push(l);
}
let (preset, is_utf8) = match std::env::var(FMT_TABLE_FORMATTING)
.as_deref()
.unwrap_or("DEFAULT")
{
"ASCII_FULL" => (ASCII_FULL, false),
"ASCII_FULL_CONDENSED" => (ASCII_FULL_CONDENSED, false),
"ASCII_NO_BORDERS" => (ASCII_NO_BORDERS, false),
"ASCII_BORDERS_ONLY" => (ASCII_BORDERS_ONLY, false),
"ASCII_BORDERS_ONLY_CONDENSED" => (ASCII_BORDERS_ONLY_CONDENSED, false),
"ASCII_HORIZONTAL_ONLY" => (ASCII_HORIZONTAL_ONLY, false),
"ASCII_MARKDOWN" => (ASCII_MARKDOWN, false),
"UTF8_FULL" => (UTF8_FULL, true),
"UTF8_FULL_CONDENSED" => (UTF8_FULL_CONDENSED, true),
"UTF8_NO_BORDERS" => (UTF8_NO_BORDERS, true),
"UTF8_BORDERS_ONLY" => (UTF8_BORDERS_ONLY, true),
"UTF8_HORIZONTAL_ONLY" => (UTF8_HORIZONTAL_ONLY, true),
"NOTHING" => (NOTHING, false),
"DEFAULT" => (UTF8_FULL_CONDENSED, true),
_ => (UTF8_FULL_CONDENSED, true),
};

let mut table = Table::new();
table
Expand All @@ -601,7 +628,6 @@ impl Display for DataFrame {
if is_utf8 && env_is_true(FMT_TABLE_ROUNDED_CORNERS) {
table.apply_modifier(UTF8_ROUND_CORNERS);
}

let mut constraints = Vec::with_capacity(n_tbl_cols);
let mut max_elem_lengths: Vec<usize> = vec![0; n_tbl_cols];

Expand All @@ -610,7 +636,6 @@ impl Display for DataFrame {
// Truncate the table if we have more rows than the
// configured maximum number of rows
let mut rows = Vec::with_capacity(std::cmp::max(max_n_rows, 2));

let half = max_n_rows / 2;
let rest = max_n_rows % 2;

Expand All @@ -621,22 +646,36 @@ impl Display for DataFrame {
.map(|c| c.str_value(i).unwrap())
.collect();

let row_strings =
prepare_row(row, n_first, n_last, str_truncate, &mut max_elem_lengths);

let row_strings = prepare_row(
row,
n_first,
n_last,
str_truncate,
&mut max_elem_lengths,
&ellipsis,
padding,
);
rows.push(row_strings);
}
let dots = rows[0].iter().map(|_| "…".to_string()).collect();
let dots = vec![ellipsis.clone(); rows[0].len()];
rows.push(dots);

for i in (height - half)..height {
let row = self
.get_columns()
.iter()
.map(|c| c.str_value(i).unwrap())
.collect();

let row_strings =
prepare_row(row, n_first, n_last, str_truncate, &mut max_elem_lengths);
let row_strings = prepare_row(
row,
n_first,
n_last,
str_truncate,
&mut max_elem_lengths,
&ellipsis,
padding,
);
rows.push(row_strings);
}
table.add_rows(rows);
Expand All @@ -654,6 +693,8 @@ impl Display for DataFrame {
n_last,
str_truncate,
&mut max_elem_lengths,
&ellipsis,
padding,
);
table.add_row(row_strings);
} else {
Expand All @@ -662,10 +703,9 @@ impl Display for DataFrame {
}
}
} else if height > 0 {
let dots: Vec<String> = self.columns.iter().map(|_| "…".to_string()).collect();
let dots: Vec<String> = vec![ellipsis.clone(); self.columns.len()];
table.add_row(dots);
}

let tbl_fallback_width = 100;
let tbl_width = std::env::var("POLARS_TABLE_WIDTH")
.map(|s| {
Expand All @@ -683,10 +723,10 @@ impl Display for DataFrame {
lower: Width::Fixed(l as u16),
upper: Width::Fixed(u as u16),
};
let min_col_width = 5;
let min_col_width = std::cmp::max(5, 3 + padding);
for (idx, elem_len) in max_elem_lengths.iter().enumerate() {
let mx = std::cmp::min(
str_truncate + 3, // (3 = 2 space chars of padding + ellipsis char)
str_truncate + ellipsis_len + padding,
std::cmp::max(name_lengths[idx], *elem_len),
);
if mx <= min_col_width {
Expand Down Expand Up @@ -1011,7 +1051,7 @@ fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result {
}
}
if bytes.len() > width {
write!(f, "\"...")?;
write!(f, "\"")?;
} else {
write!(f, "\"")?;
}
Expand Down Expand Up @@ -1138,9 +1178,7 @@ impl Series {
if self.is_empty() {
return "[]".to_owned();
}

let max_items = get_list_len_limit();

match max_items {
0 => "[…]".to_owned(),
_ if max_items >= self.len() => {
Expand Down
1 change: 1 addition & 0 deletions crates/polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@
//! * `ASCII_BORDERS_ONLY_CONDENSED`
//! * `ASCII_HORIZONTAL_ONLY`
//! * `ASCII_MARKDOWN`
//! * `MARKDOWN`
//! * `UTF8_FULL`
//! * `UTF8_FULL_CONDENSED`
//! * `UTF8_NO_BORDERS`
Expand Down
8 changes: 5 additions & 3 deletions py-polars/polars/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"ASCII_BORDERS_ONLY_CONDENSED",
"ASCII_HORIZONTAL_ONLY",
"ASCII_MARKDOWN",
"MARKDOWN",
"UTF8_FULL",
"UTF8_FULL_CONDENSED",
"UTF8_NO_BORDERS",
Expand Down Expand Up @@ -195,7 +196,7 @@ def __init__(
>>> df = pl.DataFrame({"abc": [1.0, 2.5, 5.0], "xyz": [True, False, True]})
>>> with pl.Config(
... # these options will be set for scope duration
... tbl_formatting="ASCII_MARKDOWN",
... tbl_formatting="MARKDOWN",
... tbl_hide_dataframe_shape=True,
... tbl_rows=10,
... ):
Expand Down Expand Up @@ -1037,7 +1038,8 @@ def set_tbl_formatting(
* "ASCII_BORDERS_ONLY": ASCII, borders only.
* "ASCII_BORDERS_ONLY_CONDENSED": ASCII, borders only, dense row spacing.
* "ASCII_HORIZONTAL_ONLY": ASCII, horizontal lines only.
* "ASCII_MARKDOWN": ASCII, Markdown compatible.
* "ASCII_MARKDOWN": Markdown format (ascii ellipses for truncated values).
* "MARKDOWN": Markdown format (utf8 ellipses for truncated values).
* "UTF8_FULL": UTF8, with all borders and lines, including row dividers.
* "UTF8_FULL_CONDENSED": Same as UTF8_FULL, but with dense row spacing.
* "UTF8_NO_BORDERS": UTF8, no borders.
Expand All @@ -1060,7 +1062,7 @@ def set_tbl_formatting(
... {"abc": [-2.5, 5.0], "mno": ["hello", "world"], "xyz": [True, False]}
... )
>>> with pl.Config(
... tbl_formatting="ASCII_MARKDOWN",
... tbl_formatting="MARKDOWN",
... tbl_hide_column_data_types=True,
... tbl_hide_dataframe_shape=True,
... ):
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def as_expr(self) -> Expr:
def _re_string(string: str | Collection[str], *, escape: bool = True) -> str:
"""Return escaped regex, potentially representing multiple string fragments."""
if isinstance(string, str):
rx = f"{re_escape(string)}" if escape else string
rx = re_escape(string) if escape else string
else:
strings: list[str] = []
for st in string:
Expand Down
Loading

0 comments on commit 69c9c3a

Please sign in to comment.