Skip to content

Commit

Permalink
Add --print-characters flag. Helps discover the unicode ranges
Browse files Browse the repository at this point in the history
  • Loading branch information
faern committed Oct 1, 2024
1 parent dc44058 commit 59c619d
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 17 deletions.
8 changes: 2 additions & 6 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,8 @@ impl FromStr for CharacterType {
}

fn unicode_notation_to_char(unicode_notation: &str) -> Result<char, InvalidCharacterType> {
let parse = |unicode_notation: &str| -> Option<char> {
let hex_str_number = unicode_notation.strip_prefix("U+")?;
let int_number = u32::from_str_radix(hex_str_number, 16).ok()?;
char::from_u32(int_number)
};
parse(unicode_notation).ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
crate::unicode_notation::unicode_notation_to_char(unicode_notation)
.ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
}

/// All types of code that can have special rules about what is allowed or denied.
Expand Down
59 changes: 49 additions & 10 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::Path;
use std::path::PathBuf;
use std::path::{Path, PathBuf};

use anyhow::Context;
use clap::Parser;
use config::CodeType;
use config::Config;
use config::Language;
use miette::{miette, LabeledSpan, NamedSource, Severity};
use rules::Decision;
use rules::RuleSet;
use unic_ucd_name::Name;

use crate::config::{CodeType, Config, Language};
use crate::rules::{CharacterType, Decision, RuleSet};
use crate::unicode_notation::char_to_unicode_notation;

mod config;
mod rules;
mod unicode_blocks;
mod unicode_notation;

// Replaces the previous idea of "RuleChain"s.
struct RuleDispatcher {
Expand Down Expand Up @@ -113,10 +112,23 @@ struct Args {
paths: Vec<PathBuf>,

/// Print the names of all the Unicode blocks that this tool recognizes, then exits.
///
/// Enable verbose output to also print the code point ranges for each block.
#[arg(long)]
print_unicode_blocks: bool,

/// Print the character(s) in the given character type, then exits.
///
/// As argument you can specify anything you can add to the allow and deny lists in the
/// config file. For example:
///
/// `--print-characters "Mathematical Operators"` will print all unicode code points
/// in that block.
///
/// `--print-characters U+100..U+1ff` will print all characters between 100 and 1ff (hex)
#[arg(long)]
print_characters: Option<CharacterType>,

/// Enable more verbose output.
#[arg(short, long)]
verbose: bool,
Expand All @@ -131,15 +143,28 @@ fn main() -> anyhow::Result<()> {
for (&name, range) in &unicode_blocks::UNICODE_BLOCKS {
print!("{name}");
if args.verbose {
let range_start = u32::from(*range.start());
let range_end = u32::from(*range.end());
print!(": U+{range_start}..U+{range_end}");
let range_start = char_to_unicode_notation(*range.start());
let range_end = char_to_unicode_notation(*range.end());
print!(": {range_start}..{range_end}");
}
println!();
}
return Ok(());
}

if let Some(character_type) = args.print_characters {
match character_type {
CharacterType::CodePoint(c) => print_char_range(c..=c),
CharacterType::Range(range) => print_char_range(range),
CharacterType::Bidi => print_char_range(rules::BIDI_CHARACTERS.iter().copied()),
CharacterType::Block(block) => print_char_range(block.clone()),
// TODO: `char::MIN` and `char::MAX` are heading for stabilization. When they are
// stable we can replace these constants with those in std.
CharacterType::Anything => print_char_range('\0'..='\u{10ffff}'),
}
return Ok(());
}

let default_config = get_default_config();
let mut dispatcher = RuleDispatcher {
user_config: None,
Expand Down Expand Up @@ -331,3 +356,17 @@ fn get_default_config() -> Config {
]),
}
}

/// Writes one line to stdout for every character yielded by `range`.
///
/// Each line holds the code point in unicode notation, then the character
/// itself wrapped in single quotes, and finally the character's name in
/// parentheses when `Name::of` returns one.
fn print_char_range(range: impl Iterator<Item = char>) {
    for character in range {
        let notation = char_to_unicode_notation(character);
        // Characters without a known name get no suffix at all.
        let name_suffix = Name::of(character)
            .map(|name| format!(" ({name})"))
            .unwrap_or_default();
        println!("{notation}: '{character}'{name_suffix}");
    }
}
2 changes: 1 addition & 1 deletion src/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ impl RuleSet {
}
}

#[derive(Debug)]
#[derive(Debug, Clone)]
pub enum CharacterType {
/// Single character (eg. "U+9000")
CodePoint(char),
Expand Down
9 changes: 9 additions & 0 deletions src/unicode_notation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/// Parses a code point written in unicode notation (e.g. `U+1F600`) into a `char`.
///
/// The hexadecimal digits may be upper or lower case.
///
/// Returns `None` when:
/// * the `U+` prefix is missing,
/// * the text after the prefix is empty or contains anything other than
///   hexadecimal digits,
/// * the number does not fit in a `u32`, or
/// * the number is not a valid `char` (a surrogate, or above `U+10FFFF`).
pub fn unicode_notation_to_char(unicode_notation: &str) -> Option<char> {
    let hex_digits = unicode_notation.strip_prefix("U+")?;
    // `u32::from_str_radix` tolerates a leading `+` sign, which would let
    // malformed input such as `U++41` through. Only plain hex digits are valid
    // here; the empty string passes this check but is rejected by the parse below.
    if !hex_digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return None;
    }
    let code_point = u32::from_str_radix(hex_digits, 16).ok()?;
    char::from_u32(code_point)
}

/// Formats `c` in unicode notation: the prefix `U+` followed by the code
/// point as uppercase hexadecimal, without zero padding (`'A'` gives `U+41`).
pub fn char_to_unicode_notation(c: char) -> String {
    let code_point = u32::from(c);
    format!("U+{code_point:X}")
}

0 comments on commit 59c619d

Please sign in to comment.